gecko-dev/intl/uconv/ucvlatin/nsUnicodeToTSCII.cpp

459 строки
16 KiB
C++

/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim:expandtab:shiftwidth=2:tabstop=2:
*/
/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
#include "nsUnicodeToTSCII.h"
#include "nsMemory.h"
#include "tamil.h"
/*
* TSCII is an 8-bit encoding consisting of:
* 0x00..0x7F: ASCII
* 0x80..0x90, 0x95..0x9F, 0xAB..0xFE:
* Tamil letters and glyphs
* 0xA1..0xA5, 0xAA: Tamil combining letters (after the base character)
* 0xA6..0xA8: Tamil combining letters (before the base character)
* 0x91..0x94: Punctuation
* 0xA9: Symbols
*/
//----------------------------------------------------------------------
// Class nsUnicodeToTSCII [implementation]
// XPCOM boilerplate: presumably expands to the nsISupports methods for
// nsUnicodeToTSCII with nsIUnicodeEncoder as its single exposed interface
// (macro is defined outside this file -- confirm against nsISupportsImpl.h).
NS_IMPL_ISUPPORTS1(nsUnicodeToTSCII, nsIUnicodeEncoder)
/*
* During Unicode (UTF-16) to TSCII conversion, mBuffer contains
* the last byte (or sometimes the last two bytes) still to be output.
* This can be:
* 0x00 Nothing pending.
* 0xB8..0xC9, 0x83..0x86 A consonant.
* 0xEC, 0x8A A consonant with VIRAMA sign (final or joining).
* 0x87 The conjunct K.SSA (KA + VIRAMA + SSA).
* 0xC38A SA + VIRAMA followed by RA (0x8A = dead SA, 0xC3 = RA).
*/
// Direct Unicode -> TSCII byte mapping for the Tamil block.
// Convert() indexes this table with (ch - UNI_TAMIL_START); each row below
// covers eight consecutive code points, as noted in the trailing comment.
// A value of 0 means "no single-byte mapping": Convert() then either splits
// the character (two-part vowel signs) or reports NS_ERROR_UENC_NOMAPPING.
static const uint8_t UnicharToTSCII[] =
{
0, 0, 0, 0xb7, 0, 0xab, 0xac, 0xfe, // 0x0B80..0x0B87
0xae, 0xaf, 0xb0, 0, 0, 0, 0xb1, 0xb2, // 0x0B88..0x0B8F
0xb3, 0, 0xb4, 0xb5, 0xb6, 0xb8, 0, 0, // 0x0B90..0x0B97
0, 0xb9, 0xba, 0, 0x83, 0, 0xbb, 0xbc, // 0x0B98..0x0B9F
0, 0, 0, 0xbd, 0xbe, 0, 0, 0, // 0x0BA0..0x0BA7
0xbf, 0xc9, 0xc0, 0, 0, 0, 0xc1, 0xc2, // 0x0BA8..0x0BAF
0xc3, 0xc8, 0xc4, 0xc7, 0xc6, 0xc5, 0, 0x84, // 0x0BB0..0x0BB7
0x85, 0x86, 0, 0, 0, 0, 0xa1, 0xa2, // 0x0BB8..0x0BBF
0xa3, 0xa4, 0xa5, 0, 0, 0, 0xa6, 0xa7, // 0x0BC0..0x0BC7
0xa8, 0, 0, 0, 0, 0, 0, 0, // 0x0BC8..0x0BCF
0, 0, 0, 0, 0, 0, 0, 0xaa, // 0x0BD0..0x0BD7
0, 0, 0, 0, 0, 0, 0, 0, // 0x0BD8..0x0BDF
0, 0, 0, 0, 0, 0, 0x80, 0x81, // 0x0BE0..0x0BE7
0x8d, 0x8e, 0x8f, 0x90, 0x95, 0x96, 0x97, 0x98, // 0x0BE8..0x0BEF
0x9d, 0x9e, 0x9f, 0, 0, 0, 0, 0, // 0x0BF0..0x0BF7
0, 0, 0, 0, 0, 0, 0, 0 // 0x0BF8..0x0BFF
};
// TSCII ligature bytes for "consonant + vowel sign U".
// Indexed by (buffered consonant byte - TSC_KA); Convert() only uses it when
// IS_TSC_CONSONANT1() holds for the buffered consonant. 18 entries.
static const uint8_t consonant_with_u[] =
{
0xcc, 0x99, 0xcd, 0x9a, 0xce, 0xcf, 0xd0, 0xd1, 0xd2,
0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb
};
// TSCII ligature bytes for "consonant + vowel sign UU".
// Indexed by (buffered consonant byte - TSC_KA); Convert() only uses it when
// IS_TSC_CONSONANT1() holds for the buffered consonant. 18 entries.
static const uint8_t consonant_with_uu[] =
{
0xdc, 0x9b, 0xdd, 0x9c, 0xde, 0xdf, 0xe0, 0xe1, 0xe2,
0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb
};
// TSCII bytes for "consonant + VIRAMA" (dead consonants).
// Indexed by (buffered consonant byte - TSC_KA) for consonants satisfying
// IS_TSC_CONSONANT1(); the remaining consonants are handled in Convert()
// by adding 5 to the consonant byte instead of using this table.
static const uint8_t consonant_with_virama[18] =
{
0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4,
0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd
};
// Modified implementation of Unicode to TSCII converter in glibc by
// Bruno Haible. My modifications are based on Unicode 3.0 chap. 9 and
// the code chart for Tamil.
/**
 * Encode a run of UTF-16 code units as TSCII.
 *
 * A consonant is not written immediately: it is parked in mBuffer so that a
 * following vowel sign or VIRAMA can be merged with it (ligatures), moved in
 * front of it (left vowel signs) or wrapped around it (two-part vowel signs).
 * Whatever is still parked once the whole input run has been read is flushed
 * before returning.
 *
 * @param aSrc        [IN] UTF-16 input.
 * @param aSrcLength  [IN/OUT] number of code units available on entry, number
 *                    consumed on return. On NS_ERROR_UENC_NOMAPPING the count
 *                    includes the unmappable code unit, so the caller finds
 *                    it at aSrc[*aSrcLength - 1].
 * @param aDest       [OUT] TSCII output buffer.
 * @param aDestLength [IN/OUT] capacity of aDest on entry, number of bytes
 *                    written on return.
 * @return NS_OK when all input was converted,
 *         NS_OK_UENC_MOREOUTPUT when aDest is too small to continue,
 *         NS_ERROR_UENC_NOMAPPING when a character has no TSCII mapping.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::Convert(const PRUnichar * aSrc, int32_t * aSrcLength,
                          char * aDest, int32_t * aDestLength)
{
  const PRUnichar * src = aSrc;
  const PRUnichar * srcEnd = aSrc + *aSrcLength;
  char * dest = aDest;
  char * destEnd = dest + *aDestLength;

  // The loop condition guarantees room for one byte per iteration; paths
  // that emit two or three bytes check the remaining space explicitly.
  while (src < srcEnd && dest < destEnd) {
    PRUnichar ch = *src;
    if (mBuffer) {
      // Attempt to combine the last character with this one.
      uint32_t last = mBuffer;

      // last : consonant
      if (IS_TSC_CONSONANT(last)) {
        if (ch == UNI_VOWELSIGN_U && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_u[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        if (ch == UNI_VOWELSIGN_UU && IS_TSC_CONSONANT1(last)) {
          *dest++ = consonant_with_uu[last - TSC_KA];
          mBuffer = 0;
          ++src;
          continue;
        }

        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes between the two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }

        // Virama
        if (ch == UNI_VIRAMA) {
          // consonant KA can form a conjunct with consonant SSA(SHA).
          // buffer dead consonant 'K' for now.
          if (last == TSC_KA) {
            mBuffer = TSC_KA_DEAD;
          }
          // SA can form a conjunct when followed by 'RA'.
          // buffer dead consonant 'S' for now.
          else if (last == TSC_SA) {
            mBuffer = TSC_SA_DEAD;
          }
          else {
            *dest++ = IS_TSC_CONSONANT1(last) ?
                      consonant_with_virama[last - TSC_KA] : last + 5;
            mBuffer = 0;
          }
          ++src;
          continue;
        }

        // consonant TA forms a ligature with vowel 'I' or 'II'.
        if (last == TSC_TA && (ch == UNI_VOWELSIGN_I || ch == UNI_VOWELSIGN_II)) {
          *dest++ = ch - (UNI_VOWELSIGN_I - TSC_TI_LIGA);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KA_DEAD) {
        // Kd + SSA = K.SSA
        if (ch == UNI_SSA) {
          mBuffer = TSC_KSSA;
          ++src;
          continue;
        }
      }
      else if (last == TSC_SA_DEAD) {
        // Sd + RA = S.RA. Buffer RA (high byte) + Sd (low byte).
        if (ch == UNI_RA) {
          mBuffer = 0xc38a;
          ++src;
          continue;
        }
      }
      else if (last == TSC_KSSA) {
        if (ch == UNI_VIRAMA) {
          *dest++ = (char) TSC_KSSA_DEAD;
          mBuffer = 0;
          ++src;
          continue;
        }

        // vowel splitting/reordering should be done around conjuncts as well.
        // reorder. vowel sign goes to the left of consonant
        if (IS_UNI_LEFT_VOWELSIGN(ch)) {
          if (dest + 2 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWELSIGN(ch);
          *dest++ = last;
          mBuffer = 0;
          ++src;
          continue;
        }

        // split and reorder. consonant goes between the two parts
        if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
          if (dest + 3 > destEnd)
            goto error_more_output;
          *dest++ = TSC_LEFT_VOWEL_PART(ch);
          *dest++ = last;
          *dest++ = TSC_RIGHT_VOWEL_PART(ch);
          mBuffer = 0;
          ++src;
          continue;
        }
      }
      else {
        NS_ASSERTION(last == 0xc38a, "No other value can be buffered");
        if (ch == UNI_VOWELSIGN_II) {
          *dest++ = (char) TSC_SRII_LIGA;
          mBuffer = 0;
          ++src;
          continue;
        }

        // No ligature: emit the dead SA now and keep RA pending.
        // |ch| is deliberately NOT consumed here; the next iteration sees
        // it with RA buffered, so it is either combined with RA (vowel
        // signs, VIRAMA) or written after RA instead of being dropped.
        *dest++ = (char) TSC_SA_DEAD;
        mBuffer = TSC_RA;
        continue;
      }

      /* Output the buffered character; |ch| is re-examined next iteration. */
      if (last >> 8) {
        if (dest + 2 > destEnd)
          goto error_more_output;
        *dest++ = last & 0xff;
        *dest++ = (last >> 8) & 0xff;
      }
      else
        *dest++ = last & 0xff;
      mBuffer = 0;
      continue;
    }

    if (ch < 0x80) // Plain ASCII character.
      *dest++ = (char)ch;
    else if (IS_UNI_TAMIL(ch)) {
      uint8_t t = UnicharToTSCII[ch - UNI_TAMIL_START];

      if (t != 0) {
        if (IS_TSC_CONSONANT(t))
          mBuffer = (uint32_t) t;
        else
          *dest++ = t;
      }
      else if (IS_UNI_2PARTS_VOWELSIGN(ch)) {
        // actually this is an illegal sequence.
        if (dest + 2 > destEnd)
          goto error_more_output;
        *dest++ = TSC_LEFT_VOWEL_PART(ch);
        *dest++ = TSC_RIGHT_VOWEL_PART(ch);
      }
      else {
        // Report how far we got; the count includes the unmappable unit.
        *aSrcLength = src + 1 - aSrc;
        *aDestLength = dest - aDest;
        return NS_ERROR_UENC_NOMAPPING;
      }
    }
    else if (ch == 0x00A9)
      *dest++ = (char)ch;
    else if (IS_UNI_SINGLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_SINGLE_QUOTE + TSC_LEFT_SINGLE_QUOTE;
    else if (IS_UNI_DOUBLE_QUOTE(ch))
      *dest++ = ch - UNI_LEFT_DOUBLE_QUOTE + TSC_LEFT_DOUBLE_QUOTE;
    else {
      // Report how far we got; the count includes the unmappable unit.
      *aSrcLength = src + 1 - aSrc;
      *aDestLength = dest - aDest;
      return NS_ERROR_UENC_NOMAPPING;
    }

    /* Now that we wrote the output increment the input pointer. */
    ++src;
  }

  // The loop stopped because aDest is full while input remains: this is a
  // partial conversion and must not be reported as plain NS_OK.
  if (src < srcEnd)
    goto error_more_output;

  // flush the buffer
  if (mBuffer >> 8) {
    // Two pending bytes (0xc38a): the low byte (dead SA) comes first,
    // followed by the high byte (RA).
    if (dest + 2 > destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    *dest++ = (mBuffer >> 8) & 0xff;
    mBuffer = 0;
  }
  else if (mBuffer) {
    // Write out the last character, a single byte.
    if (dest >= destEnd)
      goto error_more_output;
    *dest++ = mBuffer & 0xff;
    mBuffer = 0;
  }

  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK;

error_more_output:
  // mBuffer is left untouched so a later call can resume where we stopped.
  *aSrcLength = src - aSrc;
  *aDestLength = dest - aDest;
  return NS_OK_UENC_MOREOUTPUT;
}
/**
 * Write out whatever is still pending in mBuffer and clear it.
 *
 * @param aDest       [OUT] output buffer.
 * @param aDestLength [IN/OUT] capacity of aDest on entry; number of bytes
 *                    written on return (0, 1 or 2).
 * @return NS_OK on success (including when nothing was pending),
 *         NS_OK_UENC_MOREOUTPUT when aDest cannot hold the pending bytes;
 *         in that case nothing is written and mBuffer is kept.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::Finish(char* aDest, int32_t* aDestLength)
{
  if (!mBuffer) {
    *aDestLength = 0;
    return NS_OK;
  }

  // A value above 0xff holds two pending bytes (0xc38a = dead SA + RA).
  const int32_t needed = (mBuffer >> 8) ? 2 : 1;
  if (*aDestLength < needed) {
    *aDestLength = 0;
    return NS_OK_UENC_MOREOUTPUT;
  }

  // The low byte is always first in output order; for the two-byte case
  // this emits dead SA before RA, matching the order used inside Convert().
  *aDest++ = mBuffer & 0xff;
  if (needed == 2)
    *aDest++ = (mBuffer >> 8) & 0xff;

  mBuffer = 0;
  *aDestLength = needed;
  return NS_OK;
}
//================================================================
// Discard any pending (buffered but not yet written) output so the next
// Convert() call starts with nothing to combine against. Always returns NS_OK.
NS_IMETHODIMP
nsUnicodeToTSCII::Reset()
{
mBuffer = 0;
return NS_OK;
}
/**
 * Report an upper bound for the number of TSCII bytes Convert() may produce.
 *
 * @param aSrc        [IN] input text (not inspected).
 * @param aSrcLength  [IN] number of UTF-16 code units in the input.
 * @param aDestLength [OUT] receives twice aSrcLength.
 * @return always NS_OK.
 */
NS_IMETHODIMP
nsUnicodeToTSCII::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength,
                               int32_t * aDestLength)
{
  // Some Tamil letters decompose into two TSCII glyphs, so budget two
  // output bytes for every input code unit.
  const int32_t kMaxBytesPerCodeUnit = 2;
  *aDestLength = kMaxBytesPerCodeUnit * aSrcLength;
  return NS_OK;
}
// Accepts the requested error behavior but does not store or act on it:
// all three arguments are ignored and NS_OK is returned unconditionally.
NS_IMETHODIMP
nsUnicodeToTSCII::SetOutputErrorBehavior(int32_t aBehavior,
nsIUnicharEncoder *aEncoder,
PRUnichar aChar)
{
return NS_OK;
}
// same as the mapping of the C1(0x80-0x9f) part of Windows-1252 to Unicode
// Maps TSCII bytes 0x80..0x9f to the code points used by TSCII-encoded
// TrueType fonts. nsUnicodeToTamilTTF::Convert() indexes it with
// (byte & 0x7f) for bytes where (byte & 0xe0) == 0x80, i.e. indices 0..31.
// same as the mapping of the C1(0x80-0x9f) part of Windows-1252 to Unicode
const static PRUnichar gTSCIIToTTF[] = {
0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
};
//----------------------------------------------------------------------
// Class nsUnicodeToTamilTTF [implementation]
//
// XPCOM boilerplate: presumably forwards the nsISupports methods to the
// nsUnicodeToTSCII base class without adding new interfaces (macro is
// defined outside this file -- confirm against nsISupportsImpl.h).
NS_IMPL_ISUPPORTS_INHERITED0(nsUnicodeToTamilTTF, nsUnicodeToTSCII)
/**
 * Encode UTF-16 text for a TSCII-encoded TrueType font.
 *
 * The text is first converted to 8-bit TSCII by the base class; every TSCII
 * byte is then widened to a 16-bit value written high byte first, so the
 * output is twice as long as the intermediate TSCII.
 *
 * @param aSrc        [IN] UTF-16 input.
 * @param aSrcLength  [IN/OUT] code units available / consumed, as reported
 *                    by nsUnicodeToTSCII::Convert().
 * @param aDest       [OUT] output buffer.
 * @param aDestLength [IN/OUT] capacity of aDest on entry, number of bytes
 *                    written on return (also set on error returns, except
 *                    NS_ERROR_OUT_OF_MEMORY).
 * @return the result of nsUnicodeToTSCII::Convert(), or
 *         NS_ERROR_OUT_OF_MEMORY if the intermediate buffer cannot be
 *         allocated.
 */
NS_IMETHODIMP
nsUnicodeToTamilTTF::Convert(const PRUnichar * aSrc,
                             int32_t * aSrcLength, char * aDest,
                             int32_t * aDestLength)
{
  int32_t medLen, destLen;
  char *med;

  GetMaxLength(aSrc, *aSrcLength, &destLen);
  NS_ASSERTION(destLen <= *aDestLength, "insufficient dest. buffer size");

  // TSCII converter is a single byte encoder and takes half the space
  // taken by TamilTTF encoder.
  medLen = destLen / 2;

  // The assertion above vanishes in release builds, so never produce more
  // TSCII bytes than can be widened into aDest. When the cap is hit the
  // base encoder reports NS_OK_UENC_MOREOUTPUT, which is passed on below.
  if (medLen > *aDestLength / 2)
    medLen = *aDestLength / 2;

  if (medLen > CHAR_BUFFER_SIZE) {
    med = (char *) nsMemory::Alloc(medLen);
    if (!med)
      return NS_ERROR_OUT_OF_MEMORY;
  }
  else
    med = mStaticBuffer;

  // The base encoder stores the number of bytes it produced in medLen on
  // every return path, so the partial output is widened even on failure
  // and the caller always receives an accurate *aDestLength.
  nsresult rv = nsUnicodeToTSCII::Convert(aSrc, aSrcLength, med, &medLen);

  int32_t i, j;

  // widen 8bit TSCII to pseudo-Unicode font encoding of TSCII-Tamil font
  for (i = 0, j = 0; i < medLen; i++) {
    // Only C1 part(0x80-0x9f) needs to be mapped as if they're Windows-1252.
    PRUnichar ucs2 = (med[i] & 0xe0) == 0x80 ?
                     gTSCIIToTTF[med[i] & 0x7f] : uint8_t(med[i]);
    // A lot of TSCII fonts are still based on TSCII 1.6 so that
    // they have Tamil vowel 'I' at 0xad instead of 0xfe.
    if (ucs2 == 0xfe) ucs2 = 0xad;
    aDest[j++] = uint8_t((ucs2 & 0xff00) >> 8);
    aDest[j++] = uint8_t(ucs2 & 0x00ff);
  }

  *aDestLength = j;

  if (med != mStaticBuffer)
    nsMemory::Free(med);

  // Propagate NS_OK_UENC_MOREOUTPUT / NS_ERROR_UENC_NOMAPPING unchanged.
  return rv;
}
/**
 * Report an upper bound for the number of bytes Convert() may produce.
 *
 * @param aSrc        [IN] input text (not inspected).
 * @param aSrcLength  [IN] number of UTF-16 code units in the input.
 * @param aDestLength [OUT] receives (aSrcLength + 1) * 4.
 * @return always NS_OK.
 */
NS_IMETHODIMP
nsUnicodeToTamilTTF::GetMaxLength(const PRUnichar * aSrc, int32_t aSrcLength, int32_t * aDestLength)
{
  // A Tamil character yields at most two presentation forms ...
  const int32_t kMaxFormsPerChar = 2;
  // ... and each form is widened to a 16-bit short, i.e. two bytes.
  const int32_t kBytesPerForm = 2;

  *aDestLength = (aSrcLength + 1) * kMaxFormsPerChar * kBytesPerForm;
  return NS_OK;
}
/**
 * Record how unmappable characters should be handled.
 *
 * @param aBehavior error-handling mode; kOnError_CallBack requires aEncoder.
 * @param aEncoder  encoder to fall back on (stored in mErrEncoder).
 * @param aChar     character stored in mErrChar.
 * @return NS_ERROR_NULL_POINTER when call-back mode is requested without an
 *         encoder (nothing is stored in that case), otherwise NS_OK.
 */
NS_IMETHODIMP
nsUnicodeToTamilTTF::SetOutputErrorBehavior(int32_t aBehavior,
                                            nsIUnicharEncoder *aEncoder,
                                            PRUnichar aChar)
{
  // Call-back mode is meaningless without something to call back into.
  const bool callbackWithoutEncoder =
    (aBehavior == kOnError_CallBack) && !aEncoder;
  if (callbackWithoutEncoder) {
    return NS_ERROR_NULL_POINTER;
  }

  mErrBehavior = aBehavior;
  mErrEncoder = aEncoder;
  mErrChar = aChar;
  return NS_OK;
}