gecko-dev/lib/libi18n/kinsokuf.c

515 строки
12 KiB
C

/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*-
*
* The contents of this file are subject to the Netscape Public License
* Version 1.0 (the "NPL"); you may not use this file except in
* compliance with the NPL. You may obtain a copy of the NPL at
* http://www.mozilla.org/NPL/
*
* Software distributed under the NPL is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the NPL
* for the specific language governing rights and limitations under the
* NPL.
*
* The Initial Developer of this code under the NPL is Netscape
* Communications Corporation. Portions created by Netscape are
* Copyright (C) 1998 Netscape Communications Corporation. All Rights
* Reserved.
*/
/* kinsokuf.c */
#include "intlpriv.h"
/*
 * Line-break prohibition tables, one Begin/End pair per supported charset.
 * They are defined in a separate data file (the original note names it
 * "kinsukod.c"; presumably kinsokud.c -- confirm).
 *
 * Each table is an array of C strings.  The lookup code in this file walks
 * a table until it meets an entry whose first byte is 0, so every table is
 * expected to end with an empty string.
 */
extern const char *ProhibitBegin_SJIS[];
extern const char *ProhibitBegin_EUCJP[];
extern const char *ProhibitBegin_BIG5[];
extern const char *ProhibitBegin_GB[];
extern const char *ProhibitBegin_KSC[];
extern const char *ProhibitBegin_UTF8[];
extern const char *ProhibitBegin_CNS[];
extern const char *ProhibitEnd_SJIS[];
extern const char *ProhibitEnd_EUCJP[];
extern const char *ProhibitEnd_BIG5[];
extern const char *ProhibitEnd_GB[];
extern const char *ProhibitEnd_KSC[];
extern const char *ProhibitEnd_UTF8[];
extern const char *ProhibitEnd_CNS[];
/*
 * INTL_NonBreakingSpace returns the non-breaking-space character as a
 * one-byte, NUL-terminated string literal: byte 0x07 when built with
 * XP_MAC defined, byte 0xA0 otherwise.
 *
 * win_csid is accepted but not consulted; the result depends only on the
 * build platform.
 */
PUBLIC const char *INTL_NonBreakingSpace(uint16 win_csid)
{
  const char *nbsp;

#ifndef XP_MAC
  nbsp = "\240"; /* 0xA0 */
#else
  nbsp = "\07"; /* 0x07 */
#endif
  return nbsp;
}
/*
 * INTL_CharClass classifies the (possibly multibyte) character that starts
 * at pstr, interpreted in the given charset, for line-breaking purposes.
 *
 * charset  one of CS_SJIS, CS_EUCJP, CS_KSC_8BIT, CS_GB_8BIT, CS_BIG5,
 *          CS_CNS_8BIT or CS_UTF8; any other value yields
 *          UNCLASSIFIED_CHAR.
 * pstr     pointer to the first byte of the character.
 *
 * Returns SEVEN_BIT_CHAR, HALFWIDTH_PRONOUNCE_CHAR, FULLWIDTH_ASCII_CHAR,
 * FULLWIDTH_PRONOUNCE_CHAR, KANJI_CHAR or UNCLASSIFIED_CHAR.
 *
 * The second byte is read only after a lead byte >= 0x80 and the third byte
 * only after a matching non-zero second byte, so for NUL-terminated input
 * (assumed -- confirm with callers) nothing past the terminator is read.
 *
 * Greek and Cyrillic letters are counted as FULLWIDTH_ASCII_CHAR in every
 * charset.
 */
#define IN_BETWEEN(a,b,c) (((a) <= (b)) && ((b) <= (c)))
PUBLIC int
INTL_CharClass(int charset, unsigned char *pstr)
{
  int c1, c2, c3;

  c1 = *pstr;
  switch (charset)
  {
    case CS_SJIS:
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  [0xA0-0xE0]
       * FULLWIDTH_ASCII_CHAR:      [0x82] [0x60-0x9A]
       *                            [0x83] [0x9F-0xB6]  Greek
       *                            [0x83] [0xBF-0xD6]
       *                            [0x84] [0x40-0x60]  Cyrillic
       *                            [0x84] [0x70-0x91]
       * FULLWIDTH_PRONOUNCE_CHAR:  [0x82] [0x9F-0xF1]
       *                            [0x83] [0x40-0x96]
       *                            [0x81] [0x5B-0x5D]
       * KANJI_CHAR:                [0x88-0xFC] [xx]    (except above)
       *
       * The Greek and Cyrillic rows are the Shift_JIS positions of the
       * EUC-JP rows 0xA6 and 0xA7 listed in the CS_EUCJP case.
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      /* NOTE(review): 0xE0 is normally a double-byte lead byte in
         Shift_JIS; confirm whether it really belongs in this range. */
      if (IN_BETWEEN(0xA0, c1, 0xE0))
        return HALFWIDTH_PRONOUNCE_CHAR;
      c2 = *(pstr + 1);
      switch (c1)
      {
        case 0x81:
          if (IN_BETWEEN(0x5B, c2, 0x5D))
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0x82:
          if (IN_BETWEEN(0x60, c2, 0x9A))
            return FULLWIDTH_ASCII_CHAR;
          if (IN_BETWEEN(0x9F, c2, 0xF1))
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0x83:
          /* Both Greek cases span 24 letters. */
          if (IN_BETWEEN(0x9F, c2, 0xB6) || IN_BETWEEN(0xBF, c2, 0xD6))
            return FULLWIDTH_ASCII_CHAR;
          if (IN_BETWEEN(0x40, c2, 0x96))
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0x84:
          /* Cyrillic only; the gap 0x61-0x6F between the two cases is
             deliberately excluded. */
          if (IN_BETWEEN(0x40, c2, 0x60) || IN_BETWEEN(0x70, c2, 0x91))
            return FULLWIDTH_ASCII_CHAR;
          break;
      }
      if (IN_BETWEEN(0x88, c1, 0xFC))
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_EUCJP: /* TO BE TESTED ON UNIX */
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  [0x8E]
       * FULLWIDTH_ASCII_CHAR:      [0xA3] [0xC1-0xDA]
       *                                   [0xE1-0xFA]
       *                            [0xA6] [0xA1-0xB8]  Greek
       *                                   [0xC1-0xD8]
       *                            [0xA7] [0xA1-0xC1]  Cyrillic
       *                                   [0xD1-0xF1]
       *                            [0x8F] [0xA6-0xAF]
       * FULLWIDTH_PRONOUNCE_CHAR:  [0xA4] [xx]
       *                            [0xA5] [xx]
       * KANJI_CHAR:                [0xB0-0xFF] [xx]
       *                            [0x8F] [> 0xB0]
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      c2 = *(pstr + 1);
      switch (c1)
      {
        case 0x8E:
          return HALFWIDTH_PRONOUNCE_CHAR;
        case 0x8F:
          if (IN_BETWEEN(0xA6, c2, 0xAF))
            return FULLWIDTH_ASCII_CHAR;
          break;
        case 0xA3:
          if (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))
            return FULLWIDTH_ASCII_CHAR;
          break;
        case 0xA4:
        case 0xA5:
          return FULLWIDTH_PRONOUNCE_CHAR;
        case 0xA6:
          if (IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))
            return FULLWIDTH_ASCII_CHAR;
          break;
        case 0xA7:
          if (IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))
            return FULLWIDTH_ASCII_CHAR;
          break;
      }
      if ((c1 >= 0xB0) || ((c1 == 0x8F) && (c2 > 0xB0)))
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_KSC_8BIT:
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  none
       * FULLWIDTH_ASCII_CHAR:      [0xA3] [0xC1-0xDA]
       *                                   [0xE1-0xFA]
       *                            [0xA5] [0xC1-0xD8]
       *                                   [0xE1-0xF8]
       *                            [0xAC] [0xA1-0xC2]
       *                                   [0xD1-0xF2]
       * FULLWIDTH_PRONOUNCE_CHAR:  [0xA4] [0xA1-0xFE]
       *                            [0xB0-0xC8] [xx]
       * KANJI_CHAR:                [0xCA-0xFD] [xx]
       *
       * Hiragana and Katakana are not handled here.
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      c2 = *(pstr + 1);
      if (((c1 == 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
          ((c1 == 0xA5) && (IN_BETWEEN(0xC1, c2, 0xD8) || IN_BETWEEN(0xE1, c2, 0xF8))) ||
          ((c1 == 0xAC) && (IN_BETWEEN(0xA1, c2, 0xC2) || IN_BETWEEN(0xD1, c2, 0xF2))))
      {
        return FULLWIDTH_ASCII_CHAR;
      }
      if (((c1 == 0xA4) && IN_BETWEEN(0xA1, c2, 0xFE)) ||
          IN_BETWEEN(0xB0, c1, 0xC8))
      {
        return FULLWIDTH_PRONOUNCE_CHAR;
      }
      if (IN_BETWEEN(0xCA, c1, 0xFD))
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_GB_8BIT:
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  none
       * FULLWIDTH_ASCII_CHAR:      [0xA3] [0xC1-0xDA]
       *                                   [0xE1-0xFA]
       *                            [0xA6] [0xA1-0xB8]  Greek
       *                                   [0xC1-0xD8]
       *                            [0xA7] [0xA1-0xC1]  Cyrillic
       *                                   [0xD1-0xF1]
       *                            [0xA8] [0xA1-0xBA]  European
       * FULLWIDTH_PRONOUNCE_CHAR:  [0xA4,0xA5,0xA8] [xx]  (except above)
       * KANJI_CHAR:                [0xB0-0xF7] [xx]
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      c2 = *(pstr + 1);
      if (((c1 == 0xA3) && (IN_BETWEEN(0xC1, c2, 0xDA) || IN_BETWEEN(0xE1, c2, 0xFA))) ||
          ((c1 == 0xA6) && (IN_BETWEEN(0xA1, c2, 0xB8) || IN_BETWEEN(0xC1, c2, 0xD8))) ||
          ((c1 == 0xA7) && (IN_BETWEEN(0xA1, c2, 0xC1) || IN_BETWEEN(0xD1, c2, 0xF1))) ||
          ((c1 == 0xA8) && IN_BETWEEN(0xA1, c2, 0xBA)))
      {
        return FULLWIDTH_ASCII_CHAR;
      }
      if ((c1 == 0xA4) || (c1 == 0xA5) || (c1 == 0xA8))
        return FULLWIDTH_PRONOUNCE_CHAR;
      if (IN_BETWEEN(0xB0, c1, 0xF7))
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_BIG5:
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  none
       * FULLWIDTH_ASCII_CHAR:      [0xA2] [0xCF-0xFF]
       *                            [0xA3] [0x40-0x73]
       * FULLWIDTH_PRONOUNCE_CHAR:  [0xA3] [0x74-0x7E]
       *                                   [0xA1-0xBF]
       * KANJI_CHAR:                [0xA4-0xFF] [xx]
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      c2 = *(pstr + 1);
      switch (c1)
      {
        case 0xA2:
          if (IN_BETWEEN(0xCF, c2, 0xFF))
            return FULLWIDTH_ASCII_CHAR;
          break;
        case 0xA3:
          if (IN_BETWEEN(0x74, c2, 0x7E) || IN_BETWEEN(0xA1, c2, 0xBF))
            return FULLWIDTH_PRONOUNCE_CHAR;
          if (IN_BETWEEN(0x40, c2, 0x73))
            return FULLWIDTH_ASCII_CHAR;
          break;
      }
      if (c1 >= 0xA4)
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_CNS_8BIT: /* TO BE TESTED ON UNIX */
      /*
       * SEVEN_BIT_CHAR:            [0x00-0x7F]
       * HALFWIDTH_PRONOUNCE_CHAR:  none
       * FULLWIDTH_ASCII_CHAR:      [0xA4] [0xC1-0xFE]
       *                            [0xA5] [0xA1-0xC6]
       * FULLWIDTH_PRONOUNCE_CHAR:  [0xA5] [0xC7-0xF0]
       * KANJI_CHAR:                [0xC4-0xFF] [xx]
       *                            [0x8E]
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      c2 = *(pstr + 1);
      switch (c1)
      {
        case 0xA4:
          if (IN_BETWEEN(0xC1, c2, 0xFE))
            return FULLWIDTH_ASCII_CHAR;
          break;
        case 0xA5:
          if (IN_BETWEEN(0xC7, c2, 0xF0))
            return FULLWIDTH_PRONOUNCE_CHAR;
          if (IN_BETWEEN(0xA1, c2, 0xC6))
            return FULLWIDTH_ASCII_CHAR;
          break;
      }
      /* Two separate conditions: a single 0xC4..0x8E range would be empty
         and could never match. */
      if ((c1 >= 0xC4) || (c1 == 0x8E))
        return KANJI_CHAR;
      return UNCLASSIFIED_CHAR;

    case CS_UTF8:
      /*
       * SEVEN_BIT_CHAR:            [00-7F]
       * FULLWIDTH_ASCII_CHAR:
       *   U+0080 - U+10FF          [C0-E0] [xx]
       *                            [E1] [80-83] [xx]
       *   U+1E00 - U+1FFF          [E1] [B8-BF] [xx]
       *   U+FF21 - U+FF3A          [EF] [BC] [A1-BA]
       *   U+FF41 - U+FF5A          [EF] [BD] [81-9A]
       * FULLWIDTH_PRONOUNCE_CHAR:
       *   U+1100 - U+11FF          [E1] [84-87] [xx]
       *   U+3040 - U+318F          [E3] [81-85] [xx]
       *                            [E3] [86] [80-8F]
       *   U+FF66 - U+FFDC          [EF] [BD] [A6-]
       *                            [EF] [BE] [xx]
       *                            [EF] [BF] [-9C]
       *   U+AC00 - U+D7FF          [EA] [B0-] [xx]
       *                            [EB-EC] [xx] [xx]
       *                            [ED] [-9F] [xx]
       * KANJI_CHAR:
       *   U+4E00 - U+9FFF          [E4] [B8-] [xx]
       *                            [E5-E9] [xx] [xx]
       */
      if (c1 < 0x80)
        return SEVEN_BIT_CHAR;
      if (IN_BETWEEN(0xC0, c1, 0xE0))
        return FULLWIDTH_ASCII_CHAR;
      c2 = *(pstr + 1);
      switch (c1)
      {
        case 0xE1:
          if (IN_BETWEEN(0x80, c2, 0x83) || IN_BETWEEN(0xB8, c2, 0xBF))
            return FULLWIDTH_ASCII_CHAR;
          if (IN_BETWEEN(0x84, c2, 0x87))
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0xE3:
          if (IN_BETWEEN(0x81, c2, 0x85))
            return FULLWIDTH_PRONOUNCE_CHAR;
          if (c2 == 0x86)
          {
            c3 = *(pstr + 2);
            if (IN_BETWEEN(0x80, c3, 0x8F))
              return FULLWIDTH_PRONOUNCE_CHAR;
          }
          break;
        case 0xE4:
          if (c2 >= 0xB8)
            return KANJI_CHAR;
          break;
        case 0xE5:
        case 0xE6:
        case 0xE7:
        case 0xE8:
        case 0xE9:
          return KANJI_CHAR;
        case 0xEA:
          if (c2 >= 0xB0)
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0xEB:
        case 0xEC:
          return FULLWIDTH_PRONOUNCE_CHAR;
        case 0xED:
          if (c2 <= 0x9F)
            return FULLWIDTH_PRONOUNCE_CHAR;
          break;
        case 0xEF:
          /* The third byte is fetched only inside the branches that need
             it, i.e. once c2 is known to be non-zero. */
          switch (c2)
          {
            case 0xBC:
              c3 = *(pstr + 2);
              if (IN_BETWEEN(0xA1, c3, 0xBA))
                return FULLWIDTH_ASCII_CHAR;
              break;
            case 0xBD:
              c3 = *(pstr + 2);
              if (IN_BETWEEN(0x81, c3, 0x9A))
                return FULLWIDTH_ASCII_CHAR;
              /* U+FF66 encodes as EF BD A6. */
              if (c3 >= 0xA6)
                return FULLWIDTH_PRONOUNCE_CHAR;
              break;
            case 0xBE:
              return FULLWIDTH_PRONOUNCE_CHAR;
            case 0xBF:
              c3 = *(pstr + 2);
              if (c3 <= 0x9C)
                return FULLWIDTH_PRONOUNCE_CHAR;
              break;
          }
          break;
      }
      return UNCLASSIFIED_CHAR;

    default:
      break;
  }
  return UNCLASSIFIED_CHAR;
}
/*
 * IF_A_IN_ARRAY_B_THEN_RETURN_C(a, b, c)
 *   Walks the string table b until an entry whose first byte is 0.  If the
 *   bytes at a start with one of the entries (compared over that entry's
 *   full length via XP_STRNCMP / XP_STRLEN), the ENCLOSING function
 *   returns c.  Otherwise execution continues after the macro.
 *
 * IF_PROHIBIT_CLASS_THEN_RETURN(a, ba, ea)
 *   Looks a up in the "begin" table ba first, then in the "end" table ea,
 *   making the enclosing function return PROHIBIT_BEGIN_OF_LINE or
 *   PROHIBIT_END_OF_LINE respectively on the first hit.
 *
 * Both macros expand to a single statement (do { } while (0)) so they can
 * be followed by ';' and used safely inside if/else.  They contain a
 * `return`, so they may only be used inside functions returning int.
 */
#define IF_A_IN_ARRAY_B_THEN_RETURN_C(a,b,c) \
  do { \
    int j; \
    for (j = 0; (b)[j][0]; j++) \
      if (XP_STRNCMP((char *)(a), (b)[j], XP_STRLEN((b)[j])) == 0) \
        return (c); \
  } while (0)
#define IF_PROHIBIT_CLASS_THEN_RETURN(a,ba,ea) \
  do { \
    IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ba,PROHIBIT_BEGIN_OF_LINE); \
    IF_A_IN_ARRAY_B_THEN_RETURN_C(a,ea,PROHIBIT_END_OF_LINE); \
  } while (0)
/*
 * INTL_KinsokuClass reports which line-break restriction applies to the
 * character starting at pstr in charset win_csid.
 *
 * The charset selects a pair of prohibit tables.  The "begin" table is
 * searched before the "end" table, so a character present in both yields
 * PROHIBIT_BEGIN_OF_LINE.
 *
 * Returns
 *   PROHIBIT_BEGIN_OF_LINE  pstr starts with an entry of the begin table
 *   PROHIBIT_END_OF_LINE    pstr starts with an entry of the end table
 *   PROHIBIT_WORD_BREAK     CS_UTF8 only: no table hit and the first byte
 *                           is <= 0xE2 (lead bytes of code points below
 *                           U+3000; the original note read "UCS2 < 0x2000")
 *   PROHIBIT_NOWHERE        otherwise, including unsupported charsets
 */
PUBLIC int INTL_KinsokuClass(int16 win_csid, unsigned char *pstr)
{
  const char **begin_tbl;
  const char **end_tbl;

  /* Step 1: pick the table pair for this charset. */
  switch (win_csid)
  {
    case CS_SJIS:
      begin_tbl = ProhibitBegin_SJIS;
      end_tbl = ProhibitEnd_SJIS;
      break;
    case CS_EUCJP:
      begin_tbl = ProhibitBegin_EUCJP;
      end_tbl = ProhibitEnd_EUCJP;
      break;
    case CS_GB_8BIT:
      begin_tbl = ProhibitBegin_GB;
      end_tbl = ProhibitEnd_GB;
      break;
    case CS_BIG5:
      begin_tbl = ProhibitBegin_BIG5;
      end_tbl = ProhibitEnd_BIG5;
      break;
    case CS_CNS_8BIT:
      begin_tbl = ProhibitBegin_CNS;
      end_tbl = ProhibitEnd_CNS;
      break;
    case CS_KSC_8BIT:
      begin_tbl = ProhibitBegin_KSC;
      end_tbl = ProhibitEnd_KSC;
      break;
    case CS_UTF8:
      begin_tbl = ProhibitBegin_UTF8;
      end_tbl = ProhibitEnd_UTF8;
      break;
    default:
      return PROHIBIT_NOWHERE;
  }

  /* Step 2: table lookup; returns from this function on a hit. */
  IF_PROHIBIT_CLASS_THEN_RETURN(pstr, begin_tbl, end_tbl);

  /* Step 3: UTF-8 fallback for characters with a low lead byte. */
  if ((win_csid == CS_UTF8) && (*pstr <= 0xE2))
    return PROHIBIT_WORD_BREAK;

  return PROHIBIT_NOWHERE;
}