зеркало из https://github.com/mozilla/gecko-dev.git
add files from UCDATA 1.9
This commit is contained in:
Родитель
14c7cee8fe
Коммит
001455720b
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
|
@ -0,0 +1,306 @@
|
|||
/*
|
||||
* Copyright 1996, 1997, 1998 Computing Research Labs,
|
||||
* New Mexico State University
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
#ifndef _h_ucdata
|
||||
#define _h_ucdata
|
||||
|
||||
/*
|
||||
* $Id: ucdata.h,v 1.1 1999/01/06 01:46:32 ftang%netscape.com Exp $
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
/*
 * Prototype wrapper: __((args)) expands to (args) where the compiler accepts
 * prototypes and to () for pre-ANSI compilers.  The macro is undefined again
 * at the end of this header.  C++ compilers and Microsoft C are checked
 * explicitly because neither is required to predefine __STDC__; without the
 * extra checks the declarations below would lose their parameter lists there.
 */
#undef __
#if defined(__STDC__) || defined(__cplusplus) || defined(_MSC_VER)
#define __(x) x
#else
#define __(x) ()
#endif
|
||||
|
||||
/* Version string of the UCData package this header belongs to. */
#define UCDATA_VERSION "1.9"
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Masks and macros for character properties.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* Values that can appear in the `mask1' parameter of the ucisprop()
|
||||
* function.
|
||||
*/
|
||||
/*
 * One bit per property.  The bit position of each mask equals the property
 * code listed in format.txt (Mn = 0 ... ES = 31).
 */
#define UC_MN 0x00000001 /* Mark, Non-Spacing          */
#define UC_MC 0x00000002 /* Mark, Spacing Combining    */
#define UC_ME 0x00000004 /* Mark, Enclosing            */
#define UC_ND 0x00000008 /* Number, Decimal Digit      */
#define UC_NL 0x00000010 /* Number, Letter             */
#define UC_NO 0x00000020 /* Number, Other              */
#define UC_ZS 0x00000040 /* Separator, Space           */
#define UC_ZL 0x00000080 /* Separator, Line            */
#define UC_ZP 0x00000100 /* Separator, Paragraph       */
#define UC_CC 0x00000200 /* Other, Control             */
#define UC_CF 0x00000400 /* Other, Format              */
#define UC_OS 0x00000800 /* Other, Surrogate           */
#define UC_CO 0x00001000 /* Other, Private Use         */
#define UC_CN 0x00002000 /* Other, Not Assigned        */
#define UC_LU 0x00004000 /* Letter, Uppercase          */
#define UC_LL 0x00008000 /* Letter, Lowercase          */
#define UC_LT 0x00010000 /* Letter, Titlecase          */
#define UC_LM 0x00020000 /* Letter, Modifier           */
#define UC_LO 0x00040000 /* Letter, Other              */
#define UC_PC 0x00080000 /* Punctuation, Connector     */
#define UC_PD 0x00100000 /* Punctuation, Dash          */
#define UC_PS 0x00200000 /* Punctuation, Open          */
#define UC_PE 0x00400000 /* Punctuation, Close         */
#define UC_PO 0x00800000 /* Punctuation, Other         */
#define UC_SM 0x01000000 /* Symbol, Math               */
#define UC_SC 0x02000000 /* Symbol, Currency           */
#define UC_SK 0x04000000 /* Symbol, Modifier           */
#define UC_SO 0x08000000 /* Symbol, Other              */
#define UC_L  0x10000000 /* Left-To-Right              */
#define UC_R  0x20000000 /* Right-To-Left              */
#define UC_EN 0x40000000 /* European Number            */
#define UC_ES 0x80000000 /* European Number Separator  */
|
||||
|
||||
/*
|
||||
* Values that can appear in the `mask2' parameter of the ucisprop()
|
||||
* function.
|
||||
*/
|
||||
/*
 * Bits of the second mask word.  Per the property table in format.txt these
 * correspond to property codes 32 and up (bit position = code - 32).
 */
#define UC_ET 0x00000001 /* European Number Terminator */
#define UC_AN 0x00000002 /* Arabic Number              */
#define UC_CS 0x00000004 /* Common Number Separator    */
#define UC_B  0x00000008 /* Block Separator            */
#define UC_S  0x00000010 /* Segment Separator          */
#define UC_WS 0x00000020 /* Whitespace                 */
#define UC_ON 0x00000040 /* Other Neutrals             */
/*
 * Implementation specific character properties.
 */
#define UC_CM 0x00000080 /* Composite                  */
#define UC_NB 0x00000100 /* Non-Breaking               */
#define UC_SY 0x00000200 /* Symmetric                  */
#define UC_HD 0x00000400 /* Hex Digit                  */
#define UC_QM 0x00000800 /* Quote Mark                 */
#define UC_MR 0x00001000 /* Mirroring                  */
#define UC_SS 0x00002000 /* Space, other               */

#define UC_CP 0x00004000 /* Defined                    */

/*
 * Added for UnicodeData-2.1.3.
 */
#define UC_PI 0x00008000 /* Punctuation, Initial       */
#define UC_PF 0x00010000 /* Punctuation, Final         */
|
||||
|
||||
/*
|
||||
* This is the primary function for testing to see if a character has some set
|
||||
* of properties. The macros that test for various character properties all
|
||||
* call this function with some set of masks.
|
||||
*/
|
||||
/*
 * `code' is the character to test, `mask1' is a combination of the first
 * group of UC_* masks and `mask2' a combination of the second group.
 * NOTE(review): the macros in this header treat a nonzero result as "the
 * character has at least one of the requested properties" -- confirm against
 * the implementation in ucdata.c.
 */
extern int ucisprop __((unsigned long code, unsigned long mask1,
                        unsigned long mask2));
|
||||
|
||||
/*
 * ctype-style tests.  Every macro forwards to ucisprop(); the second argument
 * takes masks from the first UC_* group, the third argument masks from the
 * second group.
 */
#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
#define ucisdigit(cc) ucisprop(cc, UC_ND, 0)
#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0)
/*
 * UC_SS belongs to the second mask group; its numeric value in the first
 * group is UC_CN ("not assigned"), so it has to be passed as mask2.
 */
#define ucisspace(cc) ucisprop(cc, UC_ZS, UC_SS)
#define ucisblank(cc) ucisprop(cc, UC_ZS, 0)
/*
 * NOTE(review): UC_PC (connector punctuation) is included in ucisgraph() but
 * not here -- confirm whether that omission is intentional.
 */
#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO, UC_PI|UC_PF)
/* Same set as ucisgraph() plus the space separators (UC_ZS). */
#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO|UC_ZS, UC_PI|UC_PF)
#define ucisupper(cc) ucisprop(cc, UC_LU, 0)
#define ucislower(cc) ucisprop(cc, UC_LL, 0)
#define ucistitle(cc) ucisprop(cc, UC_LT, 0)
#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD)

#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0)
#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0)

#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0)
#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0)
#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0)
#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0)
#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0)
#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI)
#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF)

#define uciscomposite(cc) ucisprop(cc, 0, UC_CM)
#define ucishex(cc) ucisprop(cc, 0, UC_HD)
#define ucisquote(cc) ucisprop(cc, 0, UC_QM)
#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY)
#define ucismirroring(cc) ucisprop(cc, 0, UC_MR)
#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB)
|
||||
|
||||
/*
|
||||
* Directionality macros.
|
||||
*/
|
||||
/*
 * Strong types use masks from the first group only, neutral types from the
 * second group only, and weak types span both groups.
 */
#define ucisrtl(cc) ucisprop(cc, UC_R, 0)
#define ucisltr(cc) ucisprop(cc, UC_L, 0)
#define ucisstrong(cc) ucisprop(cc, UC_L|UC_R, 0)
#define ucisweak(cc) ucisprop(cc, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS)
#define ucisneutral(cc) ucisprop(cc, 0, UC_B|UC_S|UC_WS|UC_ON)
#define ucisseparator(cc) ucisprop(cc, 0, UC_B|UC_S)
|
||||
|
||||
/*
|
||||
* Other macros inspired by John Cowan.
|
||||
*/
|
||||
/* Single-category and small-group tests; each forwards to ucisprop(). */
#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
#define ucismath(cc) ucisprop(cc, UC_SM, 0)
#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)

/* Identifier tests: the "part" set is the "start" set plus marks, digits,
 * connector punctuation and format characters. */
#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
                                   UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)

#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
/* Parenthesized so the negation stays a single unit wherever it is used. */
#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
|
||||
|
||||
/*
|
||||
* Other miscellaneous character property macros.
|
||||
*/
|
||||
/*
 * Pure range checks; no table lookup is involved.  Both macros evaluate `cc'
 * more than once, so do not pass an argument with side effects.
 * NOTE(review): the upper bound of ucishangul() (0xd7ff) looks wider than
 * the assigned Hangul syllable range -- confirm against the Unicode charts.
 */
#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
                     ((cc) >= 0xf900 && (cc) <= 0xfaff))
#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7ff)
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for case conversion.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
 * Each routine takes a character code and returns a character code.
 * NOTE(review): presumably the input is returned unchanged when no mapping
 * exists -- confirm against ucdata.c.
 */
extern unsigned long uctoupper __((unsigned long code));
extern unsigned long uctolower __((unsigned long code));
extern unsigned long uctotitle __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting decompositions.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* This routine determines if the code has a decomposition. If it returns 0,
|
||||
* there is no decomposition. Any other value indicates a decomposition was
|
||||
* returned.
|
||||
*/
|
||||
/*
 * `code' is the character to look up; `num' and `decomp' are output
 * parameters.  A return value of 0 means there is no decomposition.
 * NOTE(review): presumably *num receives the element count and *decomp a
 * pointer into the library's own table -- confirm against ucdata.c.
 */
extern int ucdecomp __((unsigned long code, unsigned long *num,
                        unsigned long **decomp));
|
||||
|
||||
/*
|
||||
* If the code is a Hangul syllable, this routine decomposes it into the array
|
||||
* passed. The array size should be at least 3.
|
||||
*/
|
||||
/*
 * `decomp' is caller-provided storage of at least 3 elements.
 * NOTE(review): presumably *num receives the number of elements written --
 * confirm against ucdata.c.
 */
extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
                               unsigned long decomp[]));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting combining classes.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* This will return the combining class for a character to be used with the
|
||||
* Canonical Ordering algorithm.
|
||||
*/
|
||||
/* Takes the character code and hands back its combining class as an unsigned long. */
extern unsigned long uccombining_class __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for getting numbers and digits.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
 * Numeric value of a character held as a numerator/denominator pair.
 * NOTE(review): presumably this lets fractional values be represented --
 * confirm against the number table code.
 */
struct ucnumber {
    int numerator;
    int denominator;
};
|
||||
|
||||
/*
 * Lookup routines that store their result through the pointer argument.
 * NOTE(review): the meaning of the int return value is not visible in this
 * header -- confirm against ucdata.c.
 */
extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
extern int ucdigit_lookup __((unsigned long code, int *digit));

/*
 * For compatibility with John Cowan's "uctype" package.  These variants
 * return the value directly instead of through a pointer.
 */
extern struct ucnumber ucgetnumber __((unsigned long code));
extern int ucgetdigit __((unsigned long code));
|
||||
|
||||
/**************************************************************************
|
||||
*
|
||||
* Functions for library initialization and cleanup.
|
||||
*
|
||||
**************************************************************************/
|
||||
|
||||
/*
|
||||
* Macros for specifying the data tables to be loaded for ucdata_load().
|
||||
*/
|
||||
/* One bit per data table; combine with | to select several tables. */
#define UCDATA_CASE   0x01
#define UCDATA_CTYPE  0x02
#define UCDATA_DECOMP 0x04
#define UCDATA_CMBCL  0x08
#define UCDATA_NUM    0x10

/* Every table at once. */
#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
                    UCDATA_CMBCL|UCDATA_NUM)
|
||||
|
||||
/*
|
||||
* Functions to load, unload, and reload specific data files.
|
||||
*/
|
||||
/*
 * `mask' is a combination of the UCDATA_* table masks.
 * NOTE(review): `paths' presumably names the directories searched for the
 * .dat files; its exact format is not visible here -- confirm against
 * ucdata.c.
 */
extern void ucdata_load __((char *paths, int mask));
extern void ucdata_unload __((int mask));
extern void ucdata_reload __((char *paths, int mask));
|
||||
|
||||
/*
|
||||
* Deprecated functions, now just compatibility macros.
|
||||
*/
|
||||
/* Old entry points, kept as macros that load or unload every table. */
#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
|
||||
|
||||
#undef __
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _h_ucdata */
|
|
@ -0,0 +1,208 @@
|
|||
#
|
||||
# $Id: MUTTUCData.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
# Copyright 1996, 1997, 1998 Computing Research Labs,
|
||||
# New Mexico State University
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
#
|
||||
# Implementation specific character properties.
|
||||
#
|
||||
#
|
||||
# Space, other.
|
||||
#
|
||||
0009;;Ss;;;;;;;;;;;;
|
||||
000A;;Ss;;;;;;;;;;;;
|
||||
000B;;Ss;;;;;;;;;;;;
|
||||
000C;;Ss;;;;;;;;;;;;
|
||||
000D;;Ss;;;;;;;;;;;;
|
||||
#
|
||||
# Non-breaking.
|
||||
#
|
||||
00A0;;Nb;;;;;;;;;;;;
|
||||
2007;;Nb;;;;;;;;;;;;
|
||||
2011;;Nb;;;;;;;;;;;;
|
||||
FEFF;;Nb;;;;;;;;;;;;
|
||||
#
|
||||
# Symmetric.
|
||||
#
|
||||
0028;;Sy;;;;;;;;;;;;
|
||||
0029;;Sy;;;;;;;;;;;;
|
||||
005B;;Sy;;;;;;;;;;;;
|
||||
005D;;Sy;;;;;;;;;;;;
|
||||
007B;;Sy;;;;;;;;;;;;
|
||||
007D;;Sy;;;;;;;;;;;;
|
||||
00AB;;Sy;;;;;;;;;;;;
|
||||
00BB;;Sy;;;;;;;;;;;;
|
||||
0F3A;;Sy;;;;;;;;;;;;
|
||||
0F3B;;Sy;;;;;;;;;;;;
|
||||
0F3C;;Sy;;;;;;;;;;;;
|
||||
0F3D;;Sy;;;;;;;;;;;;
|
||||
0F3E;;Sy;;;;;;;;;;;;
|
||||
0F3F;;Sy;;;;;;;;;;;;
|
||||
2018;;Sy;;;;;;;;;;;;
|
||||
2019;;Sy;;;;;;;;;;;;
|
||||
201A;;Sy;;;;;;;;;;;;
|
||||
201B;;Sy;;;;;;;;;;;;
|
||||
201C;;Sy;;;;;;;;;;;;
|
||||
201D;;Sy;;;;;;;;;;;;
|
||||
201E;;Sy;;;;;;;;;;;;
|
||||
201F;;Sy;;;;;;;;;;;;
|
||||
2039;;Sy;;;;;;;;;;;;
|
||||
203A;;Sy;;;;;;;;;;;;
|
||||
2045;;Sy;;;;;;;;;;;;
|
||||
2046;;Sy;;;;;;;;;;;;
|
||||
207D;;Sy;;;;;;;;;;;;
|
||||
207E;;Sy;;;;;;;;;;;;
|
||||
208D;;Sy;;;;;;;;;;;;
|
||||
208E;;Sy;;;;;;;;;;;;
|
||||
2329;;Sy;;;;;;;;;;;;
|
||||
232A;;Sy;;;;;;;;;;;;
|
||||
3008;;Sy;;;;;;;;;;;;
|
||||
3009;;Sy;;;;;;;;;;;;
|
||||
300A;;Sy;;;;;;;;;;;;
|
||||
300B;;Sy;;;;;;;;;;;;
|
||||
300C;;Sy;;;;;;;;;;;;
|
||||
300D;;Sy;;;;;;;;;;;;
|
||||
300E;;Sy;;;;;;;;;;;;
|
||||
300F;;Sy;;;;;;;;;;;;
|
||||
3010;;Sy;;;;;;;;;;;;
|
||||
3011;;Sy;;;;;;;;;;;;
|
||||
3014;;Sy;;;;;;;;;;;;
|
||||
3015;;Sy;;;;;;;;;;;;
|
||||
3016;;Sy;;;;;;;;;;;;
|
||||
3017;;Sy;;;;;;;;;;;;
|
||||
3018;;Sy;;;;;;;;;;;;
|
||||
3019;;Sy;;;;;;;;;;;;
|
||||
301A;;Sy;;;;;;;;;;;;
|
||||
301B;;Sy;;;;;;;;;;;;
|
||||
301D;;Sy;;;;;;;;;;;;
|
||||
301E;;Sy;;;;;;;;;;;;
|
||||
FD3E;;Sy;;;;;;;;;;;;
|
||||
FD3F;;Sy;;;;;;;;;;;;
|
||||
FE35;;Sy;;;;;;;;;;;;
|
||||
FE36;;Sy;;;;;;;;;;;;
|
||||
FE37;;Sy;;;;;;;;;;;;
|
||||
FE38;;Sy;;;;;;;;;;;;
|
||||
FE39;;Sy;;;;;;;;;;;;
|
||||
FE3A;;Sy;;;;;;;;;;;;
|
||||
FE3B;;Sy;;;;;;;;;;;;
|
||||
FE3C;;Sy;;;;;;;;;;;;
|
||||
FE3D;;Sy;;;;;;;;;;;;
|
||||
FE3E;;Sy;;;;;;;;;;;;
|
||||
FE3F;;Sy;;;;;;;;;;;;
|
||||
FE40;;Sy;;;;;;;;;;;;
|
||||
FE41;;Sy;;;;;;;;;;;;
|
||||
FE42;;Sy;;;;;;;;;;;;
|
||||
FE43;;Sy;;;;;;;;;;;;
|
||||
FE44;;Sy;;;;;;;;;;;;
|
||||
FE59;;Sy;;;;;;;;;;;;
|
||||
FE5A;;Sy;;;;;;;;;;;;
|
||||
FE5B;;Sy;;;;;;;;;;;;
|
||||
FE5C;;Sy;;;;;;;;;;;;
|
||||
FE5D;;Sy;;;;;;;;;;;;
|
||||
FE5E;;Sy;;;;;;;;;;;;
|
||||
FF08;;Sy;;;;;;;;;;;;
|
||||
FF09;;Sy;;;;;;;;;;;;
|
||||
FF3B;;Sy;;;;;;;;;;;;
|
||||
FF3D;;Sy;;;;;;;;;;;;
|
||||
FF5B;;Sy;;;;;;;;;;;;
|
||||
FF5D;;Sy;;;;;;;;;;;;
|
||||
FF62;;Sy;;;;;;;;;;;;
|
||||
FF63;;Sy;;;;;;;;;;;;
|
||||
#
|
||||
# Hex digit.
|
||||
#
|
||||
0030;;Hd;;;;;;;;;;;;
|
||||
0031;;Hd;;;;;;;;;;;;
|
||||
0032;;Hd;;;;;;;;;;;;
|
||||
0033;;Hd;;;;;;;;;;;;
|
||||
0034;;Hd;;;;;;;;;;;;
|
||||
0035;;Hd;;;;;;;;;;;;
|
||||
0036;;Hd;;;;;;;;;;;;
|
||||
0037;;Hd;;;;;;;;;;;;
|
||||
0038;;Hd;;;;;;;;;;;;
|
||||
0039;;Hd;;;;;;;;;;;;
|
||||
0041;;Hd;;;;;;;;;;;;
|
||||
0042;;Hd;;;;;;;;;;;;
|
||||
0043;;Hd;;;;;;;;;;;;
|
||||
0044;;Hd;;;;;;;;;;;;
|
||||
0045;;Hd;;;;;;;;;;;;
|
||||
0046;;Hd;;;;;;;;;;;;
|
||||
0061;;Hd;;;;;;;;;;;;
|
||||
0062;;Hd;;;;;;;;;;;;
|
||||
0063;;Hd;;;;;;;;;;;;
|
||||
0064;;Hd;;;;;;;;;;;;
|
||||
0065;;Hd;;;;;;;;;;;;
|
||||
0066;;Hd;;;;;;;;;;;;
|
||||
FF10;;Hd;;;;;;;;;;;;
|
||||
FF11;;Hd;;;;;;;;;;;;
|
||||
FF12;;Hd;;;;;;;;;;;;
|
||||
FF13;;Hd;;;;;;;;;;;;
|
||||
FF14;;Hd;;;;;;;;;;;;
|
||||
FF15;;Hd;;;;;;;;;;;;
|
||||
FF16;;Hd;;;;;;;;;;;;
|
||||
FF17;;Hd;;;;;;;;;;;;
|
||||
FF18;;Hd;;;;;;;;;;;;
|
||||
FF19;;Hd;;;;;;;;;;;;
|
||||
FF21;;Hd;;;;;;;;;;;;
|
||||
FF22;;Hd;;;;;;;;;;;;
|
||||
FF23;;Hd;;;;;;;;;;;;
|
||||
FF24;;Hd;;;;;;;;;;;;
|
||||
FF25;;Hd;;;;;;;;;;;;
|
||||
FF26;;Hd;;;;;;;;;;;;
|
||||
FF41;;Hd;;;;;;;;;;;;
|
||||
FF42;;Hd;;;;;;;;;;;;
|
||||
FF43;;Hd;;;;;;;;;;;;
|
||||
FF44;;Hd;;;;;;;;;;;;
|
||||
FF45;;Hd;;;;;;;;;;;;
|
||||
FF46;;Hd;;;;;;;;;;;;
|
||||
#
|
||||
# Quote marks.
|
||||
#
|
||||
0022;;Qm;;;;;;;;;;;;
|
||||
0027;;Qm;;;;;;;;;;;;
|
||||
00AB;;Qm;;;;;;;;;;;;
|
||||
00BB;;Qm;;;;;;;;;;;;
|
||||
2018;;Qm;;;;;;;;;;;;
|
||||
2019;;Qm;;;;;;;;;;;;
|
||||
201A;;Qm;;;;;;;;;;;;
|
||||
201B;;Qm;;;;;;;;;;;;
|
||||
201C;;Qm;;;;;;;;;;;;
|
||||
201D;;Qm;;;;;;;;;;;;
|
||||
201E;;Qm;;;;;;;;;;;;
|
||||
201F;;Qm;;;;;;;;;;;;
|
||||
2039;;Qm;;;;;;;;;;;;
|
||||
203A;;Qm;;;;;;;;;;;;
|
||||
300C;;Qm;;;;;;;;;;;;
|
||||
300D;;Qm;;;;;;;;;;;;
|
||||
300E;;Qm;;;;;;;;;;;;
|
||||
300F;;Qm;;;;;;;;;;;;
|
||||
301D;;Qm;;;;;;;;;;;;
|
||||
301E;;Qm;;;;;;;;;;;;
|
||||
301F;;Qm;;;;;;;;;;;;
|
||||
FE41;;Qm;;;;;;;;;;;;
|
||||
FE42;;Qm;;;;;;;;;;;;
|
||||
FE43;;Qm;;;;;;;;;;;;
|
||||
FE44;;Qm;;;;;;;;;;;;
|
||||
FF02;;Qm;;;;;;;;;;;;
|
||||
FF07;;Qm;;;;;;;;;;;;
|
||||
FF62;;Qm;;;;;;;;;;;;
|
||||
FF63;;Qm;;;;;;;;;;;;
|
|
@ -0,0 +1,207 @@
|
|||
#
|
||||
# $Id: UCDATAREADME.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
|
||||
MUTT UCData Package 1.9
|
||||
-----------------------
|
||||
|
||||
This is a package that supports ctype-like operations for Unicode UCS-2 text
|
||||
(and surrogates), case mapping, and decomposition lookup. To use it, you will
|
||||
need to get the "UnicodeData-2.0.14.txt" (or later) file from the Unicode Web
|
||||
or FTP site.
|
||||
|
||||
This package consists of two parts:
|
||||
|
||||
1. A program called "ucgendat" which generates five data files from the
|
||||
UnicodeData-2.*.txt file. The files are:
|
||||
|
||||
A. case.dat - the case mappings.
|
||||
B. ctype.dat - the character property tables.
|
||||
C. decomp.dat - the character decompositions.
|
||||
D. cmbcl.dat - the non-zero combining classes.
|
||||
E. num.dat - the codes representing numbers.
|
||||
|
||||
2. The "ucdata.[ch]" files which implement the functions needed to
|
||||
check to see if a character matches groups of properties, to map between
|
||||
upper, lower, and title case, to look up the decomposition of a
|
||||
character, look up the combining class of a character, and get the number
|
||||
value of a character.
|
||||
|
||||
A short reference to the functions available is in the "api.txt" file.
|
||||
|
||||
Techie Details
|
||||
==============
|
||||
|
||||
The "ucgendat" program parses files from the command line which are all in the
|
||||
Unicode Character Database (UCDB) format. An additional properties file,
|
||||
"MUTTUCData.txt", provides some extra properties for some characters.
|
||||
|
||||
The program looks for the two character properties fields (2 and 4), the
|
||||
combining class field (3), the decomposition field (5), the numeric value
|
||||
field (8), and the case mapping fields (12, 13, and 14). The decompositions
|
||||
are recursively expanded before being written out.
|
||||
|
||||
The decomposition table contains all the canonical decompositions. This means
|
||||
all decompositions that do not have tags such as "<compat>" or "<font>".
|
||||
|
||||
The data is almost all stored as unsigned longs (32-bits assumed) and the
|
||||
routines that load the data take care of endian swaps when necessary. This
|
||||
also means that surrogates (>= 0x10000) can be placed in the data files the
|
||||
"ucgendat" program parses.
|
||||
|
||||
The data is written as external files and broken into five parts so it can be
|
||||
selectively updated at runtime if necessary.
|
||||
|
||||
The data files currently generated from the "ucgendat" program total about 56K
|
||||
in size all together.
|
||||
|
||||
The format of the binary data files is documented in the "format.txt" file.
|
||||
|
||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
||||
13 December 1998
|
||||
|
||||
CHANGES
|
||||
=======
|
||||
|
||||
Version 1.9
|
||||
-----------
|
||||
1. Fixed a problem with an incorrect amount of storage being allocated for the
|
||||
combining class nodes.
|
||||
|
||||
2. Fixed an invalid initialization in the number code.
|
||||
|
||||
3. Changed the Java template file formatting a bit.
|
||||
|
||||
4. Added tables and function for getting decompositions in the Java class.
|
||||
|
||||
Version 1.8
|
||||
-----------
|
||||
1. Fixed a problem with adding certain ranges.
|
||||
|
||||
2. Added two more macros for testing for identifiers.
|
||||
|
||||
3. Tested with the UnicodeData-2.1.5.txt file.
|
||||
|
||||
Version 1.7
|
||||
-----------
|
||||
1. Fixed a problem with looking up decompositions in "ucgendat."
|
||||
|
||||
Version 1.6
|
||||
-----------
|
||||
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
|
||||
|
||||
2. Changed the "ucgendat.c" program a little to automatically align the
|
||||
property data on a 4-byte boundary when new properties are added.
|
||||
|
||||
3. Changed the "ucgendat.c" programs to only generate canonical
|
||||
decompositions.
|
||||
|
||||
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
|
||||
initial and final punctuation characters.
|
||||
|
||||
5. Minor additions and changes to the documentation.
|
||||
|
||||
Version 1.5
|
||||
-----------
|
||||
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
|
||||
platforms.
|
||||
|
||||
2. Wrapped the unistd.h include so it won't be included when compiled under
|
||||
Win32.
|
||||
|
||||
3. Fixed a bad range check for hex digits in ucgendat.c.
|
||||
|
||||
4. Fixed a bad endian swap for combining classes.
|
||||
|
||||
5. Added code to make a number table and associated lookup functions.
|
||||
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
|
||||
function is to maintain compatibility with John Cowan's "uctype" package.
|
||||
|
||||
Version 1.4
|
||||
-----------
|
||||
1. Fixed a bug with adding a range.
|
||||
|
||||
2. Fixed a bug with inserting a range in order.
|
||||
|
||||
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
|
||||
|
||||
4. Added the missing unload for the combining class data.
|
||||
|
||||
5. Fixed a bad macro placement in ucisweak().
|
||||
|
||||
Version 1.3
|
||||
-----------
|
||||
1. Bug with case mapping calculations fixed.
|
||||
|
||||
2. Bug with empty character property entries fixed.
|
||||
|
||||
3. Bug with incorrect type in the combining class lookup fixed.
|
||||
|
||||
4. Some corrections done to api.txt.
|
||||
|
||||
5. Bug in certain character property lookups fixed.
|
||||
|
||||
6. Added a character property table that records the defined characters.
|
||||
|
||||
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
|
||||
|
||||
Version 1.2
|
||||
-----------
|
||||
1. Added code to ucgendat to generate a combining class table.
|
||||
|
||||
2. Fixed an endian problem with the byte count of decompositions.
|
||||
|
||||
3. Fixed some minor problems in the "format.txt" file.
|
||||
|
||||
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
|
||||
|
||||
5. Added API function to get combining class.
|
||||
|
||||
6. Changed the open mode to "rb" so binary data files will be opened correctly
|
||||
on DOS/WIN as well as other platforms.
|
||||
|
||||
7. Added the "api.txt" file.
|
||||
|
||||
Version 1.1
|
||||
-----------
|
||||
1. Added ucisxdigit() which I overlooked.
|
||||
|
||||
2. Added UC_LT to the ucisalpha() macro which I overlooked.
|
||||
|
||||
3. Change uciscntrl() to include UC_CF.
|
||||
|
||||
4. Added ucisisocntrl() and ucisfmtcntrl() macros.
|
||||
|
||||
5. Added a ucisblank() which I overlooked.
|
||||
|
||||
6. Added missing properties to ucissymbol() and ucisnumber().
|
||||
|
||||
7. Added ucisgraph() and ucisprint().
|
||||
|
||||
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
|
||||
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
|
||||
mirroring.
|
||||
|
||||
9. Added another property called "Ss" which includes control characters
|
||||
traditionally seen as spaces in the isspace() macro.
|
||||
|
||||
10. Added a bunch of macros to be API compatible with John Cowan's package.
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
================
|
||||
|
||||
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
|
||||
missing things and giving me stuff, particularly a bunch of new macros.
|
||||
|
||||
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
|
||||
various bugs.
|
||||
|
||||
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
|
||||
out that file modes need to have "b" for DOS/WIN machines, pointing out
|
||||
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
|
||||
|
||||
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
|
||||
incomplete decompositions to be generated by the "ucgendat" program.
|
||||
|
||||
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
|
||||
error and an initialization error.
|
|
@ -0,0 +1,243 @@
|
|||
#
|
||||
# $Id: format.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
|
||||
CHARACTER DATA
|
||||
==============
|
||||
|
||||
This package generates some data files that contain character properties useful
|
||||
for text processing.
|
||||
|
||||
CHARACTER PROPERTIES
|
||||
====================
|
||||
|
||||
The first data file is called "ctype.dat" and contains a compressed form of
|
||||
the character properties found in the Unicode Character Database (UCDB).
|
||||
Additional properties can be specified in limited UCDB format in another file
|
||||
to avoid modifying the original UCDB.
|
||||
|
||||
The following is a property name and code table to be used with the character
|
||||
data:
|
||||
|
||||
NAME CODE DESCRIPTION
|
||||
---------------------
|
||||
Mn 0 Mark, Non-Spacing
|
||||
Mc 1 Mark, Spacing Combining
|
||||
Me 2 Mark, Enclosing
|
||||
Nd 3 Number, Decimal Digit
|
||||
Nl 4 Number, Letter
|
||||
No 5 Number, Other
|
||||
Zs 6 Separator, Space
|
||||
Zl 7 Separator, Line
|
||||
Zp 8 Separator, Paragraph
|
||||
Cc 9 Other, Control
|
||||
Cf 10 Other, Format
|
||||
Cs 11 Other, Surrogate
|
||||
Co 12 Other, Private Use
|
||||
Cn 13 Other, Not Assigned
|
||||
Lu 14 Letter, Uppercase
|
||||
Ll 15 Letter, Lowercase
|
||||
Lt 16 Letter, Titlecase
|
||||
Lm 17 Letter, Modifier
|
||||
Lo 18 Letter, Other
|
||||
Pc 19 Punctuation, Connector
|
||||
Pd 20 Punctuation, Dash
|
||||
Ps 21 Punctuation, Open
|
||||
Pe 22 Punctuation, Close
|
||||
Po 23 Punctuation, Other
|
||||
Sm 24 Symbol, Math
|
||||
Sc 25 Symbol, Currency
|
||||
Sk 26 Symbol, Modifier
|
||||
So 27 Symbol, Other
|
||||
L 28 Left-To-Right
|
||||
R 29 Right-To-Left
|
||||
EN 30 European Number
|
||||
ES 31 European Number Separator
|
||||
ET 32 European Number Terminator
|
||||
AN 33 Arabic Number
|
||||
CS 34 Common Number Separator
|
||||
B 35 Block Separator
|
||||
S 36 Segment Separator
|
||||
WS 37 Whitespace
|
||||
ON 38 Other Neutrals
|
||||
Pi 47 Punctuation, Initial
|
||||
Pf 48 Punctuation, Final
|
||||
#
|
||||
# Implementation specific properties.
|
||||
#
|
||||
Cm 39 Composite
|
||||
Nb 40 Non-Breaking
|
||||
Sy 41 Symmetric (characters which are part of open/close pairs)
|
||||
Hd 42 Hex Digit
|
||||
Qm 43 Quote Mark
|
||||
Mr 44 Mirroring
|
||||
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
|
||||
Cp 46 Defined character
|
||||
|
||||
The actual binary data is formatted as follows:
|
||||
|
||||
Assumptions: unsigned short is at least 16-bits in size and unsigned long
|
||||
is at least 32-bits in size.
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short OffsetArraySize
|
||||
unsigned long Bytes
|
||||
unsigned short Offsets[OffsetArraySize + 1]
|
||||
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
|
||||
|
||||
The Bytes field provides the total byte count used for the Offsets[] and
|
||||
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
|
||||
there is always one extra node on the end to hold the final index of the
|
||||
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
|
||||
representing a range of Unicode characters. The pairs are arranged in
|
||||
increasing order by the first character code in the range.
|
||||
|
||||
Determining if a particular character is in the property list requires a
|
||||
simple binary search to determine if a character is in any of the ranges
|
||||
for the property.
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
|
||||
machine with a different endian order and the values must be byte-swapped.
|
||||
|
||||
To swap a 16-bit value:
|
||||
c = (c >> 8) | ((c & 0xff) << 8)
|
||||
|
||||
To swap a 32-bit value:
|
||||
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
|
||||
(((c >> 16) & 0xff) << 8) | (c >> 24)
|
||||
|
||||
CASE MAPPINGS
|
||||
=============
|
||||
|
||||
The next data file is called "case.dat" and contains three case mapping tables
|
||||
in the following order: upper, lower, and title case. Each table is in
|
||||
increasing order by character code and each mapping contains 3 unsigned longs
|
||||
which represent the possible mappings.
|
||||
|
||||
The format for the binary form of these tables is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumMappingNodes, count of all mapping nodes
|
||||
unsigned short CaseTableSizes[2], upper and lower mapping node counts
|
||||
unsigned long CaseTables[NumMappingNodes * 3]
|
||||
|
||||
The starting indexes of the case tables are calculated as follows:
|
||||
|
||||
UpperIndex = 0;
|
||||
LowerIndex = CaseTableSizes[0] * 3;
|
||||
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
|
||||
|
||||
The order of the fields for the three tables is:
|
||||
|
||||
Upper case
|
||||
----------
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
unsigned long title;
|
||||
|
||||
Lower case
|
||||
----------
|
||||
unsigned long lower;
|
||||
unsigned long upper;
|
||||
unsigned long title;
|
||||
|
||||
Title case
|
||||
----------
|
||||
unsigned long title;
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
Because the tables are in increasing order by character code, locating a
|
||||
mapping requires a simple binary search on one of the 3 codes that make up
|
||||
each node.
|
||||
|
||||
It is important to note that there can be at most 65535 mapping nodes, which,
|
||||
divided into 3 portions, allows 21845 nodes for each case mapping table. The
|
||||
distribution of mappings may be more or less than 21845 per table, but only
|
||||
65535 are allowed in total because NumMappingNodes is usually a 16-bit number.
|
||||
|
||||
DECOMPOSITIONS
|
||||
==============
|
||||
|
||||
The next data file is called "decomp.dat" and contains the decomposition data
|
||||
for all characters whose decompositions contain more than one character and
|
||||
are *not* compatibility decompositions. Compatibility decompositions are
|
||||
signaled in the UCDB format by the use of the <compat> tag in the
|
||||
decomposition field. Each list of character codes represents a full
|
||||
decomposition of a composite character. The nodes are arranged in increasing
|
||||
order by character code.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumDecompNodes, count of all decomposition nodes
|
||||
unsigned long Bytes
|
||||
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
|
||||
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The DecompNodes[] array consists of pairs of unsigned longs, the first of
|
||||
which is the character code and the second is the initial index of the list
|
||||
of character codes representing the decomposition.
|
||||
|
||||
Locating the decomposition of a composite character requires a binary search
|
||||
for a character code in the DecompNodes[] array and using its index to
|
||||
locate the start of the decomposition. The length of the decomposition list
|
||||
is the index in the following element in DecompNodes[] minus the current
|
||||
index.
|
||||
|
||||
COMBINING CLASSES
|
||||
=================
|
||||
|
||||
The fourth data file is called "cmbcl.dat" and contains the characters with
|
||||
non-zero combining classes.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumCCLNodes
|
||||
unsigned long Bytes
|
||||
unsigned long CCLNodes[NumCCLNodes * 3]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The CCLNodes[] array consists of groups of three unsigned longs. The first
|
||||
and second are the beginning and ending of a range and the third is the
|
||||
combining class of that range.
|
||||
|
||||
If a character is not found in this table, then the combining class is
|
||||
assumed to be 0.
|
||||
|
||||
It is important to note that only 65535 distinct ranges plus combining class
|
||||
can be specified because the NumCCLNodes is usually a 16-bit number.
|
||||
|
||||
NUMBER TABLE
|
||||
============
|
||||
|
||||
The final data file is called "num.dat" and contains the characters that have
|
||||
a numeric value associated with them.
|
||||
|
||||
The format for the binary form of the table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumNumberNodes
|
||||
unsigned long Bytes
|
||||
unsigned long NumberNodes[NumNumberNodes]
|
||||
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
|
||||
/ sizeof(short)]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The NumberNodes array contains pairs of values, the first of which is the
|
||||
character code and the second an index into the ValueNodes array. The
|
||||
ValueNodes array contains pairs of integers which represent the numerator
|
||||
and denominator of the numeric value of the character. If the character
|
||||
happens to map to an integer, both the values in ValueNodes will be the
|
||||
same.
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче