This commit is contained in:
ftang%netscape.com 1999-01-06 01:46:32 +00:00
Родитель 14c7cee8fe
Коммит 001455720b
6 изменённых файлов: 3583 добавлений и 0 удалений

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,306 @@
/*
* Copyright 1996, 1997, 1998 Computing Research Labs,
* New Mexico State University
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
* OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
* THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef _h_ucdata
#define _h_ucdata
/*
* $Id: ucdata.h,v 1.1 1999/01/06 01:46:32 ftang%netscape.com Exp $
*/
#ifdef __cplusplus
extern "C" {
#endif
/*
 * Prototype wrapper used by every declaration in this header: written as
 * __((args)), it yields the full parameter list when __STDC__ is defined and
 * an empty, K&R-style parameter list otherwise.
 */
#undef __
#if !defined(__STDC__)
#  define __(x) ()
#else
#  define __(x) x
#endif
/* Version string of the UCData package this header belongs to. */
#define UCDATA_VERSION "1.9"
/**************************************************************************
*
* Masks and macros for character properties.
*
**************************************************************************/
/*
 * Property bits for the `mask1' argument of ucisprop().  Each property owns
 * exactly one bit, so several properties can be OR'ed into a single mask.
 */

/* Marks. */
#define UC_MN   0x00000001   /* Mark, Non-Spacing */
#define UC_MC   0x00000002   /* Mark, Spacing Combining */
#define UC_ME   0x00000004   /* Mark, Enclosing */

/* Numbers. */
#define UC_ND   0x00000008   /* Number, Decimal Digit */
#define UC_NL   0x00000010   /* Number, Letter */
#define UC_NO   0x00000020   /* Number, Other */

/* Separators. */
#define UC_ZS   0x00000040   /* Separator, Space */
#define UC_ZL   0x00000080   /* Separator, Line */
#define UC_ZP   0x00000100   /* Separator, Paragraph */

/* Other. */
#define UC_CC   0x00000200   /* Other, Control */
#define UC_CF   0x00000400   /* Other, Format */
#define UC_OS   0x00000800   /* Other, Surrogate */
#define UC_CO   0x00001000   /* Other, Private Use */
#define UC_CN   0x00002000   /* Other, Not Assigned */

/* Letters. */
#define UC_LU   0x00004000   /* Letter, Uppercase */
#define UC_LL   0x00008000   /* Letter, Lowercase */
#define UC_LT   0x00010000   /* Letter, Titlecase */
#define UC_LM   0x00020000   /* Letter, Modifier */
#define UC_LO   0x00040000   /* Letter, Other */

/* Punctuation. */
#define UC_PC   0x00080000   /* Punctuation, Connector */
#define UC_PD   0x00100000   /* Punctuation, Dash */
#define UC_PS   0x00200000   /* Punctuation, Open */
#define UC_PE   0x00400000   /* Punctuation, Close */
#define UC_PO   0x00800000   /* Punctuation, Other */

/* Symbols. */
#define UC_SM   0x01000000   /* Symbol, Math */
#define UC_SC   0x02000000   /* Symbol, Currency */
#define UC_SK   0x04000000   /* Symbol, Modifier */
#define UC_SO   0x08000000   /* Symbol, Other */

/* Directionality (continued in the mask2 group). */
#define UC_L    0x10000000   /* Left-To-Right */
#define UC_R    0x20000000   /* Right-To-Left */
#define UC_EN   0x40000000   /* European Number */
#define UC_ES   0x80000000   /* European Number Separator */
/*
 * Property bits for the `mask2' argument of ucisprop().  These reuse the low
 * bit values of the mask1 group, so they are only meaningful when passed as
 * the third argument.
 */

/* Remaining directionality properties. */
#define UC_ET   0x00000001   /* European Number Terminator */
#define UC_AN   0x00000002   /* Arabic Number */
#define UC_CS   0x00000004   /* Common Number Separator */
#define UC_B    0x00000008   /* Block Separator */
#define UC_S    0x00000010   /* Segment Separator */
#define UC_WS   0x00000020   /* Whitespace */
#define UC_ON   0x00000040   /* Other Neutrals */

/* Implementation specific character properties. */
#define UC_CM   0x00000080   /* Composite */
#define UC_NB   0x00000100   /* Non-Breaking */
#define UC_SY   0x00000200   /* Symmetric */
#define UC_HD   0x00000400   /* Hex Digit */
#define UC_QM   0x00000800   /* Quote Mark */
#define UC_MR   0x00001000   /* Mirroring */
#define UC_SS   0x00002000   /* Space, other */
#define UC_CP   0x00004000   /* Defined */

/* Added for UnicodeData-2.1.3. */
#define UC_PI   0x00008000   /* Punctuation, Initial */
#define UC_PF   0x00010000   /* Punctuation, Final */
/*
 * This is the primary function for testing to see if a character has some set
 * of properties. The macros that test for various character properties all
 * call this function with some set of masks.
 *
 * code  - the character code to test.
 * mask1 - OR of UC_* property bits from the `mask1' group.
 * mask2 - OR of UC_* property bits from the `mask2' group.
 *
 * NOTE(review): the result is presumably non-zero when the character has at
 * least one of the requested properties and 0 otherwise -- confirm against
 * the implementation.
 */
extern int ucisprop __((unsigned long code, unsigned long mask1,
unsigned long mask2));
/*
 * ctype-like classification macros.  Each one expands to a single ucisprop()
 * call, with `cc' being the character code to test.  The second argument
 * takes only mask1 property bits and the third only mask2 property bits: the
 * two groups reuse the same bit values, so a bit passed in the wrong argument
 * silently tests a different property.
 */
#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
#define ucisdigit(cc) ucisprop(cc, UC_ND, 0)
#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0)
/*
 * UC_SS ("Space, other") is a mask2 bit.  Its value, 0x00002000, means UC_CN
 * (Other, Not Assigned) when it appears in mask1, so it must be passed as the
 * third argument.
 */
#define ucisspace(cc) ucisprop(cc, UC_ZS, UC_SS)
#define ucisblank(cc) ucisprop(cc, UC_ZS, 0)
/*
 * NOTE(review): UC_PC (Punctuation, Connector) is not tested by ucispunct()
 * although ucisgraph() and ucisprint() include it -- confirm this is
 * intended.
 */
#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
/* Marks, numbers, letters, punctuation and symbols. */
#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO, UC_PI|UC_PF)
/* Everything ucisgraph() accepts, plus space separators. */
#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
                               UC_SO|UC_ZS, UC_PI|UC_PF)
#define ucisupper(cc) ucisprop(cc, UC_LU, 0)
#define ucislower(cc) ucisprop(cc, UC_LL, 0)
#define ucistitle(cc) ucisprop(cc, UC_LT, 0)
#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD)
#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0)
#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0)
#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0)
#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0)
#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0)
#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0)
#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0)
#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI)
#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF)
#define uciscomposite(cc) ucisprop(cc, 0, UC_CM)
/* Same test as ucisxdigit(). */
#define ucishex(cc) ucisprop(cc, 0, UC_HD)
#define ucisquote(cc) ucisprop(cc, 0, UC_QM)
#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY)
#define ucismirroring(cc) ucisprop(cc, 0, UC_MR)
#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB)
/*
 * Directionality tests.  Properties living in the mask1 group are passed as
 * the second ucisprop() argument, those in the mask2 group as the third.
 */
#define ucisrtl(cc)       ucisprop(cc, UC_R, 0)
#define ucisltr(cc)       ucisprop(cc, UC_L, 0)
#define ucisstrong(cc)    ucisprop(cc, UC_L | UC_R, 0)
#define ucisweak(cc)      ucisprop(cc, UC_EN | UC_ES, UC_ET | UC_AN | UC_CS)
#define ucisneutral(cc)   ucisprop(cc, 0, UC_B | UC_S | UC_WS | UC_ON)
#define ucisseparator(cc) ucisprop(cc, 0, UC_B | UC_S)
/*
 * Other macros inspired by John Cowan.  Each expands to a ucisprop() call
 * testing the named general category (or set of categories).
 */
#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
#define ucismath(cc) ucisprop(cc, UC_SM, 0)
#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)
/* Characters that may begin an identifier. */
#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
/* Characters that may appear anywhere in an identifier. */
#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
                                   UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
/*
 * The negation is wrapped in parentheses so that the macro always behaves as
 * one primary expression, whatever operators surround it at the call site.
 */
#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
/*
* Other miscellaneous character property macros.
*/
/*
 * Pure range test: true when cc lies in 0x4e00..0x9fff or 0xf900..0xfaff.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define ucishan(cc) ((0x4e00 <= (cc) && (cc) <= 0x9fff) ||\
                     (0xf900 <= (cc) && (cc) <= 0xfaff))
/*
 * Range test for precomposed Hangul syllables.  The 11172 syllables occupy
 * U+AC00..U+D7A3; codes above U+D7A3 are not syllables and cannot be
 * decomposed arithmetically into jamo, so they are excluded.  The argument
 * is evaluated more than once, so it must be free of side effects.
 */
#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7a3)
/**************************************************************************
*
* Functions for case conversion.
*
**************************************************************************/
/*
 * Case conversion to upper, lower and title case.  Each function takes a
 * character code and returns a character code.
 * NOTE(review): presumably the input code is returned unchanged when no
 * mapping exists -- confirm against the implementation.
 */
extern unsigned long uctoupper __((unsigned long code));
extern unsigned long uctolower __((unsigned long code));
extern unsigned long uctotitle __((unsigned long code));
/**************************************************************************
*
* Functions for getting decompositions.
*
**************************************************************************/
/*
 * This routine determines if the code has a decomposition. If it returns 0,
 * there is no decomposition. Any other value indicates a decomposition was
 * returned.
 * NOTE(review): presumably *num receives the number of codes in the
 * decomposition and *decomp a pointer to them; who owns that storage is not
 * visible from this header -- confirm against the implementation.
 */
extern int ucdecomp __((unsigned long code, unsigned long *num,
unsigned long **decomp));
/*
 * If the code is a Hangul syllable, this routine decomposes it into the array
 * passed. The array size should be at least 3.
 * NOTE(review): presumably *num receives the number of entries written to
 * decomp[] and the return value is 0 when the code is not a Hangul
 * syllable -- confirm against the implementation.
 */
extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
unsigned long decomp[]));
/**************************************************************************
*
* Functions for getting combining classes.
*
**************************************************************************/
/*
 * This will return the combining class for a character to be used with the
 * Canonical Ordering algorithm.
 * NOTE(review): presumably 0 is returned for characters that have no entry
 * in the combining class table -- confirm against the implementation.
 */
extern unsigned long uccombining_class __((unsigned long code));
/**************************************************************************
*
* Functions for getting numbers and digits.
*
**************************************************************************/
/*
 * Numeric value of a character, expressed as a fraction.
 */
struct ucnumber {
int numerator;
int denominator;
};
/*
 * Lookup routines that store their result through the pointer argument.
 * NOTE(review): the int result presumably reports whether the code has a
 * numeric (or digit) value -- confirm against the implementation.
 */
extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
extern int ucdigit_lookup __((unsigned long code, int *digit));
/*
 * For compatibility with John Cowan's "uctype" package.  These variants
 * return the value directly instead of through a pointer.
 * NOTE(review): what they return for a code without a numeric value is not
 * visible from this header -- confirm against the implementation.
 */
extern struct ucnumber ucgetnumber __((unsigned long code));
extern int ucgetdigit __((unsigned long code));
/**************************************************************************
*
* Functions library initialization and cleanup.
*
**************************************************************************/
/*
 * Bit flags naming the individual data tables.  OR them together to build
 * the `mask' argument of ucdata_load() and its companions.
 */
#define UCDATA_CASE    0x01   /* case mappings */
#define UCDATA_CTYPE   0x02   /* character properties */
#define UCDATA_DECOMP  0x04   /* decompositions */
#define UCDATA_CMBCL   0x08   /* combining classes */
#define UCDATA_NUM     0x10   /* numbers */
/* Every table at once. */
#define UCDATA_ALL \
    (UCDATA_CASE | UCDATA_CTYPE | UCDATA_DECOMP | UCDATA_CMBCL | UCDATA_NUM)
/*
 * Functions to load, unload, and reload specific data files.
 *
 * mask  - OR of UCDATA_* values selecting the data tables to act on.
 * paths - NOTE(review): presumably a list of directories searched for the
 *         data files; the separator and search order are not visible from
 *         this header -- confirm against the implementation.
 *
 * None of these functions returns a status, so a failed load cannot be
 * detected through the return value.
 */
extern void ucdata_load __((char *paths, int mask));
extern void ucdata_unload __((int mask));
extern void ucdata_reload __((char *paths, int mask));
/*
 * Deprecated functions, now just compatibility macros.  They load or unload
 * every data table by passing UCDATA_ALL to ucdata_load() / ucdata_unload().
 */
#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
#undef __
#ifdef __cplusplus
}
#endif
#endif /* _h_ucdata */

Просмотреть файл

@ -0,0 +1,208 @@
#
# $Id: MUTTUCData.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
#
# Copyright 1996, 1997, 1998 Computing Research Labs,
# New Mexico State University
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
#
# Implementation specific character properties.
#
#
# Space, other.
#
0009;;Ss;;;;;;;;;;;;
000A;;Ss;;;;;;;;;;;;
000B;;Ss;;;;;;;;;;;;
000C;;Ss;;;;;;;;;;;;
000D;;Ss;;;;;;;;;;;;
#
# Non-breaking.
#
00A0;;Nb;;;;;;;;;;;;
2007;;Nb;;;;;;;;;;;;
2011;;Nb;;;;;;;;;;;;
FEFF;;Nb;;;;;;;;;;;;
#
# Symmetric.
#
0028;;Sy;;;;;;;;;;;;
0029;;Sy;;;;;;;;;;;;
005B;;Sy;;;;;;;;;;;;
005D;;Sy;;;;;;;;;;;;
007B;;Sy;;;;;;;;;;;;
007D;;Sy;;;;;;;;;;;;
00AB;;Sy;;;;;;;;;;;;
00BB;;Sy;;;;;;;;;;;;
0F3A;;Sy;;;;;;;;;;;;
0F3B;;Sy;;;;;;;;;;;;
0F3C;;Sy;;;;;;;;;;;;
0F3D;;Sy;;;;;;;;;;;;
0F3E;;Sy;;;;;;;;;;;;
0F3F;;Sy;;;;;;;;;;;;
2018;;Sy;;;;;;;;;;;;
2019;;Sy;;;;;;;;;;;;
201A;;Sy;;;;;;;;;;;;
201B;;Sy;;;;;;;;;;;;
201C;;Sy;;;;;;;;;;;;
201D;;Sy;;;;;;;;;;;;
201E;;Sy;;;;;;;;;;;;
201F;;Sy;;;;;;;;;;;;
2039;;Sy;;;;;;;;;;;;
203A;;Sy;;;;;;;;;;;;
2045;;Sy;;;;;;;;;;;;
2046;;Sy;;;;;;;;;;;;
207D;;Sy;;;;;;;;;;;;
207E;;Sy;;;;;;;;;;;;
208D;;Sy;;;;;;;;;;;;
208E;;Sy;;;;;;;;;;;;
2329;;Sy;;;;;;;;;;;;
232A;;Sy;;;;;;;;;;;;
3008;;Sy;;;;;;;;;;;;
3009;;Sy;;;;;;;;;;;;
300A;;Sy;;;;;;;;;;;;
300B;;Sy;;;;;;;;;;;;
300C;;Sy;;;;;;;;;;;;
300D;;Sy;;;;;;;;;;;;
300E;;Sy;;;;;;;;;;;;
300F;;Sy;;;;;;;;;;;;
3010;;Sy;;;;;;;;;;;;
3011;;Sy;;;;;;;;;;;;
3014;;Sy;;;;;;;;;;;;
3015;;Sy;;;;;;;;;;;;
3016;;Sy;;;;;;;;;;;;
3017;;Sy;;;;;;;;;;;;
3018;;Sy;;;;;;;;;;;;
3019;;Sy;;;;;;;;;;;;
301A;;Sy;;;;;;;;;;;;
301B;;Sy;;;;;;;;;;;;
301D;;Sy;;;;;;;;;;;;
301E;;Sy;;;;;;;;;;;;
FD3E;;Sy;;;;;;;;;;;;
FD3F;;Sy;;;;;;;;;;;;
FE35;;Sy;;;;;;;;;;;;
FE36;;Sy;;;;;;;;;;;;
FE37;;Sy;;;;;;;;;;;;
FE38;;Sy;;;;;;;;;;;;
FE39;;Sy;;;;;;;;;;;;
FE3A;;Sy;;;;;;;;;;;;
FE3B;;Sy;;;;;;;;;;;;
FE3C;;Sy;;;;;;;;;;;;
FE3D;;Sy;;;;;;;;;;;;
FE3E;;Sy;;;;;;;;;;;;
FE3F;;Sy;;;;;;;;;;;;
FE40;;Sy;;;;;;;;;;;;
FE41;;Sy;;;;;;;;;;;;
FE42;;Sy;;;;;;;;;;;;
FE43;;Sy;;;;;;;;;;;;
FE44;;Sy;;;;;;;;;;;;
FE59;;Sy;;;;;;;;;;;;
FE5A;;Sy;;;;;;;;;;;;
FE5B;;Sy;;;;;;;;;;;;
FE5C;;Sy;;;;;;;;;;;;
FE5D;;Sy;;;;;;;;;;;;
FE5E;;Sy;;;;;;;;;;;;
FF08;;Sy;;;;;;;;;;;;
FF09;;Sy;;;;;;;;;;;;
FF3B;;Sy;;;;;;;;;;;;
FF3D;;Sy;;;;;;;;;;;;
FF5B;;Sy;;;;;;;;;;;;
FF5D;;Sy;;;;;;;;;;;;
FF62;;Sy;;;;;;;;;;;;
FF63;;Sy;;;;;;;;;;;;
#
# Hex digit.
#
0030;;Hd;;;;;;;;;;;;
0031;;Hd;;;;;;;;;;;;
0032;;Hd;;;;;;;;;;;;
0033;;Hd;;;;;;;;;;;;
0034;;Hd;;;;;;;;;;;;
0035;;Hd;;;;;;;;;;;;
0036;;Hd;;;;;;;;;;;;
0037;;Hd;;;;;;;;;;;;
0038;;Hd;;;;;;;;;;;;
0039;;Hd;;;;;;;;;;;;
0041;;Hd;;;;;;;;;;;;
0042;;Hd;;;;;;;;;;;;
0043;;Hd;;;;;;;;;;;;
0044;;Hd;;;;;;;;;;;;
0045;;Hd;;;;;;;;;;;;
0046;;Hd;;;;;;;;;;;;
0061;;Hd;;;;;;;;;;;;
0062;;Hd;;;;;;;;;;;;
0063;;Hd;;;;;;;;;;;;
0064;;Hd;;;;;;;;;;;;
0065;;Hd;;;;;;;;;;;;
0066;;Hd;;;;;;;;;;;;
FF10;;Hd;;;;;;;;;;;;
FF11;;Hd;;;;;;;;;;;;
FF12;;Hd;;;;;;;;;;;;
FF13;;Hd;;;;;;;;;;;;
FF14;;Hd;;;;;;;;;;;;
FF15;;Hd;;;;;;;;;;;;
FF16;;Hd;;;;;;;;;;;;
FF17;;Hd;;;;;;;;;;;;
FF18;;Hd;;;;;;;;;;;;
FF19;;Hd;;;;;;;;;;;;
FF21;;Hd;;;;;;;;;;;;
FF22;;Hd;;;;;;;;;;;;
FF23;;Hd;;;;;;;;;;;;
FF24;;Hd;;;;;;;;;;;;
FF25;;Hd;;;;;;;;;;;;
FF26;;Hd;;;;;;;;;;;;
FF41;;Hd;;;;;;;;;;;;
FF42;;Hd;;;;;;;;;;;;
FF43;;Hd;;;;;;;;;;;;
FF44;;Hd;;;;;;;;;;;;
FF45;;Hd;;;;;;;;;;;;
FF46;;Hd;;;;;;;;;;;;
#
# Quote marks.
#
0022;;Qm;;;;;;;;;;;;
0027;;Qm;;;;;;;;;;;;
00AB;;Qm;;;;;;;;;;;;
00BB;;Qm;;;;;;;;;;;;
2018;;Qm;;;;;;;;;;;;
2019;;Qm;;;;;;;;;;;;
201A;;Qm;;;;;;;;;;;;
201B;;Qm;;;;;;;;;;;;
201C;;Qm;;;;;;;;;;;;
201D;;Qm;;;;;;;;;;;;
201E;;Qm;;;;;;;;;;;;
201F;;Qm;;;;;;;;;;;;
2039;;Qm;;;;;;;;;;;;
203A;;Qm;;;;;;;;;;;;
300C;;Qm;;;;;;;;;;;;
300D;;Qm;;;;;;;;;;;;
300E;;Qm;;;;;;;;;;;;
300F;;Qm;;;;;;;;;;;;
301D;;Qm;;;;;;;;;;;;
301E;;Qm;;;;;;;;;;;;
301F;;Qm;;;;;;;;;;;;
FE41;;Qm;;;;;;;;;;;;
FE42;;Qm;;;;;;;;;;;;
FE43;;Qm;;;;;;;;;;;;
FE44;;Qm;;;;;;;;;;;;
FF02;;Qm;;;;;;;;;;;;
FF07;;Qm;;;;;;;;;;;;
FF62;;Qm;;;;;;;;;;;;
FF63;;Qm;;;;;;;;;;;;

Просмотреть файл

@ -0,0 +1,207 @@
#
# $Id: UCDATAREADME.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
#
MUTT UCData Package 1.9
-----------------------
This is a package that supports ctype-like operations for Unicode UCS-2 text
(and surrogates), case mapping, and decomposition lookup. To use it, you will
need to get the "UnicodeData-2.0.14.txt" (or later) file from the Unicode Web
or FTP site.
This package consists of two parts:
1. A program called "ucgendat" which generates five data files from the
UnicodeData-2.*.txt file. The files are:
A. case.dat - the case mappings.
B. ctype.dat - the character property tables.
C. decomp.dat - the character decompositions.
D. cmbcl.dat - the non-zero combining classes.
E. num.dat - the codes representing numbers.
2. The "ucdata.[ch]" files which implement the functions needed to
check to see if a character matches groups of properties, to map between
upper, lower, and title case, to look up the decomposition of a
character, look up the combining class of a character, and get the number
value of a character.
A short reference to the functions available is in the "api.txt" file.
Techie Details
==============
The "ucgendat" program parses files from the command line which are all in the
Unicode Character Database (UCDB) format. An additional properties file,
"MUTTUCData.txt", provides some extra properties for some characters.
The program looks for the two character properties fields (2 and 4), the
combining class field (3), the decomposition field (5), the numeric value
field (8), and the case mapping fields (12, 13, and 14). The decompositions
are recursively expanded before being written out.
The decomposition table contains all the canonical decompositions. This means
all decompositions that do not have tags such as "<compat>" or "<font>".
The data is almost all stored as unsigned longs (32-bits assumed) and the
routines that load the data take care of endian swaps when necessary. This
also means that character codes beyond the 16-bit range (>= 0x10000, i.e.
those represented by surrogate pairs) can be placed in the data files the
"ucgendat" program parses.
The data is written as external files and broken into five parts so it can be
selectively updated at runtime if necessary.
The data files currently generated from the "ucgendat" program total about 56K
in size all together.
The format of the binary data files is documented in the "format.txt" file.
Mark Leisher <mleisher@crl.nmsu.edu>
13 December 1998
CHANGES
=======
Version 1.9
-----------
1. Fixed a problem with an incorrect amount of storage being allocated for the
combining class nodes.
2. Fixed an invalid initialization in the number code.
3. Changed the Java template file formatting a bit.
4. Added tables and function for getting decompositions in the Java class.
Version 1.8
-----------
1. Fixed a problem with adding certain ranges.
2. Added two more macros for testing for identifiers.
3. Tested with the UnicodeData-2.1.5.txt file.
Version 1.7
-----------
1. Fixed a problem with looking up decompositions in "ucgendat."
Version 1.6
-----------
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
2. Changed the "ucgendat.c" program a little to automatically align the
property data on a 4-byte boundary when new properties are added.
3. Changed the "ucgendat.c" programs to only generate canonical
decompositions.
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
initial and final punctuation characters.
5. Minor additions and changes to the documentation.
Version 1.5
-----------
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
platforms.
2. Wrapped the unistd.h include so it won't be included when compiled under
Win32.
3. Fixed a bad range check for hex digits in ucgendat.c.
4. Fixed a bad endian swap for combining classes.
5. Added code to make a number table and associated lookup functions.
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
function is to maintain compatibility with John Cowan's "uctype" package.
Version 1.4
-----------
1. Fixed a bug with adding a range.
2. Fixed a bug with inserting a range in order.
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
4. Added the missing unload for the combining class data.
5. Fixed a bad macro placement in ucisweak().
Version 1.3
-----------
1. Bug with case mapping calculations fixed.
2. Bug with empty character property entries fixed.
3. Bug with incorrect type in the combining class lookup fixed.
4. Some corrections done to api.txt.
5. Bug in certain character property lookups fixed.
6. Added a character property table that records the defined characters.
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
Version 1.2
-----------
1. Added code to ucgendat to generate a combining class table.
2. Fixed an endian problem with the byte count of decompositions.
3. Fixed some minor problems in the "format.txt" file.
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
5. Added API function to get combining class.
6. Changed the open mode to "rb" so binary data files will be opened correctly
on DOS/WIN as well as other platforms.
7. Added the "api.txt" file.
Version 1.1
-----------
1. Added ucisxdigit() which I overlooked.
2. Added UC_LT to the ucisalpha() macro which I overlooked.
3. Change uciscntrl() to include UC_CF.
4. Added ucisisocntrl() and ucisfmtcntrl() macros.
5. Added a ucisblank() which I overlooked.
6. Added missing properties to ucissymbol() and ucisnumber().
7. Added ucisgraph() and ucisprint().
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
mirroring.
9. Added another property called "Ss" which includes control characters
traditionally seen as spaces in the isspace() macro.
10. Added a bunch of macros to be API compatible with John Cowan's package.
ACKNOWLEDGEMENTS
================
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
missing things and giving me stuff, particularly a bunch of new macros.
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
various bugs.
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
out that file modes need to have "b" for DOS/WIN machines, pointing out
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
incomplete decompositions to be generated by the "ucgendat" program.
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
error and an initialization error.

Просмотреть файл

@ -0,0 +1,243 @@
#
# $Id: format.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
#
CHARACTER DATA
==============
This package generates some data files that contain character properties useful
for text processing.
CHARACTER PROPERTIES
====================
The first data file is called "ctype.dat" and contains a compressed form of
the character properties found in the Unicode Character Database (UCDB).
Additional properties can be specified in limited UCDB format in another file
to avoid modifying the original UCDB.
The following is a property name and code table to be used with the character
data:
NAME CODE DESCRIPTION
---------------------
Mn 0 Mark, Non-Spacing
Mc 1 Mark, Spacing Combining
Me 2 Mark, Enclosing
Nd 3 Number, Decimal Digit
Nl 4 Number, Letter
No 5 Number, Other
Zs 6 Separator, Space
Zl 7 Separator, Line
Zp 8 Separator, Paragraph
Cc 9 Other, Control
Cf 10 Other, Format
Cs 11 Other, Surrogate
Co 12 Other, Private Use
Cn 13 Other, Not Assigned
Lu 14 Letter, Uppercase
Ll 15 Letter, Lowercase
Lt 16 Letter, Titlecase
Lm 17 Letter, Modifier
Lo 18 Letter, Other
Pc 19 Punctuation, Connector
Pd 20 Punctuation, Dash
Ps 21 Punctuation, Open
Pe 22 Punctuation, Close
Po 23 Punctuation, Other
Sm 24 Symbol, Math
Sc 25 Symbol, Currency
Sk 26 Symbol, Modifier
So 27 Symbol, Other
L 28 Left-To-Right
R 29 Right-To-Left
EN 30 European Number
ES 31 European Number Separator
ET 32 European Number Terminator
AN 33 Arabic Number
CS 34 Common Number Separator
B 35 Block Separator
S 36 Segment Separator
WS 37 Whitespace
ON 38 Other Neutrals
Pi 47 Punctuation, Initial
Pf 48 Punctuation, Final
#
# Implementation specific properties.
#
Cm 39 Composite
Nb 40 Non-Breaking
Sy 41 Symmetric (characters which are part of open/close pairs)
Hd 42 Hex Digit
Qm 43 Quote Mark
Mr 44 Mirroring
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
Cp 46 Defined character
The actual binary data is formatted as follows:
Assumptions: unsigned short is at least 16-bits in size and unsigned long
is at least 32-bits in size.
unsigned short ByteOrderMark
unsigned short OffsetArraySize
unsigned long Bytes
unsigned short Offsets[OffsetArraySize + 1]
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
The Bytes field provides the total byte count used for the Offsets[] and
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
there is always one extra node on the end to hold the final index of the
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
representing a range of Unicode characters. The pairs are arranged in
increasing order by the first character code in the range.
Determining if a particular character is in the property list requires a
simple binary search to determine if a character is in any of the ranges
for the property.
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
machine with a different endian order and the values must be byte-swapped.
To swap a 16-bit value:
c = (c >> 8) | ((c & 0xff) << 8)
To swap a 32-bit value:
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
(((c >> 16) & 0xff) << 8) | (c >> 24)
CASE MAPPINGS
=============
The next data file is called "case.dat" and contains three case mapping tables
in the following order: upper, lower, and title case. Each table is in
increasing order by character code and each mapping contains 3 unsigned longs
which represent the possible mappings.
The format for the binary form of these tables is:
unsigned short ByteOrderMark
unsigned short NumMappingNodes, count of all mapping nodes
unsigned short CaseTableSizes[2], upper and lower mapping node counts
unsigned long CaseTables[NumMappingNodes * 3]
The starting indexes of the case tables are calculated as following:
UpperIndex = 0;
LowerIndex = CaseTableSizes[0] * 3;
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
The order of the fields for the three tables are:
Upper case
----------
unsigned long upper;
unsigned long lower;
unsigned long title;
Lower case
----------
unsigned long lower;
unsigned long upper;
unsigned long title;
Title case
----------
unsigned long title;
unsigned long upper;
unsigned long lower;
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
Because the tables are in increasing order by character code, locating a
mapping requires a simple binary search on one of the 3 codes that make up
each node.
It is important to note that there can only be 65535 mapping nodes, because
the node count is stored in a 16-bit field. Divided into 3 portions, this
allows 21845 nodes for each case mapping table. The distribution of mappings
may be more or less than 21845 per table, but only 65535 are allowed in total.
DECOMPOSITIONS
==============
The next data file is called "decomp.dat" and contains the decomposition data
for all characters whose decompositions contain more than one character and
are *not* compatibility decompositions. Compatibility decompositions are
signaled in the UCDB format by the use of the <compat> tag in the
decomposition field. Each list of character codes represents a full
decomposition of a composite character. The nodes are arranged in increasing
order by character code.
The format for the binary form of this table is:
unsigned short ByteOrderMark
unsigned short NumDecompNodes, count of all decomposition nodes
unsigned long Bytes
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The DecompNodes[] array consists of pairs of unsigned longs, the first of
which is the character code and the second is the initial index of the list
of character codes representing the decomposition.
Locating the decomposition of a composite character requires a binary search
for a character code in the DecompNodes[] array and using the index stored
with it to locate the start of the decomposition in the Decomp[] array. The
length of the decomposition list is the index stored in the following node of
DecompNodes[] minus the index stored in the current node.
COMBINING CLASSES
=================
The fourth data file is called "cmbcl.dat" and contains the characters with
non-zero combining classes.
The format for the binary form of this table is:
unsigned short ByteOrderMark
unsigned short NumCCLNodes
unsigned long Bytes
unsigned long CCLNodes[NumCCLNodes * 3]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The CCLNodes[] array consists of groups of three unsigned longs. The first
and second are the beginning and ending of a range and the third is the
combining class of that range.
If a character is not found in this table, then the combining class is
assumed to be 0.
It is important to note that only 65535 distinct ranges plus combining class
can be specified because the NumCCLNodes is usually a 16-bit number.
NUMBER TABLE
============
The final data file is called "num.dat" and contains the characters that have
a numeric value associated with them.
The format for the binary form of the table is:
unsigned short ByteOrderMark
unsigned short NumNumberNodes
unsigned long Bytes
unsigned long NumberNodes[NumNumberNodes]
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
/ sizeof(short)]
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
same way as described in the CHARACTER PROPERTIES section.
The NumberNodes array contains pairs of values, the first of which is the
character code and the second an index into the ValueNodes array. The
ValueNodes array contains pairs of integers which represent the numerator
and denominator of the numeric value of the character. If the character
happens to map to an integer, both the values in ValueNodes will be the
same.

Разница между файлами не показана из-за своего большого размера Загрузить разницу