зеркало из https://github.com/mozilla/pjs.git
move to intl/unicharutil
This commit is contained in:
Родитель
b275208d71
Коммит
38e75adbe9
|
@ -1,208 +0,0 @@
|
|||
#
|
||||
# $Id: MUTTUCData.txt,v 1.1 1999-01-06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
# Copyright 1996, 1997, 1998 Computing Research Labs,
|
||||
# New Mexico State University
|
||||
#
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a
|
||||
# copy of this software and associated documentation files (the "Software"),
|
||||
# to deal in the Software without restriction, including without limitation
|
||||
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the
|
||||
# Software is furnished to do so, subject to the following conditions:
|
||||
#
|
||||
# The above copyright notice and this permission notice shall be included in
|
||||
# all copies or substantial portions of the Software.
|
||||
#
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
|
||||
# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
|
||||
# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
|
||||
# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
#
|
||||
#
|
||||
# Implementation specific character properties.
|
||||
#
|
||||
#
|
||||
# Space, other.
|
||||
#
|
||||
0009;;Ss;;;;;;;;;;;;
|
||||
000A;;Ss;;;;;;;;;;;;
|
||||
000B;;Ss;;;;;;;;;;;;
|
||||
000C;;Ss;;;;;;;;;;;;
|
||||
000D;;Ss;;;;;;;;;;;;
|
||||
#
|
||||
# Non-breaking.
|
||||
#
|
||||
00A0;;Nb;;;;;;;;;;;;
|
||||
2007;;Nb;;;;;;;;;;;;
|
||||
2011;;Nb;;;;;;;;;;;;
|
||||
FEFF;;Nb;;;;;;;;;;;;
|
||||
#
|
||||
# Symmetric.
|
||||
#
|
||||
0028;;Sy;;;;;;;;;;;;
|
||||
0029;;Sy;;;;;;;;;;;;
|
||||
005B;;Sy;;;;;;;;;;;;
|
||||
005D;;Sy;;;;;;;;;;;;
|
||||
007B;;Sy;;;;;;;;;;;;
|
||||
007D;;Sy;;;;;;;;;;;;
|
||||
00AB;;Sy;;;;;;;;;;;;
|
||||
00BB;;Sy;;;;;;;;;;;;
|
||||
0F3A;;Sy;;;;;;;;;;;;
|
||||
0F3B;;Sy;;;;;;;;;;;;
|
||||
0F3C;;Sy;;;;;;;;;;;;
|
||||
0F3D;;Sy;;;;;;;;;;;;
|
||||
0F3E;;Sy;;;;;;;;;;;;
|
||||
0F3F;;Sy;;;;;;;;;;;;
|
||||
2018;;Sy;;;;;;;;;;;;
|
||||
2019;;Sy;;;;;;;;;;;;
|
||||
201A;;Sy;;;;;;;;;;;;
|
||||
201B;;Sy;;;;;;;;;;;;
|
||||
201C;;Sy;;;;;;;;;;;;
|
||||
201D;;Sy;;;;;;;;;;;;
|
||||
201E;;Sy;;;;;;;;;;;;
|
||||
201F;;Sy;;;;;;;;;;;;
|
||||
2039;;Sy;;;;;;;;;;;;
|
||||
203A;;Sy;;;;;;;;;;;;
|
||||
2045;;Sy;;;;;;;;;;;;
|
||||
2046;;Sy;;;;;;;;;;;;
|
||||
207D;;Sy;;;;;;;;;;;;
|
||||
207E;;Sy;;;;;;;;;;;;
|
||||
208D;;Sy;;;;;;;;;;;;
|
||||
208E;;Sy;;;;;;;;;;;;
|
||||
2329;;Sy;;;;;;;;;;;;
|
||||
232A;;Sy;;;;;;;;;;;;
|
||||
3008;;Sy;;;;;;;;;;;;
|
||||
3009;;Sy;;;;;;;;;;;;
|
||||
300A;;Sy;;;;;;;;;;;;
|
||||
300B;;Sy;;;;;;;;;;;;
|
||||
300C;;Sy;;;;;;;;;;;;
|
||||
300D;;Sy;;;;;;;;;;;;
|
||||
300E;;Sy;;;;;;;;;;;;
|
||||
300F;;Sy;;;;;;;;;;;;
|
||||
3010;;Sy;;;;;;;;;;;;
|
||||
3011;;Sy;;;;;;;;;;;;
|
||||
3014;;Sy;;;;;;;;;;;;
|
||||
3015;;Sy;;;;;;;;;;;;
|
||||
3016;;Sy;;;;;;;;;;;;
|
||||
3017;;Sy;;;;;;;;;;;;
|
||||
3018;;Sy;;;;;;;;;;;;
|
||||
3019;;Sy;;;;;;;;;;;;
|
||||
301A;;Sy;;;;;;;;;;;;
|
||||
301B;;Sy;;;;;;;;;;;;
|
||||
301D;;Sy;;;;;;;;;;;;
|
||||
301E;;Sy;;;;;;;;;;;;
|
||||
FD3E;;Sy;;;;;;;;;;;;
|
||||
FD3F;;Sy;;;;;;;;;;;;
|
||||
FE35;;Sy;;;;;;;;;;;;
|
||||
FE36;;Sy;;;;;;;;;;;;
|
||||
FE37;;Sy;;;;;;;;;;;;
|
||||
FE38;;Sy;;;;;;;;;;;;
|
||||
FE39;;Sy;;;;;;;;;;;;
|
||||
FE3A;;Sy;;;;;;;;;;;;
|
||||
FE3B;;Sy;;;;;;;;;;;;
|
||||
FE3C;;Sy;;;;;;;;;;;;
|
||||
FE3D;;Sy;;;;;;;;;;;;
|
||||
FE3E;;Sy;;;;;;;;;;;;
|
||||
FE3F;;Sy;;;;;;;;;;;;
|
||||
FE40;;Sy;;;;;;;;;;;;
|
||||
FE41;;Sy;;;;;;;;;;;;
|
||||
FE42;;Sy;;;;;;;;;;;;
|
||||
FE43;;Sy;;;;;;;;;;;;
|
||||
FE44;;Sy;;;;;;;;;;;;
|
||||
FE59;;Sy;;;;;;;;;;;;
|
||||
FE5A;;Sy;;;;;;;;;;;;
|
||||
FE5B;;Sy;;;;;;;;;;;;
|
||||
FE5C;;Sy;;;;;;;;;;;;
|
||||
FE5D;;Sy;;;;;;;;;;;;
|
||||
FE5E;;Sy;;;;;;;;;;;;
|
||||
FF08;;Sy;;;;;;;;;;;;
|
||||
FF09;;Sy;;;;;;;;;;;;
|
||||
FF3B;;Sy;;;;;;;;;;;;
|
||||
FF3D;;Sy;;;;;;;;;;;;
|
||||
FF5B;;Sy;;;;;;;;;;;;
|
||||
FF5D;;Sy;;;;;;;;;;;;
|
||||
FF62;;Sy;;;;;;;;;;;;
|
||||
FF63;;Sy;;;;;;;;;;;;
|
||||
#
|
||||
# Hex digit.
|
||||
#
|
||||
0030;;Hd;;;;;;;;;;;;
|
||||
0031;;Hd;;;;;;;;;;;;
|
||||
0032;;Hd;;;;;;;;;;;;
|
||||
0033;;Hd;;;;;;;;;;;;
|
||||
0034;;Hd;;;;;;;;;;;;
|
||||
0035;;Hd;;;;;;;;;;;;
|
||||
0036;;Hd;;;;;;;;;;;;
|
||||
0037;;Hd;;;;;;;;;;;;
|
||||
0038;;Hd;;;;;;;;;;;;
|
||||
0039;;Hd;;;;;;;;;;;;
|
||||
0041;;Hd;;;;;;;;;;;;
|
||||
0042;;Hd;;;;;;;;;;;;
|
||||
0043;;Hd;;;;;;;;;;;;
|
||||
0044;;Hd;;;;;;;;;;;;
|
||||
0045;;Hd;;;;;;;;;;;;
|
||||
0046;;Hd;;;;;;;;;;;;
|
||||
0061;;Hd;;;;;;;;;;;;
|
||||
0062;;Hd;;;;;;;;;;;;
|
||||
0063;;Hd;;;;;;;;;;;;
|
||||
0064;;Hd;;;;;;;;;;;;
|
||||
0065;;Hd;;;;;;;;;;;;
|
||||
0066;;Hd;;;;;;;;;;;;
|
||||
FF10;;Hd;;;;;;;;;;;;
|
||||
FF11;;Hd;;;;;;;;;;;;
|
||||
FF12;;Hd;;;;;;;;;;;;
|
||||
FF13;;Hd;;;;;;;;;;;;
|
||||
FF14;;Hd;;;;;;;;;;;;
|
||||
FF15;;Hd;;;;;;;;;;;;
|
||||
FF16;;Hd;;;;;;;;;;;;
|
||||
FF17;;Hd;;;;;;;;;;;;
|
||||
FF18;;Hd;;;;;;;;;;;;
|
||||
FF19;;Hd;;;;;;;;;;;;
|
||||
FF21;;Hd;;;;;;;;;;;;
|
||||
FF22;;Hd;;;;;;;;;;;;
|
||||
FF23;;Hd;;;;;;;;;;;;
|
||||
FF24;;Hd;;;;;;;;;;;;
|
||||
FF25;;Hd;;;;;;;;;;;;
|
||||
FF26;;Hd;;;;;;;;;;;;
|
||||
FF41;;Hd;;;;;;;;;;;;
|
||||
FF42;;Hd;;;;;;;;;;;;
|
||||
FF43;;Hd;;;;;;;;;;;;
|
||||
FF44;;Hd;;;;;;;;;;;;
|
||||
FF45;;Hd;;;;;;;;;;;;
|
||||
FF46;;Hd;;;;;;;;;;;;
|
||||
#
|
||||
# Quote marks.
|
||||
#
|
||||
0022;;Qm;;;;;;;;;;;;
|
||||
0027;;Qm;;;;;;;;;;;;
|
||||
00AB;;Qm;;;;;;;;;;;;
|
||||
00BB;;Qm;;;;;;;;;;;;
|
||||
2018;;Qm;;;;;;;;;;;;
|
||||
2019;;Qm;;;;;;;;;;;;
|
||||
201A;;Qm;;;;;;;;;;;;
|
||||
201B;;Qm;;;;;;;;;;;;
|
||||
201C;;Qm;;;;;;;;;;;;
|
||||
201D;;Qm;;;;;;;;;;;;
|
||||
201E;;Qm;;;;;;;;;;;;
|
||||
201F;;Qm;;;;;;;;;;;;
|
||||
2039;;Qm;;;;;;;;;;;;
|
||||
203A;;Qm;;;;;;;;;;;;
|
||||
300C;;Qm;;;;;;;;;;;;
|
||||
300D;;Qm;;;;;;;;;;;;
|
||||
300E;;Qm;;;;;;;;;;;;
|
||||
300F;;Qm;;;;;;;;;;;;
|
||||
301D;;Qm;;;;;;;;;;;;
|
||||
301E;;Qm;;;;;;;;;;;;
|
||||
301F;;Qm;;;;;;;;;;;;
|
||||
FE41;;Qm;;;;;;;;;;;;
|
||||
FE42;;Qm;;;;;;;;;;;;
|
||||
FE43;;Qm;;;;;;;;;;;;
|
||||
FE44;;Qm;;;;;;;;;;;;
|
||||
FF02;;Qm;;;;;;;;;;;;
|
||||
FF07;;Qm;;;;;;;;;;;;
|
||||
FF62;;Qm;;;;;;;;;;;;
|
||||
FF63;;Qm;;;;;;;;;;;;
|
|
@ -1,207 +0,0 @@
|
|||
#
|
||||
# $Id: UCDATAREADME.txt,v 1.1 1999-01-06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
|
||||
MUTT UCData Package 1.9
|
||||
-----------------------
|
||||
|
||||
This is a package that supports ctype-like operations for Unicode UCS-2 text
|
||||
(and surrogates), case mapping, and decomposition lookup. To use it, you will
|
||||
need to get the "UnicodeData-2.0.14.txt" (or later) file from the Unicode Web
|
||||
or FTP site.
|
||||
|
||||
This package consists of two parts:
|
||||
|
||||
1. A program called "ucgendat" which generates five data files from the
|
||||
UnicodeData-2.*.txt file. The files are:
|
||||
|
||||
A. case.dat - the case mappings.
|
||||
B. ctype.dat - the character property tables.
|
||||
C. decomp.dat - the character decompositions.
|
||||
D. cmbcl.dat - the non-zero combining classes.
|
||||
E. num.dat - the codes representing numbers.
|
||||
|
||||
2. The "ucdata.[ch]" files which implement the functions needed to
|
||||
check to see if a character matches groups of properties, to map between
|
||||
upper, lower, and title case, to look up the decomposition of a
|
||||
character, look up the combining class of a character, and get the number
|
||||
value of a character.
|
||||
|
||||
A short reference to the functions available is in the "api.txt" file.
|
||||
|
||||
Techie Details
|
||||
==============
|
||||
|
||||
The "ucgendat" program parses files from the command line which are all in the
|
||||
Unicode Character Database (UCDB) format. An additional properties file,
|
||||
"MUTTUCData.txt", provides some extra properties for some characters.
|
||||
|
||||
The program looks for the two character property fields (2 and 4), the
|
||||
combining class field (3), the decomposition field (5), the numeric value
|
||||
field (8), and the case mapping fields (12, 13, and 14). The decompositions
|
||||
are recursively expanded before being written out.
|
||||
|
||||
The decomposition table contains all the canonical decompositions. This means
|
||||
all decompositions that do not have tags such as "<compat>" or "<font>".
|
||||
|
||||
The data is almost all stored as unsigned longs (32-bits assumed) and the
|
||||
routines that load the data take care of endian swaps when necessary. This
|
||||
also means that surrogates (>= 0x10000) can be placed in the data files the
|
||||
"ucgendat" program parses.
|
||||
|
||||
The data is written as external files and broken into five parts so it can be
|
||||
selectively updated at runtime if necessary.
|
||||
|
||||
The data files currently generated from the "ucgendat" program total about 56K
|
||||
in size all together.
|
||||
|
||||
The format of the binary data files is documented in the "format.txt" file.
|
||||
|
||||
Mark Leisher <mleisher@crl.nmsu.edu>
|
||||
13 December 1998
|
||||
|
||||
CHANGES
|
||||
=======
|
||||
|
||||
Version 1.9
|
||||
-----------
|
||||
1. Fixed a problem with an incorrect amount of storage being allocated for the
|
||||
combining class nodes.
|
||||
|
||||
2. Fixed an invalid initialization in the number code.
|
||||
|
||||
3. Changed the Java template file formatting a bit.
|
||||
|
||||
4. Added tables and function for getting decompositions in the Java class.
|
||||
|
||||
Version 1.8
|
||||
-----------
|
||||
1. Fixed a problem with adding certain ranges.
|
||||
|
||||
2. Added two more macros for testing for identifiers.
|
||||
|
||||
3. Tested with the UnicodeData-2.1.5.txt file.
|
||||
|
||||
Version 1.7
|
||||
-----------
|
||||
1. Fixed a problem with looking up decompositions in "ucgendat."
|
||||
|
||||
Version 1.6
|
||||
-----------
|
||||
1. Added two new properties introduced with UnicodeData-2.1.4.txt.
|
||||
|
||||
2. Changed the "ucgendat.c" program a little to automatically align the
|
||||
property data on a 4-byte boundary when new properties are added.
|
||||
|
||||
3. Changed the "ucgendat.c" program to only generate canonical
|
||||
decompositions.
|
||||
|
||||
4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
|
||||
initial and final punctuation characters.
|
||||
|
||||
5. Minor additions and changes to the documentation.
|
||||
|
||||
Version 1.5
|
||||
-----------
|
||||
1. Changed all file open calls to include binary mode with "b" for DOS/WIN
|
||||
platforms.
|
||||
|
||||
2. Wrapped the unistd.h include so it won't be included when compiled under
|
||||
Win32.
|
||||
|
||||
3. Fixed a bad range check for hex digits in ucgendat.c.
|
||||
|
||||
4. Fixed a bad endian swap for combining classes.
|
||||
|
||||
5. Added code to make a number table and associated lookup functions.
|
||||
Functions added are ucnumber(), ucdigit(), and ucgetnumber(). The last
|
||||
function is to maintain compatibility with John Cowan's "uctype" package.
|
||||
|
||||
Version 1.4
|
||||
-----------
|
||||
1. Fixed a bug with adding a range.
|
||||
|
||||
2. Fixed a bug with inserting a range in order.
|
||||
|
||||
3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
|
||||
|
||||
4. Added the missing unload for the combining class data.
|
||||
|
||||
5. Fixed a bad macro placement in ucisweak().
|
||||
|
||||
Version 1.3
|
||||
-----------
|
||||
1. Bug with case mapping calculations fixed.
|
||||
|
||||
2. Bug with empty character property entries fixed.
|
||||
|
||||
3. Bug with incorrect type in the combining class lookup fixed.
|
||||
|
||||
4. Some corrections done to api.txt.
|
||||
|
||||
5. Bug in certain character property lookups fixed.
|
||||
|
||||
6. Added a character property table that records the defined characters.
|
||||
|
||||
7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
|
||||
|
||||
Version 1.2
|
||||
-----------
|
||||
1. Added code to ucgendat to generate a combining class table.
|
||||
|
||||
2. Fixed an endian problem with the byte count of decompositions.
|
||||
|
||||
3. Fixed some minor problems in the "format.txt" file.
|
||||
|
||||
4. Removed some bogus "Ss" values from MUTTUCData.txt file.
|
||||
|
||||
5. Added API function to get combining class.
|
||||
|
||||
6. Changed the open mode to "rb" so binary data files will be opened correctly
|
||||
on DOS/WIN as well as other platforms.
|
||||
|
||||
7. Added the "api.txt" file.
|
||||
|
||||
Version 1.1
|
||||
-----------
|
||||
1. Added ucisxdigit() which I overlooked.
|
||||
|
||||
2. Added UC_LT to the ucisalpha() macro which I overlooked.
|
||||
|
||||
3. Changed uciscntrl() to include UC_CF.
|
||||
|
||||
4. Added ucisocntrl() and ucfntcntrl() macros.
|
||||
|
||||
5. Added a ucisblank() which I overlooked.
|
||||
|
||||
6. Added missing properties to ucissymbol() and ucisnumber().
|
||||
|
||||
7. Added ucisgraph() and ucisprint().
|
||||
|
||||
8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
|
||||
characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
|
||||
mirroring.
|
||||
|
||||
9. Added another property called "Ss" which includes control characters
|
||||
traditionally seen as spaces in the isspace() macro.
|
||||
|
||||
10. Added a bunch of macros to be API compatible with John Cowan's package.
|
||||
|
||||
ACKNOWLEDGEMENTS
|
||||
================
|
||||
|
||||
Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
|
||||
missing things and giving me stuff, particularly a bunch of new macros.
|
||||
|
||||
Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
|
||||
various bugs.
|
||||
|
||||
Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
|
||||
out that file modes need to have "b" for DOS/WIN machines, pointing out
|
||||
unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
|
||||
|
||||
Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
|
||||
incomplete decompositions to be generated by the "ucgendat" program.
|
||||
|
||||
Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
|
||||
error and an initialization error.
|
|
@ -1,243 +0,0 @@
|
|||
#
|
||||
# $Id: format.txt,v 1.1 1999-01-06 01:46:03 ftang%netscape.com Exp $
|
||||
#
|
||||
|
||||
CHARACTER DATA
|
||||
==============
|
||||
|
||||
This package generates some data files that contain character properties useful
|
||||
for text processing.
|
||||
|
||||
CHARACTER PROPERTIES
|
||||
====================
|
||||
|
||||
The first data file is called "ctype.dat" and contains a compressed form of
|
||||
the character properties found in the Unicode Character Database (UCDB).
|
||||
Additional properties can be specified in limited UCDB format in another file
|
||||
to avoid modifying the original UCDB.
|
||||
|
||||
The following is a property name and code table to be used with the character
|
||||
data:
|
||||
|
||||
NAME CODE DESCRIPTION
|
||||
---------------------
|
||||
Mn 0 Mark, Non-Spacing
|
||||
Mc 1 Mark, Spacing Combining
|
||||
Me 2 Mark, Enclosing
|
||||
Nd 3 Number, Decimal Digit
|
||||
Nl 4 Number, Letter
|
||||
No 5 Number, Other
|
||||
Zs 6 Separator, Space
|
||||
Zl 7 Separator, Line
|
||||
Zp 8 Separator, Paragraph
|
||||
Cc 9 Other, Control
|
||||
Cf 10 Other, Format
|
||||
Cs 11 Other, Surrogate
|
||||
Co 12 Other, Private Use
|
||||
Cn 13 Other, Not Assigned
|
||||
Lu 14 Letter, Uppercase
|
||||
Ll 15 Letter, Lowercase
|
||||
Lt 16 Letter, Titlecase
|
||||
Lm 17 Letter, Modifier
|
||||
Lo 18 Letter, Other
|
||||
Pc 19 Punctuation, Connector
|
||||
Pd 20 Punctuation, Dash
|
||||
Ps 21 Punctuation, Open
|
||||
Pe 22 Punctuation, Close
|
||||
Po 23 Punctuation, Other
|
||||
Sm 24 Symbol, Math
|
||||
Sc 25 Symbol, Currency
|
||||
Sk 26 Symbol, Modifier
|
||||
So 27 Symbol, Other
|
||||
L 28 Left-To-Right
|
||||
R 29 Right-To-Left
|
||||
EN 30 European Number
|
||||
ES 31 European Number Separator
|
||||
ET 32 European Number Terminator
|
||||
AN 33 Arabic Number
|
||||
CS 34 Common Number Separator
|
||||
B 35 Block Separator
|
||||
S 36 Segment Separator
|
||||
WS 37 Whitespace
|
||||
ON 38 Other Neutrals
|
||||
Pi 47 Punctuation, Initial
|
||||
Pf 48 Punctuation, Final
|
||||
#
|
||||
# Implementation specific properties.
|
||||
#
|
||||
Cm 39 Composite
|
||||
Nb 40 Non-Breaking
|
||||
Sy 41 Symmetric (characters which are part of open/close pairs)
|
||||
Hd 42 Hex Digit
|
||||
Qm 43 Quote Mark
|
||||
Mr 44 Mirroring
|
||||
Ss 45 Space, Other (controls viewed as spaces in ctype isspace())
|
||||
Cp 46 Defined character
|
||||
|
||||
The actual binary data is formatted as follows:
|
||||
|
||||
Assumptions: unsigned short is at least 16-bits in size and unsigned long
|
||||
is at least 32-bits in size.
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short OffsetArraySize
|
||||
unsigned long Bytes
|
||||
unsigned short Offsets[OffsetArraySize + 1]
|
||||
unsigned long Ranges[N], N = value of Offsets[OffsetArraySize]
|
||||
|
||||
The Bytes field provides the total byte count used for the Offsets[] and
|
||||
Ranges[] arrays. The Offsets[] array is aligned on a 4-byte boundary and
|
||||
there is always one extra node on the end to hold the final index of the
|
||||
Ranges[] array. The Ranges[] array contains pairs of 4-byte values
|
||||
representing a range of Unicode characters. The pairs are arranged in
|
||||
increasing order by the first character code in the range.
|
||||
|
||||
Determining if a particular character is in the property list requires a
|
||||
simple binary search to determine if a character is in any of the ranges
|
||||
for the property.
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
|
||||
machine with a different endian order and the values must be byte-swapped.
|
||||
|
||||
To swap a 16-bit value:
|
||||
c = (c >> 8) | ((c & 0xff) << 8)
|
||||
|
||||
To swap a 32-bit value:
|
||||
c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
|
||||
(((c >> 16) & 0xff) << 8) | (c >> 24)
|
||||
|
||||
CASE MAPPINGS
|
||||
=============
|
||||
|
||||
The next data file is called "case.dat" and contains three case mapping tables
|
||||
in the following order: upper, lower, and title case. Each table is in
|
||||
increasing order by character code and each mapping contains 3 unsigned longs
|
||||
which represent the possible mappings.
|
||||
|
||||
The format for the binary form of these tables is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumMappingNodes, count of all mapping nodes
|
||||
unsigned short CaseTableSizes[2], upper and lower mapping node counts
|
||||
unsigned long CaseTables[NumMappingNodes]
|
||||
|
||||
The starting indexes of the case tables are calculated as follows:
|
||||
|
||||
UpperIndex = 0;
|
||||
LowerIndex = CaseTableSizes[0] * 3;
|
||||
TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
|
||||
|
||||
The order of the fields for the three tables is:
|
||||
|
||||
Upper case
|
||||
----------
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
unsigned long title;
|
||||
|
||||
Lower case
|
||||
----------
|
||||
unsigned long lower;
|
||||
unsigned long upper;
|
||||
unsigned long title;
|
||||
|
||||
Title case
|
||||
----------
|
||||
unsigned long title;
|
||||
unsigned long upper;
|
||||
unsigned long lower;
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
Because the tables are in increasing order by character code, locating a
|
||||
mapping requires a simple binary search on one of the 3 codes that make up
|
||||
each node.
|
||||
|
||||
It is important to note that there can only be 65536 mapping nodes which
|
||||
divided into 3 portions allows 21845 nodes for each case mapping table. The
|
||||
distribution of mappings may be more or less than 21845 per table, but only
|
||||
65536 are allowed.
|
||||
|
||||
DECOMPOSITIONS
|
||||
==============
|
||||
|
||||
The next data file is called "decomp.dat" and contains the decomposition data
|
||||
for all characters with decompositions containing more than one character and
|
||||
that are *not* compatibility decompositions. Compatibility decompositions are
|
||||
signaled in the UCDB format by the use of the <compat> tag in the
|
||||
decomposition field. Each list of character codes represents a full
|
||||
decomposition of a composite character. The nodes are arranged in increasing
|
||||
order by character code.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumDecompNodes, count of all decomposition nodes
|
||||
unsigned long Bytes
|
||||
unsigned long DecompNodes[(NumDecompNodes * 2) + 1]
|
||||
unsigned long Decomp[N], N = sum of all counts in DecompNodes[]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The DecompNodes[] array consists of pairs of unsigned longs, the first of
|
||||
which is the character code and the second is the initial index of the list
|
||||
of character codes representing the decomposition.
|
||||
|
||||
Locating the decomposition of a composite character requires a binary search
|
||||
for a character code in the DecompNodes[] array and using its index to
|
||||
locate the start of the decomposition. The length of the decomposition list
|
||||
is the index in the following element in DecompNodes[] minus the current
|
||||
index.
|
||||
|
||||
COMBINING CLASSES
|
||||
=================
|
||||
|
||||
The fourth data file is called "cmbcl.dat" and contains the characters with
|
||||
non-zero combining classes.
|
||||
|
||||
The format for the binary form of this table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumCCLNodes
|
||||
unsigned long Bytes
|
||||
unsigned long CCLNodes[NumCCLNodes * 3]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The CCLNodes[] array consists of groups of three unsigned longs. The first
|
||||
and second are the beginning and ending of a range and the third is the
|
||||
combining class of that range.
|
||||
|
||||
If a character is not found in this table, then the combining class is
|
||||
assumed to be 0.
|
||||
|
||||
It is important to note that only 65536 distinct ranges plus combining class
|
||||
can be specified because the NumCCLNodes is usually a 16-bit number.
|
||||
|
||||
NUMBER TABLE
|
||||
============
|
||||
|
||||
The final data file is called "num.dat" and contains the characters that have
|
||||
a numeric value associated with them.
|
||||
|
||||
The format for the binary form of the table is:
|
||||
|
||||
unsigned short ByteOrderMark
|
||||
unsigned short NumNumberNodes
|
||||
unsigned long Bytes
|
||||
unsigned long NumberNodes[NumNumberNodes]
|
||||
unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
|
||||
/ sizeof(short)]
|
||||
|
||||
If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
|
||||
same way as described in the CHARACTER PROPERTIES section.
|
||||
|
||||
The NumberNodes array contains pairs of values, the first of which is the
|
||||
character code and the second an index into the ValueNodes array. The
|
||||
ValueNodes array contains pairs of integers which represent the numerator
|
||||
and denominator of the numeric value of the character. If the character
|
||||
happens to map to an integer, both the values in ValueNodes will be the
|
||||
same.
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче