add files from UCDATA 1.9

1999-01-06 01:46:32 +00:00 · 1999-01-06 01:46:32 +00:00 · 001455720b
--- a/modules/unicharutil/src/ucdata.c
+++ b/modules/unicharutil/src/ucdata.c
--- a/modules/unicharutil/src/ucdata.h
+++ b/modules/unicharutil/src/ucdata.h
@ -0,0 +1,306 @@
+/*
+ * Copyright 1996, 1997, 1998 Computing Research Labs,
+ * New Mexico State University
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+ * OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+ * THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _h_ucdata
+#define _h_ucdata
+
+/*
+ * $Id: ucdata.h,v 1.1 1999/01/06 01:46:32 ftang%netscape.com Exp $
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#undef __
+#ifdef __STDC__
+#define __(x) x /* __STDC__ defined: prototypes keep their parameter lists */
+#else
+#define __(x) () /* __STDC__ not defined: parameter lists are dropped */
+#endif
+
+#define UCDATA_VERSION "1.9" /* version of the UCData package */
+
+/**************************************************************************
+ *
+ * Masks and macros for character properties.
+ *
+ **************************************************************************/
+
+/*
+ * Values that can appear in the `mask1' parameter of the ucisprop()
+ * function.
+ *
+ * Each value is a single bit; bit N corresponds to property code N in the
+ * table in format.txt (Mn = 0 ... ES = 31).  UC_OS is the Surrogate
+ * category, listed as "Cs" in that table; the name UC_CS is used for the
+ * Common Number Separator bit of `mask2'.
+ */
+#define UC_MN 0x00000001 /* Mark, Non-Spacing          */
+#define UC_MC 0x00000002 /* Mark, Spacing Combining    */
+#define UC_ME 0x00000004 /* Mark, Enclosing            */
+#define UC_ND 0x00000008 /* Number, Decimal Digit      */
+#define UC_NL 0x00000010 /* Number, Letter             */
+#define UC_NO 0x00000020 /* Number, Other              */
+#define UC_ZS 0x00000040 /* Separator, Space           */
+#define UC_ZL 0x00000080 /* Separator, Line            */
+#define UC_ZP 0x00000100 /* Separator, Paragraph       */
+#define UC_CC 0x00000200 /* Other, Control             */
+#define UC_CF 0x00000400 /* Other, Format              */
+#define UC_OS 0x00000800 /* Other, Surrogate           */
+#define UC_CO 0x00001000 /* Other, Private Use         */
+#define UC_CN 0x00002000 /* Other, Not Assigned        */
+#define UC_LU 0x00004000 /* Letter, Uppercase          */
+#define UC_LL 0x00008000 /* Letter, Lowercase          */
+#define UC_LT 0x00010000 /* Letter, Titlecase          */
+#define UC_LM 0x00020000 /* Letter, Modifier           */
+#define UC_LO 0x00040000 /* Letter, Other              */
+#define UC_PC 0x00080000 /* Punctuation, Connector     */
+#define UC_PD 0x00100000 /* Punctuation, Dash          */
+#define UC_PS 0x00200000 /* Punctuation, Open          */
+#define UC_PE 0x00400000 /* Punctuation, Close         */
+#define UC_PO 0x00800000 /* Punctuation, Other         */
+#define UC_SM 0x01000000 /* Symbol, Math               */
+#define UC_SC 0x02000000 /* Symbol, Currency           */
+#define UC_SK 0x04000000 /* Symbol, Modifier           */
+#define UC_SO 0x08000000 /* Symbol, Other              */
+#define UC_L  0x10000000 /* Left-To-Right              */
+#define UC_R  0x20000000 /* Right-To-Left              */
+#define UC_EN 0x40000000 /* European Number            */
+#define UC_ES 0x80000000 /* European Number Separator  */
+
+/*
+ * Values that can appear in the `mask2' parameter of the ucisprop()
+ * function.
+ *
+ * These reuse the same bit values as the `mask1' constants, so they are only
+ * meaningful in the `mask2' position.  Bit N corresponds to property code
+ * 32 + N in the table in format.txt (ET = 32 ... Pf = 48).
+ */
+#define UC_ET 0x00000001 /* European Number Terminator */
+#define UC_AN 0x00000002 /* Arabic Number              */
+#define UC_CS 0x00000004 /* Common Number Separator    */
+#define UC_B  0x00000008 /* Block Separator            */
+#define UC_S  0x00000010 /* Segment Separator          */
+#define UC_WS 0x00000020 /* Whitespace                 */
+#define UC_ON 0x00000040 /* Other Neutrals             */
+/*
+ * Implementation specific character properties.
+ */
+#define UC_CM 0x00000080 /* Composite                  */
+#define UC_NB 0x00000100 /* Non-Breaking               */
+#define UC_SY 0x00000200 /* Symmetric                  */
+#define UC_HD 0x00000400 /* Hex Digit                  */
+#define UC_QM 0x00000800 /* Quote Mark                 */
+#define UC_MR 0x00001000 /* Mirroring                  */
+#define UC_SS 0x00002000 /* Space, other               */
+
+#define UC_CP 0x00004000 /* Defined                    */
+
+/*
+ * Added for UnicodeData-2.1.3.
+ */
+#define UC_PI 0x00008000 /* Punctuation, Initial       */
+#define UC_PF 0x00010000 /* Punctuation, Final         */
+
+/*
+ * This is the primary function for testing to see if a character has some set
+ * of properties.  The macros that test for various character properties all
+ * call this function with some set of masks.
+ *
+ * `code' is the character code to test.  `mask1' is an OR of the UC_MN
+ * through UC_ES values and `mask2' an OR of the UC_ET through UC_PF values;
+ * pass 0 for a mask that is not needed.  The macros below use the result as
+ * a truth value.
+ *
+ * NOTE(review): presumably a match on any one of the requested properties is
+ * sufficient, since the ucisalpha()-style macros OR together several general
+ * categories -- confirm against ucdata.c.
+ */
+extern int ucisprop __((unsigned long code, unsigned long mask1,
+                        unsigned long mask2));
+
+/*
+ * ctype-style tests built on ucisprop().  General-category bits go in the
+ * `mask1' argument; UC_SS, UC_HD, UC_PI and UC_PF are `mask2' bits and must
+ * go in the `mask2' argument.  The two sets share bit values, so a constant
+ * passed in the wrong position silently tests a different property (UC_SS
+ * has the same value as UC_CN, for example).
+ */
+#define ucisalpha(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT, 0)
+#define ucisdigit(cc) ucisprop(cc, UC_ND, 0)
+#define ucisalnum(cc) ucisprop(cc, UC_LU|UC_LL|UC_LM|UC_LO|UC_LT|UC_ND, 0)
+#define uciscntrl(cc) ucisprop(cc, UC_CC|UC_CF, 0)
+#define ucisspace(cc) ucisprop(cc, UC_ZS, UC_SS)
+#define ucisblank(cc) ucisprop(cc, UC_ZS, 0)
+#define ucispunct(cc) ucisprop(cc, UC_PD|UC_PS|UC_PE|UC_PO, UC_PI|UC_PF)
+#define ucisgraph(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
+                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
+                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
+                               UC_SO, UC_PI|UC_PF)
+#define ucisprint(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME|UC_ND|UC_NL|UC_NO|\
+                               UC_LU|UC_LL|UC_LT|UC_LM|UC_LO|UC_PC|UC_PD|\
+                               UC_PS|UC_PE|UC_PO|UC_SM|UC_SC|UC_SK|\
+                               UC_SO|UC_ZS, UC_PI|UC_PF)
+#define ucisupper(cc) ucisprop(cc, UC_LU, 0)
+#define ucislower(cc) ucisprop(cc, UC_LL, 0)
+#define ucistitle(cc) ucisprop(cc, UC_LT, 0)
+#define ucisxdigit(cc) ucisprop(cc, 0, UC_HD)
+
+#define ucisisocntrl(cc) ucisprop(cc, UC_CC, 0) /* Cc only */
+#define ucisfmtcntrl(cc) ucisprop(cc, UC_CF, 0) /* Cf only */
+
+#define ucissymbol(cc) ucisprop(cc, UC_SM|UC_SC|UC_SO|UC_SK, 0)
+#define ucisnumber(cc) ucisprop(cc, UC_ND|UC_NO|UC_NL, 0)
+#define ucisnonspacing(cc) ucisprop(cc, UC_MN, 0)
+#define ucisopenpunct(cc) ucisprop(cc, UC_PS, 0)
+#define ucisclosepunct(cc) ucisprop(cc, UC_PE, 0)
+#define ucisinitialpunct(cc) ucisprop(cc, 0, UC_PI) /* mask2 property */
+#define ucisfinalpunct(cc) ucisprop(cc, 0, UC_PF) /* mask2 property */
+
+#define uciscomposite(cc) ucisprop(cc, 0, UC_CM) /* mask2 property */
+#define ucishex(cc) ucisprop(cc, 0, UC_HD) /* same test as ucisxdigit() */
+#define ucisquote(cc) ucisprop(cc, 0, UC_QM) /* mask2 property */
+#define ucissymmetric(cc) ucisprop(cc, 0, UC_SY) /* mask2 property */
+#define ucismirroring(cc) ucisprop(cc, 0, UC_MR) /* mask2 property */
+#define ucisnonbreaking(cc) ucisprop(cc, 0, UC_NB) /* mask2 property */
+
+/*
+ * Directionality macros.
+ *
+ * UC_L, UC_R, UC_EN and UC_ES are `mask1' bits; the remaining weak types
+ * (UC_ET, UC_AN, UC_CS) and the neutral types (UC_B, UC_S, UC_WS, UC_ON)
+ * are `mask2' bits, which is why ucisweak() supplies both masks.
+ */
+#define ucisrtl(cc) ucisprop(cc, UC_R, 0)
+#define ucisltr(cc) ucisprop(cc, UC_L, 0)
+#define ucisstrong(cc) ucisprop(cc, UC_L|UC_R, 0)
+#define ucisweak(cc) ucisprop(cc, UC_EN|UC_ES, UC_ET|UC_AN|UC_CS)
+#define ucisneutral(cc) ucisprop(cc, 0, UC_B|UC_S|UC_WS|UC_ON)
+#define ucisseparator(cc) ucisprop(cc, 0, UC_B|UC_S)
+
+/*
+ * Other macros inspired by John Cowan.
+ *
+ * Each one tests general-category bits in `mask1' only.  ucisnsmark()
+ * performs the same test as ucisnonspacing().
+ */
+#define ucismark(cc) ucisprop(cc, UC_MN|UC_MC|UC_ME, 0)
+#define ucismodif(cc) ucisprop(cc, UC_LM, 0)
+#define ucisletnum(cc) ucisprop(cc, UC_NL, 0)
+#define ucisconnect(cc) ucisprop(cc, UC_PC, 0)
+#define ucisdash(cc) ucisprop(cc, UC_PD, 0)
+#define ucismath(cc) ucisprop(cc, UC_SM, 0)
+#define uciscurrency(cc) ucisprop(cc, UC_SC, 0)
+#define ucismodifsymbol(cc) ucisprop(cc, UC_SK, 0)
+#define ucisnsmark(cc) ucisprop(cc, UC_MN, 0)
+#define ucisspmark(cc) ucisprop(cc, UC_MC, 0)
+#define ucisenclosing(cc) ucisprop(cc, UC_ME, 0)
+#define ucisprivate(cc) ucisprop(cc, UC_CO, 0)
+#define ucissurrogate(cc) ucisprop(cc, UC_OS, 0)
+#define ucislsep(cc) ucisprop(cc, UC_ZL, 0)
+#define ucispsep(cc) ucisprop(cc, UC_ZP, 0)
+
+/*
+ * Macros for testing for identifiers.  ucisidentstart() accepts the letter
+ * categories Lu, Ll, Lt and Lo plus letter numbers (Nl); ucisidentpart()
+ * additionally accepts Mn, Mc, Nd, Pc and Cf.
+ */
+#define ucisidentstart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL, 0)
+#define ucisidentpart(cc) ucisprop(cc, UC_LU|UC_LL|UC_LT|UC_LO|UC_NL|\
+                                   UC_MN|UC_MC|UC_ND|UC_PC|UC_CF, 0)
+
+/*
+ * UC_CP is the `mask2' bit that marks defined characters.  The negated form
+ * is parenthesized so that the macro always expands to a single expression.
+ */
+#define ucisdefined(cc) ucisprop(cc, 0, UC_CP)
+#define ucisundefined(cc) (!ucisprop(cc, 0, UC_CP))
+
+/*
+ * Other miscellaneous character property macros.
+ *
+ * These are plain range comparisons on the code value and do not call
+ * ucisprop().  The argument is evaluated more than once, so it should not
+ * have side effects.
+ */
+#define ucishan(cc) (((cc) >= 0x4e00 && (cc) <= 0x9fff) ||\
+                     ((cc) >= 0xf900 && (cc) <= 0xfaff))
+#define ucishangul(cc) ((cc) >= 0xac00 && (cc) <= 0xd7ff)
+
+/**************************************************************************
+ *
+ * Functions for case conversion.
+ *
+ * Each function takes a character code and returns a character code.
+ *
+ **************************************************************************/
+
+extern unsigned long uctoupper __((unsigned long code));
+extern unsigned long uctolower __((unsigned long code));
+extern unsigned long uctotitle __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for getting decompositions.
+ *
+ **************************************************************************/
+
+/*
+ * This routine determines if the code has a decomposition.  If it returns 0,
+ * there is no decomposition.  Any other value indicates a decomposition was
+ * returned.
+ *
+ * NOTE(review): `num' and `decomp' look like output parameters receiving the
+ * length and the start of the decomposition list -- confirm against
+ * ucdata.c.
+ */
+extern int ucdecomp __((unsigned long code, unsigned long *num,

+                        unsigned long **decomp));
+
+/*
+ * If the code is a Hangul syllable, this routine decomposes it into the array
+ * passed.  The array size should be at least 3.
+ *
+ * NOTE(review): `num' presumably receives the number of codes stored in
+ * `decomp' -- confirm against ucdata.c.
+ */
+extern int ucdecomp_hangul __((unsigned long code, unsigned long *num,
+                               unsigned long decomp[]));
+
+/**************************************************************************
+ *
+ * Functions for getting combining classes.
+ *
+ **************************************************************************/
+
+/*
+ * This will return the combining class for a character to be used with the
+ * Canonical Ordering algorithm.
+ *
+ * format.txt states that a character not found in the combining class table
+ * is assumed to have combining class 0.
+ */
+extern unsigned long uccombining_class __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for getting numbers and digits.
+ *
+ **************************************************************************/
+
+struct ucnumber {
+    int numerator; /* numerator of the character's numeric value */
+    int denominator; /* denominator of the character's numeric value */
+};
+
+extern int ucnumber_lookup __((unsigned long code, struct ucnumber *num));
+extern int ucdigit_lookup __((unsigned long code, int *digit));
+
+/*
+ * For compatibility with John Cowan's "uctype" package.
+ *
+ * These return a struct ucnumber or an int by value instead of taking a
+ * pointer argument like the *_lookup() functions.
+ */
+extern struct ucnumber ucgetnumber __((unsigned long code));
+extern int ucgetdigit __((unsigned long code));
+
+/**************************************************************************
+ *
+ * Functions for library initialization and cleanup.
+ *
+ **************************************************************************/
+
+/*
+ * Macros for specifying the data tables to be loaded for ucdata_load().
+ *
+ * There is one flag per data table; OR them together to select several.
+ * UCDATA_ALL selects all five.
+ */
+#define UCDATA_CASE   0x01
+#define UCDATA_CTYPE  0x02
+#define UCDATA_DECOMP 0x04
+#define UCDATA_CMBCL  0x08
+#define UCDATA_NUM    0x10
+
+#define UCDATA_ALL (UCDATA_CASE|UCDATA_CTYPE|UCDATA_DECOMP|\
+                    UCDATA_CMBCL|UCDATA_NUM)
+
+/*
+ * Functions to load, unload, and reload specific data files.
+ *
+ * `mask' is an OR of the UCDATA_* flags above.
+ *
+ * NOTE(review): the expected format of `paths' is not visible from this
+ * header -- see ucdata.c.
+ */
+extern void ucdata_load __((char *paths, int mask));
+extern void ucdata_unload __((int mask));
+extern void ucdata_reload __((char *paths, int mask));
+
+/*
+ * Deprecated functions, now just compatibility macros.
+ *
+ * ucdata_setup(p) expands to ucdata_load(p, UCDATA_ALL) and
+ * ucdata_cleanup() expands to ucdata_unload(UCDATA_ALL), so both operate on
+ * every data table.
+ */
+#define ucdata_setup(p) ucdata_load(p, UCDATA_ALL)
+#define ucdata_cleanup() ucdata_unload(UCDATA_ALL)
+
+#undef __ /* the prototype helper is not exported from this header */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _h_ucdata */
--- a/modules/unicharutil/tools/MUTTUCData.txt
+++ b/modules/unicharutil/tools/MUTTUCData.txt
@ -0,0 +1,208 @@
+#
+# $Id: MUTTUCData.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
+#
+# Copyright 1996, 1997, 1998 Computing Research Labs,
+# New Mexico State University
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE COMPUTING RESEARCH LAB OR NEW MEXICO STATE UNIVERSITY BE LIABLE FOR ANY
+# CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT
+# OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR
+# THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+#
+# Implementation specific character properties.
+#
+#
+# Space, other.
+#
+0009;;Ss;;;;;;;;;;;;
+000A;;Ss;;;;;;;;;;;;
+000B;;Ss;;;;;;;;;;;;
+000C;;Ss;;;;;;;;;;;;
+000D;;Ss;;;;;;;;;;;;
+#
+# Non-breaking.
+#
+00A0;;Nb;;;;;;;;;;;;
+2007;;Nb;;;;;;;;;;;;
+2011;;Nb;;;;;;;;;;;;
+FEFF;;Nb;;;;;;;;;;;;
+#
+# Symmetric.
+#
+0028;;Sy;;;;;;;;;;;;
+0029;;Sy;;;;;;;;;;;;
+005B;;Sy;;;;;;;;;;;;
+005D;;Sy;;;;;;;;;;;;
+007B;;Sy;;;;;;;;;;;;
+007D;;Sy;;;;;;;;;;;;
+00AB;;Sy;;;;;;;;;;;;
+00BB;;Sy;;;;;;;;;;;;
+0F3A;;Sy;;;;;;;;;;;;
+0F3B;;Sy;;;;;;;;;;;;
+0F3C;;Sy;;;;;;;;;;;;
+0F3D;;Sy;;;;;;;;;;;;
+0F3E;;Sy;;;;;;;;;;;;
+0F3F;;Sy;;;;;;;;;;;;
+2018;;Sy;;;;;;;;;;;;
+2019;;Sy;;;;;;;;;;;;
+201A;;Sy;;;;;;;;;;;;
+201B;;Sy;;;;;;;;;;;;
+201C;;Sy;;;;;;;;;;;;
+201D;;Sy;;;;;;;;;;;;
+201E;;Sy;;;;;;;;;;;;
+201F;;Sy;;;;;;;;;;;;
+2039;;Sy;;;;;;;;;;;;
+203A;;Sy;;;;;;;;;;;;
+2045;;Sy;;;;;;;;;;;;
+2046;;Sy;;;;;;;;;;;;
+207D;;Sy;;;;;;;;;;;;
+207E;;Sy;;;;;;;;;;;;
+208D;;Sy;;;;;;;;;;;;
+208E;;Sy;;;;;;;;;;;;
+2329;;Sy;;;;;;;;;;;;
+232A;;Sy;;;;;;;;;;;;
+3008;;Sy;;;;;;;;;;;;
+3009;;Sy;;;;;;;;;;;;
+300A;;Sy;;;;;;;;;;;;
+300B;;Sy;;;;;;;;;;;;
+300C;;Sy;;;;;;;;;;;;
+300D;;Sy;;;;;;;;;;;;
+300E;;Sy;;;;;;;;;;;;
+300F;;Sy;;;;;;;;;;;;
+3010;;Sy;;;;;;;;;;;;
+3011;;Sy;;;;;;;;;;;;
+3014;;Sy;;;;;;;;;;;;
+3015;;Sy;;;;;;;;;;;;
+3016;;Sy;;;;;;;;;;;;
+3017;;Sy;;;;;;;;;;;;
+3018;;Sy;;;;;;;;;;;;
+3019;;Sy;;;;;;;;;;;;
+301A;;Sy;;;;;;;;;;;;
+301B;;Sy;;;;;;;;;;;;
+301D;;Sy;;;;;;;;;;;;
+301E;;Sy;;;;;;;;;;;;
+FD3E;;Sy;;;;;;;;;;;;
+FD3F;;Sy;;;;;;;;;;;;
+FE35;;Sy;;;;;;;;;;;;
+FE36;;Sy;;;;;;;;;;;;
+FE37;;Sy;;;;;;;;;;;;
+FE38;;Sy;;;;;;;;;;;;
+FE39;;Sy;;;;;;;;;;;;
+FE3A;;Sy;;;;;;;;;;;;
+FE3B;;Sy;;;;;;;;;;;;
+FE3C;;Sy;;;;;;;;;;;;
+FE3D;;Sy;;;;;;;;;;;;
+FE3E;;Sy;;;;;;;;;;;;
+FE3F;;Sy;;;;;;;;;;;;
+FE40;;Sy;;;;;;;;;;;;
+FE41;;Sy;;;;;;;;;;;;
+FE42;;Sy;;;;;;;;;;;;
+FE43;;Sy;;;;;;;;;;;;
+FE44;;Sy;;;;;;;;;;;;
+FE59;;Sy;;;;;;;;;;;;
+FE5A;;Sy;;;;;;;;;;;;
+FE5B;;Sy;;;;;;;;;;;;
+FE5C;;Sy;;;;;;;;;;;;
+FE5D;;Sy;;;;;;;;;;;;
+FE5E;;Sy;;;;;;;;;;;;
+FF08;;Sy;;;;;;;;;;;;
+FF09;;Sy;;;;;;;;;;;;
+FF3B;;Sy;;;;;;;;;;;;
+FF3D;;Sy;;;;;;;;;;;;
+FF5B;;Sy;;;;;;;;;;;;
+FF5D;;Sy;;;;;;;;;;;;
+FF62;;Sy;;;;;;;;;;;;
+FF63;;Sy;;;;;;;;;;;;
+#
+# Hex digit.
+#
+0030;;Hd;;;;;;;;;;;;
+0031;;Hd;;;;;;;;;;;;
+0032;;Hd;;;;;;;;;;;;
+0033;;Hd;;;;;;;;;;;;
+0034;;Hd;;;;;;;;;;;;
+0035;;Hd;;;;;;;;;;;;
+0036;;Hd;;;;;;;;;;;;
+0037;;Hd;;;;;;;;;;;;
+0038;;Hd;;;;;;;;;;;;
+0039;;Hd;;;;;;;;;;;;
+0041;;Hd;;;;;;;;;;;;
+0042;;Hd;;;;;;;;;;;;
+0043;;Hd;;;;;;;;;;;;
+0044;;Hd;;;;;;;;;;;;
+0045;;Hd;;;;;;;;;;;;
+0046;;Hd;;;;;;;;;;;;
+0061;;Hd;;;;;;;;;;;;
+0062;;Hd;;;;;;;;;;;;
+0063;;Hd;;;;;;;;;;;;
+0064;;Hd;;;;;;;;;;;;
+0065;;Hd;;;;;;;;;;;;
+0066;;Hd;;;;;;;;;;;;
+FF10;;Hd;;;;;;;;;;;;
+FF11;;Hd;;;;;;;;;;;;
+FF12;;Hd;;;;;;;;;;;;
+FF13;;Hd;;;;;;;;;;;;
+FF14;;Hd;;;;;;;;;;;;
+FF15;;Hd;;;;;;;;;;;;
+FF16;;Hd;;;;;;;;;;;;
+FF17;;Hd;;;;;;;;;;;;
+FF18;;Hd;;;;;;;;;;;;
+FF19;;Hd;;;;;;;;;;;;
+FF21;;Hd;;;;;;;;;;;;
+FF22;;Hd;;;;;;;;;;;;
+FF23;;Hd;;;;;;;;;;;;
+FF24;;Hd;;;;;;;;;;;;
+FF25;;Hd;;;;;;;;;;;;
+FF26;;Hd;;;;;;;;;;;;
+FF41;;Hd;;;;;;;;;;;;
+FF42;;Hd;;;;;;;;;;;;
+FF43;;Hd;;;;;;;;;;;;
+FF44;;Hd;;;;;;;;;;;;
+FF45;;Hd;;;;;;;;;;;;
+FF46;;Hd;;;;;;;;;;;;
+#
+# Quote marks.
+#
+0022;;Qm;;;;;;;;;;;;
+0027;;Qm;;;;;;;;;;;;
+00AB;;Qm;;;;;;;;;;;;
+00BB;;Qm;;;;;;;;;;;;
+2018;;Qm;;;;;;;;;;;;
+2019;;Qm;;;;;;;;;;;;
+201A;;Qm;;;;;;;;;;;;
+201B;;Qm;;;;;;;;;;;;
+201C;;Qm;;;;;;;;;;;;
+201D;;Qm;;;;;;;;;;;;
+201E;;Qm;;;;;;;;;;;;
+201F;;Qm;;;;;;;;;;;;
+2039;;Qm;;;;;;;;;;;;
+203A;;Qm;;;;;;;;;;;;
+300C;;Qm;;;;;;;;;;;;
+300D;;Qm;;;;;;;;;;;;
+300E;;Qm;;;;;;;;;;;;
+300F;;Qm;;;;;;;;;;;;
+301D;;Qm;;;;;;;;;;;;
+301E;;Qm;;;;;;;;;;;;
+301F;;Qm;;;;;;;;;;;;
+FE41;;Qm;;;;;;;;;;;;
+FE42;;Qm;;;;;;;;;;;;
+FE43;;Qm;;;;;;;;;;;;
+FE44;;Qm;;;;;;;;;;;;
+FF02;;Qm;;;;;;;;;;;;
+FF07;;Qm;;;;;;;;;;;;
+FF62;;Qm;;;;;;;;;;;;
+FF63;;Qm;;;;;;;;;;;;
--- a/modules/unicharutil/tools/UCDATAREADME.txt
+++ b/modules/unicharutil/tools/UCDATAREADME.txt
@ -0,0 +1,207 @@
+#
+# $Id: UCDATAREADME.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
+#
+
+                           MUTT UCData Package 1.9
+                           -----------------------
+
+This is a package that supports ctype-like operations for Unicode UCS-2 text
+(and surrogates), case mapping, and decomposition lookup.  To use it, you will
+need to get the "UnicodeData-2.0.14.txt" (or later) file from the Unicode Web
+or FTP site.
+
+This package consists of two parts:
+
+  1. A program called "ucgendat" which generates five data files from the
+     UnicodeData-2.*.txt file.  The files are:
+
+     A. case.dat   - the case mappings.
+     B. ctype.dat  - the character property tables.
+     C. decomp.dat - the character decompositions.
+     D. cmbcl.dat  - the non-zero combining classes.
+     E. num.dat    - the codes representing numbers.
+
+  2. The "ucdata.[ch]" files which implement the functions needed to
+     check to see if a character matches groups of properties, to map between
+     upper, lower, and title case, to look up the decomposition of a
+     character, look up the combining class of a character, and get the number
+     value of a character.
+
+A short reference to the functions available is in the "api.txt" file.
+
+Techie Details
+==============
+
+The "ucgendat" program parses files from the command line which are all in the
+Unicode Character Database (UCDB) format.  An additional properties file,
+"MUTTUCData.txt", provides some extra properties for some characters.
+
+The program looks for the two character properties fields (2 and 4), the
+combining class field (3), the decomposition field (5), the numeric value
+field (8), and the case mapping fields (12, 13, and 14).  The decompositions
+are recursively expanded before being written out.
+
+The decomposition table contains all the canonical decompositions.  This means
+all decompositions that do not have tags such as "<compat>" or "<font>".
+
+The data is almost all stored as unsigned longs (32-bits assumed) and the
+routines that load the data take care of endian swaps when necessary.  This
+also means that surrogates (>= 0x10000) can be placed in the data files the
+"ucgendat" program parses.
+
+The data is written as external files and broken into five parts so it can be
+selectively updated at runtime if necessary.
+
+The data files currently generated from the "ucgendat" program total about 56K
+in size all together.
+
+The format of the binary data files is documented in the "format.txt" file.
+
+Mark Leisher <mleisher@crl.nmsu.edu>
+13 December 1998
+
+CHANGES
+=======
+
+Version 1.9
+-----------
+1. Fixed a problem with an incorrect amount of storage being allocated for the
+   combining class nodes.
+
+2. Fixed an invalid initialization in the number code.
+
+3. Changed the Java template file formatting a bit.
+
+4. Added tables and function for getting decompositions in the Java class.
+
+Version 1.8
+-----------
+1. Fixed a problem with adding certain ranges.
+
+2. Added two more macros for testing for identifiers.
+
+3. Tested with the UnicodeData-2.1.5.txt file.
+
+Version 1.7
+-----------
+1. Fixed a problem with looking up decompositions in "ucgendat."
+
+Version 1.6
+-----------
+1. Added two new properties introduced with UnicodeData-2.1.4.txt.
+
+2. Changed the "ucgendat.c" program a little to automatically align the
+   property data on a 4-byte boundary when new properties are added.
+
+3. Changed the "ucgendat.c" programs to only generate canonical
+   decompositions.
+
+4. Added two new macros ucisinitialpunct() and ucisfinalpunct() to check for
+   initial and final punctuation characters.
+
+5. Minor additions and changes to the documentation.
+
+Version 1.5
+-----------
+1. Changed all file open calls to include binary mode with "b" for DOS/WIN
+   platforms.
+
+2. Wrapped the unistd.h include so it won't be included when compiled under
+   Win32.
+
+3. Fixed a bad range check for hex digits in ucgendat.c.
+
+4. Fixed a bad endian swap for combining classes.
+
+5. Added code to make a number table and associated lookup functions.
+   Functions added are ucnumber(), ucdigit(), and ucgetnumber().  The last
+   function is to maintain compatibility with John Cowan's "uctype" package.
+
+Version 1.4
+-----------
+1. Fixed a bug with adding a range.
+
+2. Fixed a bug with inserting a range in order.
+
+3. Fixed incorrectly specified ucisdefined() and ucisundefined() macros.
+
+4. Added the missing unload for the combining class data.
+
+5. Fixed a bad macro placement in ucisweak().
+
+Version 1.3
+-----------
+1. Bug with case mapping calculations fixed.
+
+2. Bug with empty character property entries fixed.
+
+3. Bug with incorrect type in the combining class lookup fixed.
+
+4. Some corrections done to api.txt.
+
+5. Bug in certain character property lookups fixed.
+
+6. Added a character property table that records the defined characters.
+
+7. Replaced ucisunknown() with ucisdefined() and ucisundefined().
+
+Version 1.2
+-----------
+1. Added code to ucgendat to generate a combining class table.
+
+2. Fixed an endian problem with the byte count of decompositions.
+
+3. Fixed some minor problems in the "format.txt" file.
+
+4. Removed some bogus "Ss" values from MUTTUCData.txt file.
+
+5. Added API function to get combining class.
+
+6. Changed the open mode to "rb" so binary data files will be opened correctly
+   on DOS/WIN as well as other platforms.
+
+7. Added the "api.txt" file.
+
+Version 1.1
+-----------
+1. Added ucisxdigit() which I overlooked.
+
+2. Added UC_LT to the ucisalpha() macro which I overlooked.
+
+3. Change uciscntrl() to include UC_CF.
+
+4. Added ucisisocntrl() and ucisfmtcntrl() macros.
+
+5. Added a ucisblank() which I overlooked.
+
+6. Added missing properties to ucissymbol() and ucisnumber().
+
+7. Added ucisgraph() and ucisprint().
+
+8. Changed the "Mr" property to "Sy" to mark this subset of mirroring
+   characters as symmetric to avoid trampling the Unicode/ISO10646 sense of
+   mirroring.
+
+9. Added another property called "Ss" which includes control characters
+   traditionally seen as spaces in the isspace() macro.
+
+10. Added a bunch of macros to be API compatible with John Cowan's package.
+
+ACKNOWLEDGEMENTS
+================
+
+Thanks go to John Cowan <cowan@locke.ccil.org> for pointing out lots of
+missing things and giving me stuff, particularly a bunch of new macros.
+
+Thanks go to Bob Verbrugge <bob_verbrugge@nl.compuware.com> for pointing out
+various bugs.
+
+Thanks go to Christophe Pierret <cpierret@businessobjects.com> for pointing
+out that file modes need to have "b" for DOS/WIN machines, pointing out
+unistd.h is not a Win 32 header, and pointing out a problem with ucisalnum().
+
+Thanks go to Kent Johnson <kent@pondview.mv.com> for finding a bug that caused
+incomplete decompositions to be generated by the "ucgendat" program.
+
+Thanks go to Valeriy E. Ushakov <uwe@ptc.spbu.ru> for spotting an allocation
+error and an initialization error.
--- a/modules/unicharutil/tools/format.txt
+++ b/modules/unicharutil/tools/format.txt
@ -0,0 +1,243 @@
+#
+# $Id: format.txt,v 1.1 1999/01/06 01:46:03 ftang%netscape.com Exp $
+#
+
+CHARACTER DATA
+==============
+
+This package generates some data files that contain character properties useful
+for text processing.
+
+CHARACTER PROPERTIES
+====================
+
+The first data file is called "ctype.dat" and contains a compressed form of
+the character properties found in the Unicode Character Database (UCDB).
+Additional properties can be specified in limited UCDB format in another file
+to avoid modifying the original UCDB.
+
+The following is a property name and code table to be used with the character
+data:
+
+NAME CODE DESCRIPTION
+---------------------
+Mn   0    Mark, Non-Spacing
+Mc   1    Mark, Spacing Combining
+Me   2    Mark, Enclosing
+Nd   3    Number, Decimal Digit
+Nl   4    Number, Letter
+No   5    Number, Other
+Zs   6    Separator, Space
+Zl   7    Separator, Line
+Zp   8    Separator, Paragraph
+Cc   9    Other, Control
+Cf   10   Other, Format
+Cs   11   Other, Surrogate
+Co   12   Other, Private Use
+Cn   13   Other, Not Assigned
+Lu   14   Letter, Uppercase
+Ll   15   Letter, Lowercase
+Lt   16   Letter, Titlecase
+Lm   17   Letter, Modifier
+Lo   18   Letter, Other
+Pc   19   Punctuation, Connector
+Pd   20   Punctuation, Dash
+Ps   21   Punctuation, Open
+Pe   22   Punctuation, Close
+Po   23   Punctuation, Other
+Sm   24   Symbol, Math
+Sc   25   Symbol, Currency
+Sk   26   Symbol, Modifier
+So   27   Symbol, Other
+L    28   Left-To-Right
+R    29   Right-To-Left
+EN   30   European Number
+ES   31   European Number Separator
+ET   32   European Number Terminator
+AN   33   Arabic Number
+CS   34   Common Number Separator
+B    35   Block Separator
+S    36   Segment Separator
+WS   37   Whitespace
+ON   38   Other Neutrals
+Pi   47   Punctuation, Initial
+Pf   48   Punctuation, Final
+#
+# Implementation specific properties.
+#
+Cm   39   Composite
+Nb   40   Non-Breaking
+Sy   41   Symmetric (characters which are part of open/close pairs)
+Hd   42   Hex Digit
+Qm   43   Quote Mark
+Mr   44   Mirroring
+Ss   45   Space, Other (controls viewed as spaces in ctype isspace())
+Cp   46   Defined character
+
+The actual binary data is formatted as follows:
+
+  Assumptions: unsigned short is at least 16-bits in size and unsigned long
+               is at least 32-bits in size.
+
+    unsigned short ByteOrderMark
+    unsigned short OffsetArraySize
+    unsigned long  Bytes
+    unsigned short Offsets[OffsetArraySize + 1]
+    unsigned long  Ranges[N], N = value of Offsets[OffsetArraySize]
+
+  The Bytes field provides the total byte count used for the Offsets[] and
+  Ranges[] arrays.  The Offsets[] array is aligned on a 4-byte boundary and
+  there is always one extra node on the end to hold the final index of the
+  Ranges[] array.  The Ranges[] array contains pairs of 4-byte values
+  representing a range of Unicode characters.  The pairs are arranged in
+  increasing order by the first character code in the range.
+
+  Determining if a particular character is in the property list requires a
+  simple binary search to determine if a character is in any of the ranges
+  for the property.
+
+  If the ByteOrderMark is equal to 0xFFFE, then the data was generated on a
+  machine with a different endian order and the values must be byte-swapped.
+
+  To swap a 16-bit value:
+     c = (c >> 8) | ((c & 0xff) << 8)
+
+  To swap a 32-bit value:
+     c = ((c & 0xff) << 24) | (((c >> 8) & 0xff) << 16) |
+         (((c >> 16) & 0xff) << 8) | (c >> 24)
+
+CASE MAPPINGS
+=============
+
+The next data file is called "case.dat" and contains three case mapping tables
+in the following order: upper, lower, and title case.  Each table is in
+increasing order by character code and each mapping contains 3 unsigned longs
+which represent the possible mappings.
+
+The format for the binary form of these tables is:
+
+  unsigned short ByteOrderMark
+  unsigned short NumMappingNodes, count of all mapping nodes
+  unsigned short CaseTableSizes[2], upper and lower mapping node counts
+  unsigned long  CaseTables[NumMappingNodes * 3]
+
+  The starting indexes of the case tables are calculated as follows:
+
+    UpperIndex = 0;
+    LowerIndex = CaseTableSizes[0] * 3;
+    TitleIndex = LowerIndex + CaseTableSizes[1] * 3;
+
+  The order of the fields for the three tables are:
+
+    Upper case
+    ----------
+    unsigned long upper;
+    unsigned long lower;
+    unsigned long title;
+
+    Lower case
+    ----------
+    unsigned long lower;
+    unsigned long upper;
+    unsigned long title;
+
+    Title case
+    ----------
+    unsigned long title;
+    unsigned long upper;
+    unsigned long lower;
+
+  If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+  same way as described in the CHARACTER PROPERTIES section.
+
+  Because the tables are in increasing order by character code, locating a
+  mapping requires a simple binary search on one of the 3 codes that make up
+  each node.
+
+  It is important to note that there can only be 65536 mapping nodes which
+  divided into 3 portions allows 21845 nodes for each case mapping table.  The
+  distribution of mappings may be more or less than 21845 per table, but only
+  65536 are allowed.
+
+DECOMPOSITIONS
+==============
+
+The next data file is called "decomp.dat" and contains the decomposition data
+for all characters whose decompositions contain more than one character and
+are *not* compatibility decompositions.  Compatibility decompositions are
+signaled in the UCDB format by the use of a tag such as <compat> or <font> in
+the decomposition field.  Each list of character codes represents a full
+decomposition of a composite character.  The nodes are arranged in increasing
+order by character code.
+
+The format for the binary form of this table is:
+
+  unsigned short ByteOrderMark
+  unsigned short NumDecompNodes, count of all decomposition nodes
+  unsigned long  Bytes
+  unsigned long  DecompNodes[(NumDecompNodes * 2) + 1]
+  unsigned long  Decomp[N], N = sum of all counts in DecompNodes[]
+
+  If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+  same way as described in the CHARACTER PROPERTIES section.
+
+  The DecompNodes[] array consists of pairs of unsigned longs, the first of
+  which is the character code and the second is the initial index of the list
+  of character codes representing the decomposition.
+
+  Locating the decomposition of a composite character requires a binary search
+  for a character code in the DecompNodes[] array and using its index to
+  locate the start of the decomposition.  The length of the decomposition list
+  is the index stored in the following element of DecompNodes[] minus the
+  current index.
+
+COMBINING CLASSES
+=================
+
+The fourth data file is called "cmbcl.dat" and contains the characters with
+non-zero combining classes.
+
+The format for the binary form of this table is:
+
+  unsigned short ByteOrderMark
+  unsigned short NumCCLNodes
+  unsigned long  Bytes
+  unsigned long  CCLNodes[NumCCLNodes * 3]
+
+  If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+  same way as described in the CHARACTER PROPERTIES section.
+
+  The CCLNodes[] array consists of groups of three unsigned longs.  The first
+  and second are the beginning and ending of a range and the third is the
+  combining class of that range.
+
+  If a character is not found in this table, then the combining class is
+  assumed to be 0.
+
+  It is important to note that only 65536 distinct ranges plus combining class
+  can be specified because the NumCCLNodes is usually a 16-bit number.
+
+NUMBER TABLE
+============
+
+The final data file is called "num.dat" and contains the characters that have
+a numeric value associated with them.
+
+The format for the binary form of the table is:
+
+  unsigned short ByteOrderMark
+  unsigned short NumNumberNodes
+  unsigned long  Bytes
+  unsigned long  NumberNodes[NumNumberNodes]
+  unsigned short ValueNodes[(Bytes - (NumNumberNodes * sizeof(unsigned long)))
+                            / sizeof(short)]
+
+  If the ByteOrderMark is equal to 0xFFFE, endian swapping is required in the
+  same way as described in the CHARACTER PROPERTIES section.
+
+  The NumberNodes array contains pairs of values, the first of which is the
+  character code and the second an index into the ValueNodes array.  The
+  ValueNodes array contains pairs of integers which represent the numerator
+  and denominator of the numeric value of the character.  If the character
+  happens to map to an integer, both the values in ValueNodes will be the
+  same.
--- a/modules/unicharutil/tools/ucgendat.c
+++ b/modules/unicharutil/tools/ucgendat.c