зеркало из https://github.com/mozilla/gecko-dev.git
Add a Perl script that reads the Unicode character database and generates the transliteration table
This commit is contained in:
Родитель
8b12386605
Коммит
c47b02290d
|
@ -0,0 +1,465 @@
|
|||
#!/usr/bin/perl
|
||||
#
|
||||
# The contents of this file are subject to the Netscape Public
|
||||
# License Version 1.1 (the "License"); you may not use this file
|
||||
# except in compliance with the License. You may obtain a copy of
|
||||
# the License at http://www.mozilla.org/NPL/
|
||||
#
|
||||
# Software distributed under the License is distributed on an "AS
|
||||
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
|
||||
# implied. See the License for the specific language governing
|
||||
# rights and limitations under the License.
|
||||
#
|
||||
# The Original Code is mozilla.org code.
|
||||
#
|
||||
# The Initial Developer of the Original Code is Netscape
|
||||
# Communications Corporation. Portions created by Netscape are
|
||||
# Copyright (C) 1999 Netscape Communications Corporation. All
|
||||
# Rights Reserved.
|
||||
#
|
||||
# Contributor(s):
|
||||
#
|
||||
|
||||
# Hand-maintained head of the generated transliterate.properties file: the
# license banner, the "generated file" notice, the entity list name and the
# mappings for the windows-1252 characters from the range 0x80 - 0x9F.
# The text is printed to the output file as-is, ahead of the generated
# entries.  The quoted terminator keeps the text literal (no interpolation).
$handcoded = <<'END_OF_HANDCODED';
##
## The contents of this file are subject to the Netscape Public
## License Version 1.1 (the "License"); you may not use this file
## except in compliance with the License. You may obtain a copy of
## the License at http://www.mozilla.org/NPL/
##
## Software distributed under the License is distributed on an "AS
## IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
## implied. See the License for the specific language governing
## rights and limitations under the License.
##
## The Original Code is mozilla.org code.
##
## The Initial Developer of the Original Code is Netscape
## Communications Corporation. Portions created by Netscape are
## Copyright (C) 1999 Netscape Communications Corporation. All
## Rights Reserved.
##
## Contributor(s):
##
## THIS FILE IS GENERATED BY mozilla/intl/unicharutil/tools/gentransliterate.pl
## PLEASE DO NOT MODIFY THIS FILE BY HAND
##
entity.list.name=transliterate
entity.169=(c)
#
#
# Here are the windows-1252 characters from the range 0x80 - 0x9F
#
# EURO SIGN
entity.8364=EUR
# SINGLE LOW-9 QUOTATION MARK
entity.8218=,
# LATIN SMALL LETTER F WITH HOOK
entity.402=f
# DOUBLE LOW-9 QUOTATION MARK
entity.8222="
# DAGGER
entity.8224=+
# DOUBLE DAGGER
entity.8225=++
# MODIFIER LETTER CIRCUMFLEX ACCENT
entity.710=^
# PER MILLE SIGN
entity.8240=0/00
# SINGLE LEFT-POINTING ANGLE QUOTATION MARK
entity.8249=<
# LATIN CAPITAL LIGATURE OE
entity.338=OE
# LEFT SINGLE QUOTATION MARK
entity.8216='
# RIGHT SINGLE QUOTATION MARK
entity.8217='
# LEFT DOUBLE QUOTATION MARK
entity.8220="
# RIGHT DOUBLE QUOTATION MARK
entity.8221="
# BULLET
entity.8226=.
# EN DASH
entity.8211=-
# EM DASH
entity.8212=--
# SMALL TILDE
entity.732=~
# SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
entity.8250=>
# LATIN SMALL LIGATURE OE
entity.339=oe
##
## End of hand coded section
## Below are generated from the unicode character database
##
END_OF_HANDCODED

# Decomposition table: maps a code point (the hex string in field 0 of a
# UnicodeData record) to its decomposition mapping (field 5).  It is filled
# while reading the database and looked up by rdecompose().  This used to
# initialise an unused array @table; the table is a hash everywhere it is
# actually used.
%table = ();
|
||||
# Build an ASCII approximation of a Latin letter from its Unicode name.
#
# $cmt is the character name, for example
# "LATIN SMALL LETTER C WITH CEDILLA AND ACUTE".  The result consists of:
#   - a leading apostrophe when the name contains "PRECEDED BY APOSTROPHE"
#     (this test used to be spelled "PRESEDED" and therefore never matched);
#   - the letter(s) named after "CAPITAL LETTER", then the letter(s) named
#     after "SMALL LETTER" converted to lower case;
#   - one ASCII mark for every recognised diacritic word, in the order the
#     words appear in the name.
# The example above yields "c,'".  Words without an entry in the mark table
# contribute nothing.  All working variables are lexical, so the caller's
# globals (notably @f) are no longer overwritten.
sub FromLatinComment
{
    my ($cmt) = (@_);

    # Diacritic word => ASCII stand-in.
    # NOTE(review): BREVE and CARON both map to "(" and CEDILLA and COMMA
    # both map to ","; kept exactly as before -- confirm this is intended.
    my %mark_for = (
        "DOT"        => ".",
        "DIAERESIS"  => "\"",
        "BREVE"      => "(",
        "ACUTE"      => "\'",
        "GRAVE"      => "`",
        "TILDE"      => "~",
        "CARON"      => "(",
        "HOOK"       => "?",
        "CEDILLA"    => ",",
        "MACRON"     => "-",
        "CIRCUMFLEX" => "^",
        "RING"       => "*",
        "OGONEK"     => ";",
        "LINE"       => "_",
        "COMMA"      => ",",
        "STROKE"     => "/",
        "HORN"       => "+",
    );

    my $char = "";
    if ($cmt =~ /PRECEDED BY APOSTROPHE/) {
        $char = "\'";
    }
    if ($cmt =~ /CAPITAL LETTER ([A-Z]*)/) {
        $char .= $1;
    }
    if ($cmt =~ /SMALL LETTER ([A-Z]*)/) {
        $char .= lc($1);
    }

    # Visit every word of the name; a false-valued word no longer ends the
    # scan early the way the old "while ($item = shift @f)" loop did.
    foreach my $word (split(/ /, $cmt)) {
        $char .= $mark_for{$word} if exists $mark_for{$word};
    }

    return $char;
}
|
||||
# Print a diagnostic line of the form "WARNING: <text> " followed by a
# newline.  The line goes to the currently selected output handle (plain
# print, no explicit handle).
sub warning
{
    my ($message) = @_;
    my $line = "WARNING: " . $message . " \n";
    print $line;
}
|
||||
# Print one table entry to the currently selected output handle instead of
# the OUT file: a "# U+<hex> <name>" comment line followed by an
# "entity.<decimal>=<replacement>" line.  No check of the replacement string
# is performed here.
sub doutput
{
    my ($hex_code, $name, $decimal_code, $replacement) = @_;
    my @entry = (
        "# U+$hex_code $name\n",
        "entity.$decimal_code=$replacement\n",
    );
    foreach my $line (@entry) {
        print $line;
    }
}
|
||||
# Write one transliteration entry to the OUT file handle.
#
#   $u    - code point as a hex string (used in the comment line)
#   $cmt  - character name
#   $udec - code point as a decimal number (used in the entity key)
#   $str  - proposed replacement string
#
# When decomposeIntoNonASCII() accepts $str as pure ASCII, the entry is
# written as a "# U+..." comment line plus an "entity.<n>=<str>" line.
# Otherwise the replacement is rejected; for names that contain both
# "LATIN" and "LETTER" but not "LONG", a second attempt is made with the
# string FromLatinComment() derives from the name.  Every other rejected
# character produces no entry at all.
# NOTE(review): the retry recurses until the derived string passes the
# check; presumably that always holds for FromLatinComment() results --
# confirm.
sub output
{
    my ($u, $cmt, $udec, $str) = (@_);

    # Accepted replacement: emit the entry and stop.
    if (!decomposeIntoNonASCII($str)) {
        print OUT "# U+$u $cmt\n";
        print OUT "entity.$udec=$str\n";
        return;
    }

    # Rejected replacement: only plain Latin letters get the name-based
    # fallback.
    return if $cmt !~ /LATIN/;
    return if $cmt !~ /LETTER/;
    return if $cmt =~ /LONG/;

    output($u, $cmt, $udec, FromLatinComment($cmt));
}
|
||||
|
||||
# Tell whether a replacement string still refers to a non-ASCII character.
#
# Replacement strings spell characters that could not be reduced to ASCII
# as "\uXXXX" escapes, so the string is non-ASCII exactly when it contains
# such an escape with a value of U+0080 or above.  The ASCII escapes
# "\u0020" and "\u005C" do not count.
#
# The previous test looked for any run of four hex digits with a value of
# 0080 or more, which also matched ordinary numeric replacements such as
# "[1000]" or "[5000]" and caused those entries to be dropped.
#
# Returns 1 when a non-ASCII escape is present, 0 otherwise (including for
# an undefined argument).
sub decomposeIntoNonASCII
{
    my ($dec) = (@_);
    return 0 unless defined $dec;
    return ($dec =~ /\\u(?:[1-9A-F][0-9A-F]{3}|0[1-9A-F][0-9A-F]{2}|00[8-9A-F][0-9A-F])/) ? 1 : 0;
}
|
||||
|
||||
# Replace spacing and combining diacritic code points in a decomposition
# string by the ASCII code point used to approximate them.
#
# $dec is a string of four-digit upper-case hex code points such as
# "0041 0301"; the folded string ("0041 0027" for the example) is
# returned.  Code points without an entry in the table are left untouched
# (for instance 00B0, 00B8, 0306, 0309, 030A and 030C, which were never
# folded).
#
# Only whole four-digit tokens are replaced, so the tail of a longer code
# such as "10300" is no longer rewritten, and the lookup table is lexical
# instead of a set of package globals.
sub foldcombining
{
    my ($dec) = (@_);

    my %ascii_for = (
        "00A8" => "0022",        # umlaut; should be 00A8 but we can only handle ASCII now
        "00AF" => "002D",        # overscore; should be 00AF but we can only handle ASCII now
        "00B4" => "0027",        # acute
        "00B7" => "002E",        # dot
        "0300" => "0060",        # grave
        "0301" => "0027",        # acute
        "0302" => "005E",        # hat
        "0303" => "007E",        # tilde
        "0304" => "002D",        # overscore
        "0305" => "002D",        # overscore
        "0307" => "002E",        # dot
        "0308" => "0022",        # umlaut
        "030B" => "0022",        # double acute
        "030D" => "0027",        # folded like acute
        "030E" => "0022",        # folded like double acute
        "030F" => "0060 0060",   # double grave
    );

    # None of the replacement values is itself a key, so one pass gives the
    # same result as the former chain of substitutions.
    $dec =~ s/\b([0-9A-F]{4})\b/exists $ascii_for{$1} ? $ascii_for{$1} : $1/eg;

    return $dec;
}
|
||||
# Recursively expand one code point through the decomposition table.
#
# $dec is a single code point as a hex string.  When %table holds a
# decomposition for it, the <tag> is stripped, diacritics are folded with
# foldcombining(), and every code point of the result is expanded in turn;
# the expanded code points are returned joined by single spaces.  A code
# point without a table entry is returned unchanged.
#
# The former version recursed on $table{$t}, i.e. it used the whole
# decomposition string as a table key, which can never exist, and it kept
# the intermediate string in a global that a recursive call would clobber.
sub rdecompose
{
    my ($dec) = (@_);
    return $dec unless defined($dec) && exists $table{$dec};

    my $mapping = $table{$dec};
    $mapping =~ s/<[a-zA-Z]*>//g;
    $mapping = foldcombining($mapping);

    return join(" ", map { rdecompose($_) } split(" ", $mapping));
}

# Turn a decomposition mapping into the replacement text of a table entry.
#
#   $removeprefix - the formatting tag to strip (for example "<compat>"),
#                   or "" when the mapping carries no tag
#   $dec          - the decomposition mapping, possibly wrapped in extra
#                   literal text by the caller
#
# The tag plus the space following it is removed (with an empty tag this
# removes every space, as before).  A mapping that is exactly 0020 or 005C
# is returned as the escape "\u0020" or "\u005C".  Otherwise 2044 becomes
# "/", each four-digit code point is expanded with rdecompose() and written
# as "\uXXXX", escapes in the range 00-7F are replaced by the character
# itself, and finally all spaces are dropped.
sub decompose
{
    my ($removeprefix, $dec) = (@_);

    # The tag is matched literally rather than being interpreted as a
    # regular expression.
    $removeprefix .= " ";
    $dec =~ s/\Q$removeprefix\E//g;

    return "\\u0020" if $dec eq "0020";
    return "\\u005C" if $dec eq "005C";

    $dec =~ s/2044/\//g;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
    $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
    $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
    $dec =~ s/ //g;

    return $dec;
}

######################################################################
#
# Open the unicode database file
#
######################################################################
# The database is opened twice: UNICODATA2 feeds the table-building pass
# below, UNICODATA feeds the main pass that writes the entries.
open(UNICODATA, "<", "UnicodeData-Latest.txt")
    or die "cannot find UnicodeData-Latest.txt: $!";

open(UNICODATA2, "<", "UnicodeData-Latest.txt")
    or die "cannot find UnicodeData-Latest.txt: $!";

######################################################################
#
# Open the output file
#
######################################################################
open(OUT, ">", "../tables/transliterate.properties")
    or die "cannot open output ../tables/transliterate.properties file: $!";

print OUT $handcoded;

######################################################################
#
# Process the file line by line
#
######################################################################
# First pass: remember the decomposition mapping (field 5) of every code
# point above 256 so that rdecompose() can expand nested decompositions.
# The code point is taken from field 0 of the current record; the previous
# code converted $u, which is not assigned until the main pass, so the
# condition was never true and the table stayed empty.
# NOTE(review): "> 256" leaves out U+0100 itself; threshold kept as it was
# -- confirm whether ">= 256" was meant.
while (my $record = <UNICODATA2>) {
    chomp $record;
    my ($code, $mapping) = (split(/;/, $record))[0, 5];
    next unless defined($code) && defined($mapping) && $mapping ne "";
    $table{$code} = $mapping if hex($code) > 256;
}
|
||||
# Rules for decompositions that carry a formatting tag, tried in this
# order.  Each rule is [ tag, handler ].  The handler receives the raw
# decomposition field and returns the replacement string to emit; a
# missing handler, or a handler returning undef, means the character is
# skipped without an entry.
my @tagged_rules = (
    [ "wide",     sub { return decompose("<wide>", $_[0]); } ],
    [ "narrow",   undef ],      # ignore non ASCII decomposition
    [ "circle",   sub { return decompose("<circle>", "(" . $_[0] . ")"); } ],
    [ "fraction", sub { return decompose("<fraction>", $_[0]); } ],
    [ "small",    sub { return decompose("<small>", $_[0]); } ],
    [ "vertical", undef ],
    [ "super",    sub { return "^(" . decompose("<super>", $_[0]) . ")"; } ],
    [ "sub",      sub { return "v(" . decompose("<sub>", $_[0]) . ")"; } ],
    [ "font",     sub { return decompose("<font>", $_[0]); } ],
    [ "square",   undef ],
    [ "compat",   sub { return decompose("<compat>", $_[0]); } ],
    [ "isolated", undef ],
    [ "medial",   undef ],
    [ "final",    undef ],
    [ "initial",  undef ],
    # Only the no-break space itself is mapped, to an escaped space.
    [ "noBreak",  sub { return $_[0] eq "<noBreak> 0020" ? "\\u0020" : undef; } ],
);

# Main pass: read the Unicode database record by record and write a
# transliteration entry for every character that can be approximated.
# chomp replaces the former chop so that a final record without a trailing
# newline does not lose its last character, and the record is kept in a
# named lexical instead of $_ because it is handed to warning() after
# other subroutines have run.
while (my $line = <UNICODATA>) {
    chomp $line;

    ######################################################################
    #
    # Get value from fields
    #
    ######################################################################
    my @field = split(/;/, $line);
    my $u   = $field[0];    # The unicode value
    my $cmt = $field[1];    # The comment (character name)
    # The decomposed value (field 5) and fields 6, 7 and 8; absent fields
    # are treated as empty.
    my ($dec, $d1, $d2, $d3) = map { defined($_) ? $_ : "" } @field[5 .. 8];
    my $udec = hex(defined($u) ? $u : "");

    # Only code points above 128 are considered.
    next unless $udec > 128;

    if ($dec ne "") {
        # have decomposition
        if ($dec =~ /</) {
            # formatted decomposition: find the first rule whose tag occurs
            my $rule;
            foreach my $candidate (@tagged_rules) {
                my $tag = $candidate->[0];
                if (index($dec, "<$tag>") >= 0) {
                    $rule = $candidate;
                    last;
                }
            }

            if (!defined($rule)) {
                # unknown tag
                warning($line);
            }
            elsif (defined($rule->[1])) {
                my $str = $rule->[1]->($dec);
                output($u, $cmt, $udec, $str) if defined($str);
            }
        }
        elsif ($cmt =~ /LATIN/) {
            # decomposition without format code: only Latin characters are
            # transliterated, every other script is ignored
            $dec = foldcombining($dec);
            output($u, $cmt, $udec, decompose("", $dec));
        }
    }
    elsif ($d1 ne "") {
        # do not have decomposition, are numeric characters
        output($u, $cmt, $udec, $d1);
    }
    elsif ($d2 ne "") {
        if ($cmt =~ /CIRCLED/) {
            # circled
            output($u, $cmt, $udec, "(" . $d2 . ")");
        }
        else {
            warning($line);
        }
    }
    elsif ($d3 ne "") {
        if ($cmt =~ /CIRCLED/) {
            # circled
            output($u, $cmt, $udec, "(" . $d3 . ")");
        }
        else {
            # others, use [ ]
            output($u, $cmt, $udec, "[" . $d3 . "]");
        }
    }
    # otherwise: not numeric characters, nothing to emit
}
|
||||
######################################################################
#
# Close files
#
######################################################################
# Close both read handles on the Unicode database.  The previous code
# closed "UNIDATA", a handle that is never opened, and left UNICODATA and
# UNICODATA2 open.
close(UNICODATA);
close(UNICODATA2);

# Buffered write errors only surface when the output handle is closed, so
# this close is checked.
close(OUT)
    || die "cannot close output ../tables/transliterate.properties file: $!";
|
||||
|
Загрузка…
Ссылка в новой задаче