#!/usr/bin/perl
#
# The contents of this file are subject to the Netscape Public
# License Version 1.1 (the "License"); you may not use this file
# except in compliance with the License. You may obtain a copy of
# the License at http://www.mozilla.org/NPL/
#
# Software distributed under the License is distributed on an "AS
# IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or
# implied. See the License for the specific language governing
# rights and limitations under the License.
#
# The Original Code is mozilla.org code.
#
# The Initial Developer of the Original Code is Netscape
# Communications Corporation. Portions created by Netscape are
# Copyright (C) 1999 Netscape Communications Corporation. All
# Rights Reserved.
#
# Contributor(s):
#

# NOTE(review): the statement below is incomplete in this copy of the
# file.  The assignment to $handcoded is cut off, and what follows it is
# only the tail of a routine: it folds $t with foldcombining(), returns
# rdecompose($table{$t}) from an inner block, and otherwise returns $dec.
# Presumably this is the end of rdecompose() -- TODO confirm against an
# intact copy and restore the missing text before running the script.
$handcoded = <//eg;
      $t = foldcombining($t);
      return rdecompose( $table{$t});
   }
   return $dec;
}

# decompose($removeprefix, $dec)
#
# Turns a decomposition string into the text written to the output
# table and returns it.
#
#   $removeprefix  text to delete from $dec; a single space is appended
#                  to it and the result is used as a regular expression
#                  (it is interpolated as-is, not quoted).
#   $dec           the decomposition string to convert.
#
# "0020" and "005C" are returned as the literal escapes \u0020 and
# \u005C.  Any other value goes through the rewrite steps commented
# below.
sub decompose
{
   my ($removeprefix, $dec) = (@_);
   $removeprefix .= " ";
   # Delete every occurrence of "<prefix> " from the string.
   $dec =~ s/$removeprefix//eg;
   if($dec eq "0020") {
      $dec = "\\u0020";
   } elsif($dec eq "005C") {
      $dec = "\\u005C";
   } else {
      # $k is not declared with "my", so it is a package variable.
      $k = "\/";
      # Replace the text "2044" with "/" (presumably U+2044 FRACTION
      # SLASH -- TODO confirm).
      $dec =~ s/2044/$k/eg;
      # Replace each group of four upper-case hex digits with whatever
      # rdecompose() returns for it.
      $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/rdecompose($1)/eg;
      # Prefix every four-hex-digit group that remains with \u.
      $dec =~ s/([0-9A-F][0-9A-F][0-9A-F][0-9A-F])/\\u$1/g;
      # Replace \u0000-\u007F escapes with the character itself.
      $dec =~ s/\\u00([0-7][0-9A-F])/pack("C",hex($1))/eg;
      # Remove all remaining spaces.
      $dec =~ s/ //eg;
   }
   return $dec;
}

######################################################################
#
# Open the unicode database file
#
######################################################################
# The same input file is opened twice, under two separate handles.
# The path is relative, so it is resolved against the current working
# directory.
# NOTE(review): the die messages do not include $!, so the actual
# reason for a failed open is not reported.
open ( UNICODATA , "< UnicodeData-Latest.txt")
   || die "cannot find UnicodeData-Latest.txt";
open ( UNICODATA2 , "< UnicodeData-Latest.txt")
   || die "cannot find UnicodeData-Latest.txt";

######################################################################
#
# Open the output file
#
######################################################################
open ( OUT , "> ../tables/transliterate.properties")
  || die "cannot open output ../tables/transliterate.properties file";

# The contents of $handcoded are written to the output first.
print OUT $handcoded;
######################################################################
#
# Process the file line by line
#
######################################################################

# First pass: for every record whose code point is above 256 and whose
# decomposition field ($f[5]) is not empty, remember that field in
# %table, keyed by the code point text ($f[0]).
#
# NOTE(review): the loop condition is empty in this copy of the file.
# Presumably a line read from one of the handles opened earlier
# (UNICODATA / UNICODATA2) was lost -- restore it before running.
while() {
   chomp;
   @f = split(/;/ , $_);
   # This used to be hex($u).  In the visible code $u is only assigned
   # in the second loop, so here it was still unset, $udec was always 0
   # and %table was never filled.  The code point of the current record
   # is $f[0].
   $udec = hex($f[0]);
   if(($udec > 256 ) && ($f[5] ne "")) {
      $table{$f[0]}=$f[5];
   }
}

# Second pass: write one entry per record through output(), or report
# the record through warning().
#
# NOTE(review): same empty loop condition as above.
while() {
   chomp;
   ######################################################################
   #
   # Get value from fields
   #
   ######################################################################
   @f = split(/;/ , $_);
   $u = $f[0];   # The unicode value
   $cmt = $f[1]; # The comment
   $dec = $f[5]; # The decomposed value
   # Presumably the three numeric-value fields of UnicodeData.txt --
   # TODO confirm against the data file format.
   $d1 = $f[6];
   $d2 = $f[7];
   $d3 = $f[8];
   $udec = hex($u);
   if($udec > 128)  { # not ASCII
      if($dec ne "") {
         # have decomposition
         #
         # NOTE(review): this stretch is damaged in this copy of the
         # file and is kept exactly as found:
         #  - every pattern in the chain below is empty (//), and every
         #    prefix passed to decompose() is "";
         #  - the braces do not balance: one opening "if (...) {" seems
         #    to be missing in front of the chain.  The later comment
         #    "decomposition without format code" suggests it tested for
         #    a format tag.  The chain is indented as if it were there.
         # Restore the lost text from an intact copy; do not run as is.
            if($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($dec =~ //) {
               # ignore non ASCII decomposition
               # warning($_);
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", "(".$dec.")"));
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($dec =~ //) {
               # warning($_);
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,"^(".&decompose("", $dec).")");
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,"v(".&decompose("", $dec).")");
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($dec =~ //) {
               # ignore
               # warning($_);
            } elsif($dec =~ //) {
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($dec =~ //) {
               # ignore
               # warning($_);
            } elsif($dec =~ //) {
               # ignore
               # warning($_);
            } elsif($dec =~ //) {
               # ignore
               # warning($_);
            } elsif($dec =~ //) {
               # ignore
               # warning($_);
            } elsif($dec =~ //) {
               if($dec eq " 0020") {
                  output($u,$cmt,$udec,"\\u0020");
               } else {
                  # ignore
                  # warning($_);
               }
            } else {
               warning($_);
            }
         } else {
            # decomposition without format code
            #
            # Only records whose comment contains LATIN produce output:
            # the decomposition is folded with foldcombining() and then
            # converted with decompose().  Every other branch is a
            # deliberate no-op; the commented-out warning() calls can be
            # re-enabled per script when needed.
            if($cmt =~ /LATIN/) {
               $dec = foldcombining($dec);
               output($u,$cmt,$udec,&decompose("", $dec));
            } elsif($cmt =~ /CYRILLIC/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /GREEK/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /ARABIC/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /CJK/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /HEBREW/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /DEVANAGARI/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /BENGALI/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /GURMUKHI/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /ORIYA/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /TAMIL/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /TELUGU/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /KANNADA/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /MALAYALAM/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /SINHALA/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /TIBETAN/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /MYANMAR/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /KATAKANA/) {
               # ignore
               # warning($_);
            } elsif($cmt =~ /HIRAGANA/) {
               # ignore
               # warning($_);
            } else {
               # ignore
               # warning($_);
            }
         }
      } else {
         # do not have decomposition
         if ($d1 ne "") {
            # are numeric characters
            output($u,$cmt,$udec,$d1);
         } elsif ($d2 ne "") {
            if($cmt =~ /CIRCLED/) {
               # circled
               output($u,$cmt,$udec,"(".$d2.")");
            } else {
               warning($_);
            }
         } elsif ($d3 ne "") {
            if($cmt =~ /CIRCLED/) {
               # circled
               output($u,$cmt,$udec,"(".$d3.")");
            } else {
               # others, use [ ]
               output($u,$cmt,$udec,"[".$d3."]");
            }
         } else {
            # not numeric characters
         }
         # end of no decomposition
      }
      # end of have/not decomposition
   }
}

######################################################################
#
# Close files
#
######################################################################
# This used to be close(UNIDATA), a handle that is never opened, so
# neither of the two input handles was closed explicitly.
close(UNICODATA);
close(UNICODATA2);
# Buffered write errors only show up at close time, so check the result.
close(OUT)
  || die "cannot close output ../tables/transliterate.properties file: $!";