ruby/template/unicode_norm_gen.tmpl

218 строки
6.5 KiB
Ruby

%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)
# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...
# Constants for input and output directories
# Directory containing the Unicode data files read by this script
# ('CompositionExclusions.txt' and 'UnicodeData.txt'); taken from the first
# command-line argument when given.
InputDataDir = ARGV[0] || 'enc/unicode/data'
# Output directory, taken from the second command-line argument when given.
# NOTE(review): the constant name is misspelled ("Ouput"); it is left as is
# because code outside this file may refer to it.
OuputDataDir = ARGV[1] || 'lib/unicode_normalize'
# convenience methods
class Integer
  # Renders this code point as text suitable for the inside of a double-quoted
  # Ruby string literal, taking legibility into account:
  # * up to 0x7f:        the character itself, with a backslash put in front
  #                      of a backslash or a double quote
  # * 0x80 up to 0xFFFF: \uXXXX with four upper-case hexadecimal digits
  # * above 0xFFFF:      \u{X...} with upper-case hexadecimal digits
  def to_UTF8
    return chr.sub(/[\\\"]/, "\\\\\\\&") unless self > 0x7f

    hex_digits = to_s(16).upcase
    if self > 0xFFFF
      "\\u{#{hex_digits}}"
    else
      "\\u#{hex_digits.rjust(4, '0')}"
    end
  end
end
module Enumerable
  # Fallback for Ruby versions that do not provide Enumerable#each_slice:
  # yields successive arrays of up to n elements and returns the receiver.
  # Nothing is defined when the method already exists.
  unless method_defined?(:each_slice)
    def each_slice(n)
      chunk = []
      each do |element|
        chunk << element
        next if chunk.size < n
        yield chunk
        chunk = []
      end
      # hand over the final, shorter chunk (if any)
      yield chunk unless chunk.empty?
      self
    end
  end
end
class Array
  # Concatenates the to_UTF8 form of every element.
  def to_UTF8
    map { |code| code.to_UTF8 }.join('')
  end

  # Converts an array of Integers to character ranges for use inside a
  # regexp character class. The values are sorted and runs of consecutive
  # (or repeated) values are merged; a single value is written as one
  # character, a run of two as both characters, and a longer run as
  # "first-last". The resulting pieces are yielded n at a time, joined
  # into one string.
  def each_regexp_chars(n = 8)
    runs = []
    sort.each do |code|
      current = runs.last
      if current && current[1] + 1 >= code
        current[1] = code # extend the run in progress
      else
        runs << [code, code]
      end
    end

    pieces = runs.map do |low, high|
      if high == low
        low.to_UTF8
      elsif high - low == 1
        low.to_UTF8 + high.to_UTF8
      else
        low.to_UTF8 + '-' + high.to_UTF8
      end
    end

    pieces.each_slice(n) { |group| yield group.join('') }
  end
end
# read the file 'CompositionExclusions.txt'
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
f.grep(/^[A-Z0-9]{4,5}/) {|line| line.hex}
}
decomposition_table = {}
kompatible_table = {}
combining_class = {} # constant to allow use in Integer#to_UTF8
# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")
case decomposition
when /^[0-9A-F]/
decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
when /^</
kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
end
combining_class[codepoint.hex] = char_class.to_i if char_class != "0"
if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
warn "Unexpected: Character range with data relevant to normalization!"
end
end
# Derive the compositions (decomposition => character) by inverting those
# canonical decompositions that are allowed to be recomposed
composable = decomposition_table.reject do |character, decomposition|
  composition_exclusions.include?(character) || # predefined composition exclusion
    decomposition.length <= 1 ||                 # Singleton Decomposition
    combining_class[character] ||                # character is not a Starter
    combining_class[decomposition.first]         # decomposition begins with a character that is not a Starter
end
composition_table = composable.invert

# recalculate composition_exclusions: every character that has a canonical
# decomposition but is not the result of an entry in composition_table
composition_exclusions = decomposition_table.keys - composition_table.values

composition_keys = composition_table.keys
# characters with a non-zero combining class, plus the last code point of
# every composable sequence
accent_array = combining_class.keys + composition_keys.map { |sequence| sequence.last }
# first code point of every composable sequence
composition_starters = composition_keys.map { |sequence| sequence.first }

# every 28th code point from U+AC00 through U+D7A3
hangul_no_trailing = 0xAC00.step(0xD7A3, 28).to_a
# expand decomposition table values: replace every code point that itself has
# a canonical decomposition, until each value is fully expanded
decomposition_table.each do |key, value|
position = 0
while position < value.length
# position is not advanced after a substitution, so the substituted code
# points are examined again and expanded further if necessary
if decomposition = decomposition_table[value[position]]
decomposition_table[key] = value = value.dup # avoid overwriting composition_table key
value[position, 1] = decomposition
else
position += 1
end
end
end
# deal with relationship between canonical and kompatibility decompositions:
# when a canonical decomposition contains code points that have a
# kompatibility decomposition, the character gets a kompatibility
# decomposition of its own with those code points substituted
decomposition_table.each do |key, value|
# work on a copy, so that the entry in decomposition_table stays unchanged
value = value.dup
expanded = false
position = 0
while position < value.length
if decomposition = kompatible_table[value[position]]
value[position, 1] = decomposition
expanded = true
else
position += 1
end
end
# only record an entry when at least one substitution took place
kompatible_table[key] = value if expanded
end
# expand kompatibility decompositions until nothing changes anymore: the block
# is truthy only when it has just rewritten an entry, which ends the current
# any? pass and makes the while loop start another one
while kompatible_table.any? {|key, value|
expanded = value.map {|v| kompatible_table[v] || v}.flatten
kompatible_table[key] = expanded unless value == expanded
}
end
# generate normalization tables file
%># coding: us-ascii
%# >
# automatically generated by template/unicode_norm_gen.tmpl
module UnicodeNormalize
accents = "" \
"[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]" \
"".freeze
ACCENTS = accents
REGEXP_D_STRING = "#{'' # composition starters and composition exclusions
}" \
"[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # characters that can be the result of a composition, except composition starters
}" \
"[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # precomposed Hangul syllables
}" \
"[\u{AC00}-\u{D7A4}]" \
"".freeze
REGEXP_C_STRING = "#{'' # composition exclusions
}" \
"[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]#{accents}*" \
"|#{'' # composition starters and characters that can be the result of a composition
}" \
"[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>]?#{accents}+" \
"|#{'' # Hangul syllables with separate trailer
}" \
"[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
"<% end%>][\u11A8-\u11C2]" \
"|#{'' # decomposed Hangul syllables
}" \
"[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
"".freeze
REGEXP_K_STRING = "" \
"[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
"<%end%>]" \
"".freeze
class_table = {
% combining_class.each_slice(8) do |slice|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
% end
}
class_table.default = 0
CLASS_TABLE = class_table.freeze
DECOMPOSITION_TABLE = {
% decomposition_table.each_slice(8) do |slice|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
}.freeze
KOMPATIBLE_TABLE = {
% kompatible_table.each_slice(8) do |slice|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
}.freeze
COMPOSITION_TABLE = {
% composition_table.each_slice(8) do |slice|
<% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
}.freeze
end