# frozen_string_literal: true

require 'optparse'
require 'erb'
require 'fileutils'
require 'pp'

class Array
  unless [].respond_to? :product
    def product(*args)
      if args.empty?
        self.map {|e| [e] }
      else
        result = []
        self.each {|e0|
          result.concat args.first.product(*args[1..-1]).map {|es| [e0, *es] }
        }
        result
      end
    end
  end
end

class String
  unless "".respond_to? :start_with?
    def start_with?(*prefixes)
      prefixes.each {|prefix|
        return true if prefix.length <= self.length && prefix == self[0, prefix.length]
      }
      false
    end
  end
end

NUM_ELEM_BYTELOOKUP = 2

C_ESC = {
  "\\" => "\\\\",
  '"' => '\"',
  "\n" => '\n',
}

0x00.upto(0x1f) {|ch| C_ESC[[ch].pack("C")] ||= "\\%03o" % ch }
0x7f.upto(0xff) {|ch| C_ESC[[ch].pack("C")] = "\\%03o" % ch }
C_ESC_PAT = Regexp.union(*C_ESC.keys)

def c_esc(str)
  '"' + str.gsub(C_ESC_PAT) { C_ESC[$&] } + '"'
end

HEX2 = /(?:[0-9A-Fa-f]{2})/

class ArrayCode
  def initialize(type, name)
    @type = type
    @name = name
    @len = 0;
    @content = ''.dup
  end

  def length
    @len
  end

  def insert_at_last(num, str)
    # newnum = self.length + num
    @content << str
    @len += num
  end

  def to_s
    <<"End"
static const #{@type}
#{@name}[#{@len}] = {
#{@content}};
End
  end
end

class Action
  def initialize(value)
    @value = value
  end
  attr_reader :value

  def hash
    @value.hash
  end

  def eql?(other)
    self.class == other.class &&
    @value == other.value
  end
  alias == eql?
end

class Branch
  def initialize(byte_min, byte_max, child_tree)
    @byte_min = byte_min
    @byte_max = byte_max
    @child_tree = child_tree
    @hash = byte_min.hash ^ byte_max.hash ^ child_tree.hash
  end
  attr_reader :byte_min, :byte_max, :child_tree, :hash

  def eql?(other)
    self.class == other.class &&
    @hash == other.hash &&
    @byte_min == other.byte_min &&
    @byte_max == other.byte_max &&
    @child_tree == other.child_tree
  end
  alias == eql?
end

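# ActionMap turns a list of byte-sequence patterns (each mapped to an action)
# into a tree of Branch/Action nodes keyed on first-byte ranges, and can emit
# that tree as the C lookup tables consumed by Ruby's transcoding machinery.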
class ActionMap
  def self.parse_to_rects(mapping)
    rects = []
    n = 0
    mapping.each {|pat, action|
      pat = pat.to_s
      if /\A\s*\(empset\)\s*\z/ =~ pat
        next
      elsif /\A\s*\(empstr\)\s*\z/ =~ pat
        rects << ['', '', action]
        n += 1
      elsif /\A\s*(#{HEX2}+)\s*\z/o =~ pat
        hex = $1.upcase
        rects << [hex, hex, action]
      elsif /\A\s*((#{HEX2}|\{#{HEX2}(?:-#{HEX2})?(,#{HEX2}(?:-#{HEX2})?)*\})+(\s+|\z))*\z/o =~ pat
        pat = pat.upcase
        pat.scan(/\S+/) {
          pat1 = $&
          ranges_list = []
          pat1.scan(/#{HEX2}|\{([^\}]*)\}/o) {
            ranges_list << []
            if !$1
              ranges_list.last << [$&,$&]
            else
              set = {}
              $1.scan(/(#{HEX2})(?:-(#{HEX2}))?/o) {
                if !$2
                  c = $1.to_i(16)
                  set[c] = true
                else
                  b = $1.to_i(16)
                  e = $2.to_i(16)
                  b.upto(e) {|_| set[_] = true }
                end
              }
              i = nil
              0.upto(256) {|j|
                if set[j]
                  if !i
                    i = j
                  end
                  if !set[j+1]
                    ranges_list.last << ["%02X" % i, "%02X" % j]
                    i = nil
                  end
                end
              }
            end
          }
          first_ranges = ranges_list.shift
          first_ranges.product(*ranges_list).each {|range_list|
            min = range_list.map {|x, y| x }.join
            max = range_list.map {|x, y| y }.join
            rects << [min, max, action]
          }
        }
      else
        raise ArgumentError, "invalid pattern: #{pat.inspect}"
      end
    }
    rects
  end

  def self.unambiguous_action(actions0)
    actions = actions0.uniq
    if actions.length == 1
      actions[0]
    else
      actions.delete(:nomap0)
      if actions.length == 1
        actions[0]
      else
        raise ArgumentError, "ambiguous actions: #{actions0.inspect}"
      end
    end
  end

  def self.build_tree(rects)
    expand(rects) {|prefix, actions|
      unambiguous_action(actions)
    }
  end

  def self.parse(mapping)
    rects = parse_to_rects(mapping)
    tree = build_tree(rects)
    self.new(tree)
  end

  def self.merge_rects(*rects_list)
    if rects_list.length < 2
      raise ArgumentError, "not enough arguments"
    end

    all_rects = []
    rects_list.each_with_index {|rects, i|
      all_rects.concat rects.map {|min, max, action| [min, max, [i, action]] }
    }

    tree = expand(all_rects) {|prefix, actions|
      args = Array.new(rects_list.length) { [] }
      actions.each {|i, action|
        args[i] << action
      }
      yield(prefix, *args)
    }

    self.new(tree)
  end

  def self.merge(*mappings, &block)
    merge_rects(*mappings.map {|m| parse_to_rects(m) }, &block)
  end

  def self.merge2(map1, map2, &block)
    rects1 = parse_to_rects(map1)
    rects2 = parse_to_rects(map2)

    actions = []
    all_rects = []

    rects1.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    boundary = actions.length

    rects2.each {|rect|
      _, _, action = rect
      rect[2] = actions.length
      actions << action
      all_rects << rect
    }

    tree = expand(all_rects) {|prefix, as0|
      as1 = []
      as2 = []
      as0.each {|i|
        if i < boundary
          as1 << actions[i]
        else
          as2 << actions[i]
        end
      }
      yield(prefix, as1, as2)
    }

    self.new(tree)
  end

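  # expand partitions the rectangles into singletons (min == max) and regions,
  # then recursively splits them by first-byte range; the block is called once
  # per leaf prefix with the list of actions that apply there.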
  def self.expand(rects, &block)
    #numsing = numreg = 0
    #rects.each {|min, max, action| if min == max then numsing += 1 else numreg += 1 end }
    #puts "#{numsing} singleton mappings and #{numreg} region mappings."
    singleton_rects = []
    region_rects = []
    rects.each {|rect|
      min, max, = rect
      if min == max
        singleton_rects << rect
      else
        region_rects << rect
      end
    }
    @singleton_rects = singleton_rects.sort_by {|min, max, action| min }
    @singleton_rects.reverse!
    ret = expand_rec("", region_rects, &block)
    @singleton_rects = nil
    ret
  end

  TMPHASH = {}
  def self.expand_rec(prefix, region_rects, &block)
    return region_rects if region_rects.empty? && !((s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix))
    if region_rects.empty? ? s_rect[0].length == prefix.length : region_rects[0][0].empty?
      h = TMPHASH
      while (s_rect = @singleton_rects.last) && s_rect[0].start_with?(prefix)
        min, _, action = @singleton_rects.pop
        raise ArgumentError, "ambiguous pattern: #{prefix}" if min.length != prefix.length
        h[action] = true
      end
      for min, _, action in region_rects
        raise ArgumentError, "ambiguous pattern: #{prefix}" if !min.empty?
        h[action] = true
      end
      tree = Action.new(block.call(prefix, h.keys))
      h.clear
    else
      tree = []
      each_firstbyte_range(prefix, region_rects) {|byte_min, byte_max, r_rects2|
        if byte_min == byte_max
          prefix2 = prefix + "%02X" % byte_min
        else
          prefix2 = prefix + "{%02X-%02X}" % [byte_min, byte_max]
        end
        child_tree = expand_rec(prefix2, r_rects2, &block)
        tree << Branch.new(byte_min, byte_max, child_tree)
      }
    end
    return tree
  end

  def self.each_firstbyte_range(prefix, region_rects)
    index_from = TMPHASH

    region_ary = []
    region_rects.each {|min, max, action|
      raise ArgumentError, "ambiguous pattern: #{prefix}" if min.empty?
      min_firstbyte = min[0,2].to_i(16)
      min_rest = min[2..-1]
      max_firstbyte = max[0,2].to_i(16)
      max_rest = max[2..-1]
      region_ary << [min_firstbyte, max_firstbyte, [min_rest, max_rest, action]]
      index_from[min_firstbyte] = true
      index_from[max_firstbyte+1] = true
    }

    byte_from = Array.new(index_from.size)
    bytes = index_from.keys
    bytes.sort!
    bytes.reverse!
    bytes.each_with_index {|byte, i|
      index_from[byte] = i
      byte_from[i] = byte
    }

    region_rects_ary = Array.new(index_from.size) { [] }
    region_ary.each {|min_firstbyte, max_firstbyte, rest_elt|
      index_from[min_firstbyte].downto(index_from[max_firstbyte+1]+1) {|i|
        region_rects_ary[i] << rest_elt
      }
    }

    index_from.clear

    r_rects = region_rects_ary.pop
    region_byte = byte_from.pop
    prev_r_start = region_byte
    prev_r_rects = []
    while r_rects && (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      min_byte = singleton_byte < region_byte ? singleton_byte : region_byte
      if prev_r_start < min_byte && !prev_r_rects.empty?
        yield prev_r_start, min_byte-1, prev_r_rects
      end
      if region_byte < singleton_byte
        prev_r_start = region_byte
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
      elsif region_byte > singleton_byte
        yield singleton_byte, singleton_byte, prev_r_rects
        prev_r_start = singleton_byte+1
      else # region_byte == singleton_byte
        prev_r_start = region_byte+1
        prev_r_rects = r_rects
        r_rects = region_rects_ary.pop
        region_byte = byte_from.pop
        yield singleton_byte, singleton_byte, prev_r_rects
      end
    end

    while r_rects
      if prev_r_start < region_byte && !prev_r_rects.empty?
        yield prev_r_start, region_byte-1, prev_r_rects
      end
      prev_r_start = region_byte
      prev_r_rects = r_rects
      r_rects = region_rects_ary.pop
      region_byte = byte_from.pop
    end

    while (s_rect = @singleton_rects.last) && (seq = s_rect[0]).start_with?(prefix)
      singleton_byte = seq[prefix.length, 2].to_i(16)
      yield singleton_byte, singleton_byte, []
    end
  end

  def initialize(tree)
    @tree = tree
  end

  def inspect
    "\#<#{self.class}:" +
    @tree.inspect +
    ">"
  end

  def max_input_length_rec(tree)
    case tree
    when Action
      0
    else
      tree.map {|branch|
        max_input_length_rec(branch.child_tree)
      }.max + 1
    end
  end

  def max_input_length
    max_input_length_rec(@tree)
  end

  def empty_action
    if @tree.kind_of? Action
      @tree.value
    else
      nil
    end
  end

  OffsetsMemo = {}
  InfosMemo = {}

  def format_offsets(min, max, offsets)
    offsets = offsets[min..max]
    code = "%d, %d,\n" % [min, max]
    0.step(offsets.length-1,16) {|i|
      code << " "
      code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
      if i+8 < offsets.length
        code << " "
        code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
      end
      code << "\n"
    }
    code
  end

  UsedName = {}

  StrMemo = {}

  def str_name(bytes)
    size = @bytes_code.length
    rawbytes = [bytes].pack("H*")

    n = nil
    if !n && !(suf = rawbytes.gsub(/[^A-Za-z0-9_]/, '')).empty? && !UsedName[nn = "str1_" + suf] then n = nn end
    if !n && !UsedName[nn = "str1_" + bytes] then n = nn end
    n ||= "str1s_#{size}"

    StrMemo[bytes] = n
    UsedName[n] = true
    n
  end

  def gen_str(bytes)
    if n = StrMemo[bytes]
      n
    else
      len = bytes.length/2
      size = @bytes_code.length
      n = str_name(bytes)
      @bytes_code.insert_at_last(1 + len,
        "\#define #{n} makeSTR1(#{size})\n" +
        " makeSTR1LEN(#{len})," + bytes.gsub(/../, ' 0x\&,') + "\n\n")
      n
    end
  end

  def generate_info(info)
    case info
    when :nomap, :nomap0
      # :nomap0 is low priority. it never collides.
      "NOMAP"
    when :undef
      "UNDEF"
    when :invalid
      "INVALID"
    when :func_ii
      "FUNii"
    when :func_si
      "FUNsi"
    when :func_io
      "FUNio"
    when :func_so
      "FUNso"
    when /\A(#{HEX2})\z/o
      "o1(0x#$1)"
    when /\A(#{HEX2})(#{HEX2})\z/o
      "o2(0x#$1,0x#$2)"
    when /\A(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o3(0x#$1,0x#$2,0x#$3)"
    when /funsio\((\d+)\)/
      "funsio(#{$1})"
    when /\A(#{HEX2})(3[0-9])(#{HEX2})(3[0-9])\z/o
      "g4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(f[0-7])(#{HEX2})(#{HEX2})(#{HEX2})\z/o
      "o4(0x#$1,0x#$2,0x#$3,0x#$4)"
    when /\A(#{HEX2}){4,259}\z/o
      gen_str(info.upcase)
    when /\A\/\*BYTE_LOOKUP\*\// # pointer to BYTE_LOOKUP structure
      $'.to_s
    else
      raise "unexpected action: #{info.inspect}"
    end
  end

  def format_infos(infos)
    infos = infos.map {|info| generate_info(info) }
    maxlen = infos.map {|info| info.length }.max
    columns = maxlen <= 16 ? 4 : 2
    code = "".dup
    0.step(infos.length-1, columns) {|i|
      code << " "
      is = infos[i,columns]
      is.each {|info|
        code << sprintf(" %#{maxlen}s,", info)
      }
      code << "\n"
    }
    code
  end

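  # Emits one 256-entry lookup node: an offsets row appended to the byte array
  # and an infos row appended to the word array, both de-duplicated through
  # OffsetsMemo/InfosMemo so identical nodes share storage.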
  def generate_lookup_node(name, table)
    bytes_code = @bytes_code
    words_code = @words_code
    offsets = []
    infos = []
    infomap = {}
    min = max = nil
    table.each_with_index {|action, byte|
      action ||= :invalid
      if action != :invalid
        min = byte if !min
        max = byte
      end
      unless o = infomap[action]
        infomap[action] = o = infos.length
        infos[o] = action
      end
      offsets[byte] = o
    }
    infomap.clear
    if !min
      min = max = 0
    end

    offsets_key = [min, max, offsets[min..max]]
    if n = OffsetsMemo[offsets_key]
      offsets_name = n
    else
      offsets_name = "#{name}_offsets"
      OffsetsMemo[offsets_key] = offsets_name
      size = bytes_code.length
      bytes_code.insert_at_last(2+max-min+1,
        "\#define #{offsets_name} #{size}\n" +
        format_offsets(min,max,offsets) + "\n")
    end

    if n = InfosMemo[infos]
      infos_name = n
    else
      infos_name = "#{name}_infos"
      InfosMemo[infos] = infos_name

      size = words_code.length
      words_code.insert_at_last(infos.length,
        "\#define #{infos_name} WORDINDEX2INFO(#{size})\n" +
        format_infos(infos) + "\n")
    end

    size = words_code.length
    words_code.insert_at_last(NUM_ELEM_BYTELOOKUP,
      "\#define #{name} WORDINDEX2INFO(#{size})\n" +
      <<"End" + "\n")
    #{offsets_name},
    #{infos_name},
End
  end

  PreMemo = {}
  NextName = "a"

  def generate_node(name_hint=nil)
    if n = PreMemo[@tree]
      return n
    end

    table = Array.new(0x100, :invalid)
    @tree.each {|branch|
      byte_min, byte_max, child_tree = branch.byte_min, branch.byte_max, branch.child_tree
      rest = ActionMap.new(child_tree)
      if a = rest.empty_action
        table.fill(a, byte_min..byte_max)
      else
        name_hint2 = nil
        if name_hint
          name_hint2 = "#{name_hint}_#{byte_min == byte_max ? '%02X' % byte_min : '%02Xto%02X' % [byte_min, byte_max]}"
        end
        v = "/*BYTE_LOOKUP*/" + rest.gennode(@bytes_code, @words_code, name_hint2)
        table.fill(v, byte_min..byte_max)
      end
    }

    if !name_hint
      name_hint = "fun_" + NextName
      NextName.succ!
    end

    PreMemo[@tree] = name_hint

    generate_lookup_node(name_hint, table)
    name_hint
  end

  def gennode(bytes_code, words_code, name_hint=nil)
    @bytes_code = bytes_code
    @words_code = words_code
    name = generate_node(name_hint)
    @bytes_code = nil
    @words_code = nil
    return name
  end
end

def citrus_mskanji_cstomb(csid, index)
  case csid
  when 0
    index
  when 1
    index + 0x80
  when 2, 3
    row = index >> 8
    raise "invalid byte sequence" if row < 0x21
    if csid == 3
      if row <= 0x2F
        offset = (row == 0x22 || row >= 0x26) ? 0xED : 0xF0
      elsif row >= 0x4D && row <= 0x7E
        offset = 0xCE
      else
        raise "invalid byte sequence"
      end
    else
      raise "invalid byte sequence" if row > 0x97
      offset = (row < 0x5F) ? 0x81 : 0xC1
    end
    col = index & 0xFF
    raise "invalid byte sequence" if (col < 0x21 || col > 0x7E)

    row -= 0x21
    col -= 0x21
    if (row & 1) == 0
      col += 0x40
      col += 1 if (col >= 0x7F)
    else
      col += 0x9F;
    end
    row = row / 2 + offset
    (row << 8) | col
  end.to_s(16)
end

def citrus_euc_cstomb(csid, index)
  case csid
  when 0x0000
    index
  when 0x8080
    index | 0x8080
  when 0x0080
    index | 0x8E80
  when 0x8000
    index | 0x8F8080
  end.to_s(16)
end

def citrus_stateless_iso_cstomb(csid, index)
  (index | 0x8080 | (csid << 16)).to_s(16)
end

def citrus_cstomb(ces, csid, index)
  case ces
  when 'mskanji'
    citrus_mskanji_cstomb(csid, index)
  when 'euc'
    citrus_euc_cstomb(csid, index)
  when 'stateless_iso'
    citrus_stateless_iso_cstomb(csid, index)
  end
end

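# SUBDIR lists the directories searched for Citrus mapsrc files.
# citrus_decode_mapsrc reads the mapping entries between BEGIN_MAP and END_MAP
# in those files and returns them as an array of [from, to] pairs.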
SUBDIR = %w/APPLE AST BIG5 CNS CP EBCDIC EMOJI GB GEORGIAN ISO646 ISO-8859 JIS KAZAKH KOI KS MISC TCVN/


def citrus_decode_mapsrc(ces, csid, mapsrcs)
  table = []
  mapsrcs.split(',').each do |mapsrc|
    path = [$srcdir]
    mode = nil
    if mapsrc.rindex(/UCS(?:@[A-Z]+)?/, 0)
      mode = :from_ucs
      from = mapsrc[$&.size+1..-1]
      path << SUBDIR.find{|x| from.rindex(x, 0) }
    else
      mode = :to_ucs
      path << SUBDIR.find{|x| mapsrc.rindex(x, 0) }
    end
    if /\bUCS@(BMP|SMP|SIP|TIP|SSP)\b/ =~ mapsrc
      plane = {"BMP"=>0, "SMP"=>1, "SIP"=>2, "TIP"=>3, "SSP"=>14}[$1]
    else
      plane = 0
    end
    plane <<= 16
    path << mapsrc.gsub(':', '@')
    path = File.join(*path)
    path << ".src"
    path[path.rindex('/')] = '%'
    STDOUT.puts 'load mapsrc %s' % path if VERBOSE_MODE > 1
    File.open(path, 'rb') do |f|
      f.each_line do |l|
        break if /^BEGIN_MAP/ =~ l
      end
      f.each_line do |l|
        next if /^\s*(?:#|$)/ =~ l
        break if /^END_MAP/ =~ l
        case mode
        when :from_ucs
          case l
          when /0x(\w+)\s*-\s*0x(\w+)\s*=\s*INVALID/
            # Citrus OOB_MODE
          when /(0x\w+)\s*=\s*(0x\w+)/
            table.push << [plane | $1.hex, citrus_cstomb(ces, csid, $2.hex)]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        when :to_ucs
          case l
          when /(0x\w+)\s*=\s*(0x\w+)/
            table.push << [citrus_cstomb(ces, csid, $1.hex), plane | $2.hex]
          else
            raise "unknown notation '%s'"% l.chomp
          end
        end
      end
    end
  end
  return table
end

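# import_ucm parses an ICU-style .ucm file from the ucm/ source directory and
# returns two maps, [to_ucs, from_ucs], selected by the fallback indicator
# (|0, |1, |3) on each line.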
def import_ucm(path)
  to_ucs = []
  from_ucs = []
  File.foreach(File.join($srcdir, "ucm", path)) do |line|
    uc, bs, fb = nil
    if /^<U([0-9a-fA-F]+)>\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.hex
      bs = $2.delete('x\\')
      fb = $3.to_i
      next if uc < 128 && uc == bs.hex
    elsif /^([<U0-9a-fA-F>+]+)\s*([\+0-9a-fA-Fx\\]+)\s*\|(\d)/ =~ line
      uc = $1.scan(/[0-9a-fA-F]+>/).map(&:hex).pack("U*").unpack("H*")[0]
      bs = $2.delete('x\\')
      fb = $3.to_i
    end
    to_ucs << [bs, uc] if fb == 0 || fb == 3
    from_ucs << [uc, bs] if fb == 0 || fb == 1
  end
  [to_ucs, from_ucs]
end

def encode_utf8(map)
  r = []
  map.each {|k, v|
    # integer means UTF-8 encoded sequence.
    k = [k].pack("U").unpack("H*")[0].upcase if Integer === k
    v = [v].pack("U").unpack("H*")[0].upcase if Integer === v
    r << [k,v]
  }
  r
end

UnspecifiedValidEncoding = Object.new

def transcode_compile_tree(name, from, map, valid_encoding)
  map = encode_utf8(map)
  h = {}
  map.each {|k, v|
    h[k] = v unless h[k] # use first mapping
  }
  if valid_encoding.equal? UnspecifiedValidEncoding
    valid_encoding = ValidEncoding.fetch(from)
  end
  if valid_encoding
    am = ActionMap.merge2(h, {valid_encoding => :undef}) {|prefix, as1, as2|
      a1 = as1.empty? ? nil : ActionMap.unambiguous_action(as1)
      a2 = as2.empty? ? nil : ActionMap.unambiguous_action(as2)
      if !a2
        raise "invalid mapping: #{prefix}"
      end
      a1 || a2
    }
  else
    am = ActionMap.parse(h)
  end
  h.clear

  max_input = am.max_input_length
  defined_name = am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name)
  return defined_name, max_input
end

TRANSCODERS = []
TRANSCODE_GENERATED_TRANSCODER_CODE = ''.dup

def transcode_tbl_only(from, to, map, valid_encoding=UnspecifiedValidEncoding)
  if VERBOSE_MODE > 1
    if from.empty? || to.empty?
      STDOUT.puts "converter for #{from.empty? ? to : from}"
    else
      STDOUT.puts "converter from #{from} to #{to}"
    end
  end
  id_from = from.tr('^0-9A-Za-z', '_')
  id_to = to.tr('^0-9A-Za-z', '_')
  if from == "UTF-8"
    tree_name = "to_#{id_to}"
  elsif to == "UTF-8"
    tree_name = "from_#{id_from}"
  else
    tree_name = "from_#{id_from}_to_#{id_to}"
  end
  real_tree_name, max_input = transcode_compile_tree(tree_name, from, map, valid_encoding)
  return map, tree_name, real_tree_name, max_input
end

#
# call-seq:
#   transcode_tblgen(from_name, to_name, map [, valid_encoding_check [, ascii_compatibility]]) -> ''
#
# Returns an empty string just in case the result is used somewhere.
# Stores the actual product for later output with transcode_generated_code and
# transcode_register_code.
#
# The first argument is a string that will be used for the source (from) encoding.
# The second argument is a string that will be used for the target (to) encoding.
#
# The third argument is the actual data, a map represented as an array of two-element
# arrays. Each element of the array stands for one character being converted. The
# first element of each subarray is the code of the character in the source encoding,
# the second element of each subarray is the code of the character in the target encoding.
#
# Each code (i.e. byte sequence) is represented as a string of hexadecimal characters
# of even length. Codes can also be represented as integers (usually in the form 0x...),
# in which case they are interpreted as Unicode codepoints encoded in UTF-8. So as
# an example, 0x677E is the same as "E69DBE" (but somewhat easier to produce and check).
#
# In addition, the following symbols can also be used instead of actual codes in the
# second element of a subarray:
# :nomap (no mapping, just copy input to output), :nomap0 (same as :nomap, but low priority),
# :undef (input code undefined in the destination encoding),
# :invalid (input code is an invalid byte sequence in the source encoding),
# :func_ii, :func_si, :func_io, :func_so (conversion by function with specific call
# convention).
#
# The fourth argument specifies the overall structure of the encoding. For examples,
# see ValidEncoding below. This is used to cross-check the data in the third argument
# and to automatically add :undef and :invalid mappings where necessary.
#
# The fifth argument gives the ascii-compatibility of the transcoding. See
# rb_transcoder_asciicompat_type_t in transcode_data.h for details. In most
# cases, this argument can be left out.
#
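# A minimal usage sketch (the encoding name and byte values below are made up
# for illustration only; real call sites live in the enc/trans/*.trans templates):
#
#   transcode_tblgen("FOO-8BIT", "UTF-8",
#     [["{00-7f}", :nomap],   # ASCII bytes are copied through unchanged
#      ["80", 0x20AC],        # 0x80 maps to U+20AC, given as a codepoint
#      ["A0", "C2A0"]],       # 0xA0 maps to the UTF-8 bytes C2 A0
#     '{00-ff}')              # valid_encoding: every single byte is valid input
#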
def transcode_tblgen(from, to, map, valid_encoding=UnspecifiedValidEncoding,
                     ascii_compatibility='asciicompat_converter')
  map, tree_name, real_tree_name, max_input = transcode_tbl_only(from, to, map, valid_encoding)
  transcoder_name = "rb_#{tree_name}"
  TRANSCODERS << transcoder_name
  input_unit_length = UnitLength[from]
  max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
  transcoder_code = <<"End"
static const rb_transcoder
#{transcoder_name} = {
    #{c_esc from}, #{c_esc to}, #{real_tree_name},
    TRANSCODE_TABLE_INFO,
    #{input_unit_length}, /* input_unit_length */
    #{max_input}, /* max_input */
    #{max_output}, /* max_output */
    #{ascii_compatibility}, /* asciicompat_type */
    0, 0, 0, /* state_size, state_init, state_fini */
    0, 0, 0, 0,
    0, 0, 0
};
End
  TRANSCODE_GENERATED_TRANSCODER_CODE << transcoder_code
  ''
end

def transcode_generate_node(am, name_hint=nil)
  STDOUT.puts "converter for #{name_hint}" if VERBOSE_MODE > 1
  am.gennode(TRANSCODE_GENERATED_BYTES_CODE, TRANSCODE_GENERATED_WORDS_CODE, name_hint)
  ''
end

def transcode_generated_code
  TRANSCODE_GENERATED_BYTES_CODE.to_s +
  TRANSCODE_GENERATED_WORDS_CODE.to_s +
  "\#define TRANSCODE_TABLE_INFO " +
  "#{OUTPUT_PREFIX}byte_array, #{TRANSCODE_GENERATED_BYTES_CODE.length}, " +
  "#{OUTPUT_PREFIX}word_array, #{TRANSCODE_GENERATED_WORDS_CODE.length}, " +
  "((int)sizeof(unsigned int))\n" +
  TRANSCODE_GENERATED_TRANSCODER_CODE
end

def transcode_register_code
  code = ''.dup
  TRANSCODERS.each {|transcoder_name|
    code << " rb_register_transcoder(&#{transcoder_name});\n"
  }
  code
end

UnitLength = {
  'UTF-16BE' => 2,
  'UTF-16LE' => 2,
  'UTF-32BE' => 4,
  'UTF-32LE' => 4,
}
UnitLength.default = 1

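# Each ValidEncoding entry is a whitespace-separated list of byte-sequence
# patterns in the syntax accepted by ActionMap.parse_to_rects: a pair of hex
# digits matches one byte, and {xx-yy,..} matches a byte from the given ranges.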
ValidEncoding = {
  '1byte'    => '{00-ff}',
  '2byte'    => '{00-ff}{00-ff}',
  '4byte'    => '{00-ff}{00-ff}{00-ff}{00-ff}',
  'US-ASCII' => '{00-7f}',
  'UTF-8'    => '{00-7f}
                 {c2-df}{80-bf}
                 e0{a0-bf}{80-bf}
                 {e1-ec}{80-bf}{80-bf}
                 ed{80-9f}{80-bf}
                 {ee-ef}{80-bf}{80-bf}
                 f0{90-bf}{80-bf}{80-bf}
                 {f1-f3}{80-bf}{80-bf}{80-bf}
                 f4{80-8f}{80-bf}{80-bf}',
  'UTF-16BE' => '{00-d7,e0-ff}{00-ff}
                 {d8-db}{00-ff}{dc-df}{00-ff}',
  'UTF-16LE' => '{00-ff}{00-d7,e0-ff}
                 {00-ff}{d8-db}{00-ff}{dc-df}',
  'UTF-32BE' => '0000{00-d7,e0-ff}{00-ff}
                 00{01-10}{00-ff}{00-ff}',
  'UTF-32LE' => '{00-ff}{00-d7,e0-ff}0000
                 {00-ff}{00-ff}{01-10}00',
  'EUC-JP'   => '{00-7f}
                 {a1-fe}{a1-fe}
                 8e{a1-fe}
                 8f{a1-fe}{a1-fe}',
  'CP51932'  => '{00-7f}
                 {a1-fe}{a1-fe}
                 8e{a1-fe}',
  'EUC-JIS-2004' => '{00-7f}
                 {a1-fe}{a1-fe}
                 8e{a1-fe}
                 8f{a1-fe}{a1-fe}',
  'Shift_JIS' => '{00-7f}
                 {81-9f,e0-fc}{40-7e,80-fc}
                 {a1-df}',
  'EUC-KR'   => '{00-7f}
                 {a1-fe}{a1-fe}',
  'CP949'    => '{00-7f}
                 {81-fe}{41-5a,61-7a,81-fe}',
  'Big5'     => '{00-7f}
                 {81-fe}{40-7e,a1-fe}',
  'EUC-TW'   => '{00-7f}
                 {a1-fe}{a1-fe}
                 8e{a1-b0}{a1-fe}{a1-fe}',
  'GBK'      => '{00-80}
                 {81-fe}{40-7e,80-fe}',
  'GB18030'  => '{00-7f}
                 {81-fe}{40-7e,80-fe}
                 {81-fe}{30-39}{81-fe}{30-39}',
}

def ValidEncoding(enc)
  ValidEncoding.fetch(enc)
end

def set_valid_byte_pattern(encoding, pattern_or_label)
  pattern =
    if ValidEncoding[pattern_or_label]
      ValidEncoding[pattern_or_label]
    else
      pattern_or_label
    end
  if ValidEncoding[encoding] and ValidEncoding[encoding]!=pattern
    raise ArgumentError, "trying to change valid byte pattern for encoding #{encoding} from #{ValidEncoding[encoding]} to #{pattern}"
  end
  ValidEncoding[encoding] = pattern
end

# the following may be used in different places, so keep them here for the moment
set_valid_byte_pattern 'ASCII-8BIT', '1byte'
set_valid_byte_pattern 'Windows-31J', 'Shift_JIS'
set_valid_byte_pattern 'eucJP-ms', 'EUC-JP'

def make_signature(filename, src)
  "src=#{filename.dump}, len=#{src.length}, checksum=#{src.sum}"
end

if __FILE__ == $0
  start_time = Time.now

  output_filename = nil
  verbose_mode = 0
  force_mode = false

  op = OptionParser.new
  op.def_option("--help", "show help message") { puts op; exit 0 }
  op.def_option("--verbose", "verbose mode, twice for more verbose") { verbose_mode += 1 }
  op.def_option("--force", "force table generation") { force_mode = true }
  op.def_option("--output=FILE", "specify output file") {|arg| output_filename = arg }
  op.parse!

  VERBOSE_MODE = verbose_mode

  OUTPUT_FILENAME = output_filename
  OUTPUT_PREFIX = output_filename ? File.basename(output_filename)[/\A[A-Za-z0-9_]*/] : "".dup
  OUTPUT_PREFIX.sub!(/\A_+/, '')
  OUTPUT_PREFIX.sub!(/_*\z/, '_')

  TRANSCODE_GENERATED_BYTES_CODE = ArrayCode.new("unsigned char", "#{OUTPUT_PREFIX}byte_array")
  TRANSCODE_GENERATED_WORDS_CODE = ArrayCode.new("unsigned int", "#{OUTPUT_PREFIX}word_array")

  arg = ARGV.shift
  $srcdir = File.dirname(arg)
  $:.unshift $srcdir unless $:.include? $srcdir
  src = File.read(arg)
  src.force_encoding("ascii-8bit") if src.respond_to? :force_encoding
  this_script = File.read(__FILE__)
  this_script.force_encoding("ascii-8bit") if this_script.respond_to? :force_encoding

  base_signature = "/* autogenerated. */\n".dup
  base_signature << "/* #{make_signature(File.basename(__FILE__), this_script)} */\n"
  base_signature << "/* #{make_signature(File.basename(arg), src)} */\n"

  if !force_mode && output_filename && File.readable?(output_filename)
    old_signature = File.open(output_filename) {|f| f.gets("").chomp }
    chk_signature = base_signature.dup
    old_signature.each_line {|line|
      if %r{/\* src="([0-9a-z_.-]+)",} =~ line
        name = $1
        next if name == File.basename(arg) || name == File.basename(__FILE__)
        path = File.join($srcdir, name)
        if File.readable? path
          chk_signature << "/* #{make_signature(name, File.read(path))} */\n"
        end
      end
    }
    if old_signature == chk_signature
      now = Time.now
      File.utime(now, now, output_filename)
      STDOUT.puts "already up-to-date: #{output_filename}" if VERBOSE_MODE > 0
      exit
    end
  end

  if VERBOSE_MODE > 0
    if output_filename
      STDOUT.puts "generating #{output_filename} ..."
    end
  end

  libs1 = $".dup
  if ERB.instance_method(:initialize).parameters.assoc(:key) # Ruby 2.6+
    erb = ERB.new(src, trim_mode: '%')
  else
    erb = ERB.new(src, nil, '%')
  end
  erb.filename = arg
  erb_result = erb.result(binding)
  libs2 = $".dup

  libs = libs2 - libs1
  lib_sigs = ''.dup
  libs.each {|lib|
    lib = File.basename(lib)
    path = File.join($srcdir, lib)
    if File.readable? path
      lib_sigs << "/* #{make_signature(lib, File.read(path))} */\n"
    end
  }

  result = ''.dup
  result << base_signature
  result << lib_sigs
  result << "\n"
  result << erb_result
  result << "\n"

  if output_filename
    new_filename = output_filename + ".new"
    FileUtils.mkdir_p(File.dirname(output_filename))
    File.open(new_filename, "wb") {|f| f << result }
    File.rename(new_filename, output_filename)
    tms = Process.times
    elapsed = Time.now - start_time
    STDOUT.puts "done. (#{'%.2f' % tms.utime}user #{'%.2f' % tms.stime}system #{'%.2f' % elapsed}elapsed)" if VERBOSE_MODE > 1
  else
    print result
  end
end