ruby/template/unicode_norm_gen.tmpl

%# -*- mode: ruby; coding: utf-8 -*-
<%
# Copyright Ayumu Nojima (野島 歩) and Martin J. Dürst (duerst@it.aoyama.ac.jp)

# Script to generate Ruby data structures used in implementing
# String#unicode_normalize,...

# Input directory holding the Unicode data files. It is taken from the first
# command line argument and defaults to 'enc/unicode/data'.
InputDataDir = ARGV.fetch(0, 'enc/unicode/data')
# Unicode version taken from a trailing run of digits and dots in the
# directory name; nil when the name does not end that way.
unicode_version = InputDataDir.slice(/[\d.]+\z/)

# convenience methods
class Integer
  # Render this code point as text for use inside a double-quoted string
  # literal, taking legibility into account:
  # * above 0xFFFF: "\u{...}" with upper-case hex digits
  # * above 0x7f:   "\uXXXX" with upper-case hex digits, zero-padded to four
  # * otherwise:    the character itself, with a backslash put in front of
  #                 a backslash or a double quote
  def to_UTF8
    hex = to_s(16).upcase
    return "\\u{#{hex}}" if self > 0xFFFF
    return "\\u#{hex.rjust(4, '0')}" if self > 0x7f

    chr.sub(/[\\\"]/, "\\\\\\\&")
  end
end

module Enumerable
  # Fallback that is only defined when the running Ruby does not already
  # provide Enumerable#each_slice.
  unless method_defined?(:each_slice)
    # Yield the elements in arrays of n items; a shorter final array is
    # yielded when elements are left over. Returns self.
    def each_slice(n)
      chunk = []
      each do |item|
        chunk.push(item)
        next if chunk.size < n
        yield chunk
        chunk = []
      end
      yield chunk unless chunk.empty?
      self
    end
  end
end

class Array
  # Concatenate the to_UTF8 form of every element.
  def to_UTF8
    map { |cp| cp.to_UTF8 }.join('')
  end

  # Converts an array of Integers to character ranges for use inside a
  # regexp character class. The values are sorted, adjacent or repeated
  # values are merged into runs, and each run is rendered as a single
  # character, two characters, or "first-last". The rendered runs are
  # yielded joined together in groups of n.
  def each_regexp_chars(n = 8)
    runs = []
    sort.each do |cp|
      current = runs.last
      if current && current[1] + 1 >= cp
        current[1] = cp      # extend the run in progress
      else
        runs << [cp, cp]     # start a new run
      end
    end

    pieces = runs.map do |lo, hi|
      if hi == lo
        lo.to_UTF8
      elsif hi - lo == 1
        lo.to_UTF8 + hi.to_UTF8
      else
        lo.to_UTF8 + '-' + hi.to_UTF8
      end
    end

    pieces.each_slice(n) { |group| yield group.join('') }
  end
end

# read the file 'CompositionExclusions.txt'
# The first line of the file has to name the file together with its version
# ("# <basename>-<version><ext>"). That version must agree with the version
# taken from the data directory name, if there was one; otherwise it becomes
# unicode_version. The result is the list of code points (Integers) from the
# lines that start with a code point.
composition_exclusions = vpath.open("#{InputDataDir}/CompositionExclusions.txt") {|f|
  base = Regexp.quote(File.basename(f.path, '.*'))
  ext = Regexp.quote(File.extname(f.path))
  line = f.gets  # nil for an empty file; must still end in the abort below
  version = line && line[/^# *#{base}-([\d.]+)#{ext}\s*$/, 1] or
    abort "No file version in #{f.path}: #{line}"
  (unicode_version ||= version) == version or
    abort "Unicode version of directory (#{unicode_version}) and file (#{version}) mismatch"
  # String#hex reads the leading hex digits and ignores the rest of the line
  f.grep(/^[A-Z0-9]{4,5}/) {|code| code.hex}
}

decomposition_table = {}
kompatible_table = {}
combining_class = {}  # constant to allow use in Integer#to_UTF8

# read the file 'UnicodeData.txt'
vpath.foreach("#{InputDataDir}/UnicodeData.txt") do |line|
  codepoint, name, _2, char_class, _4, decomposition, *_rest = line.split(";")

  case decomposition
  when /^[0-9A-F]/
    decomposition_table[codepoint.hex] = decomposition.split(' ').collect {|w| w.hex}
  when /^</
    kompatible_table[codepoint.hex] = decomposition.split(' ')[1..-1].collect {|w| w.hex}
  end
  combining_class[codepoint.hex] = char_class.to_i if char_class != "0"

  if name=~/(First|Last)>$/ and (char_class!="0" or decomposition!="")
    warn "Unexpected: Character range with data relevant to normalization!"
  end
end

# calculate compositions from decompositions
# Drop every decomposition that must not be recomposed, then invert the
# table so that it maps a decomposition (array of code points) to the
# composed character.
composition_table = decomposition_table.reject {|character, decomposition|
  composition_exclusions.member?(character) ||  # predefined composition exclusion
    decomposition.length <= 1 ||                # Singleton Decomposition
    combining_class[character] ||               # character is not a Starter
    combining_class[decomposition.first]        # decomposition begins with a character that is not a Starter
}.invert

# recalculate composition_exclusions
# (every character with a canonical decomposition that is not the result of
# a composition)
composition_exclusions = decomposition_table.keys - composition_table.values

# characters with a combining class, plus the last character of each composition
accent_array = combining_class.keys + composition_table.keys.map(&:last)

# first character of each composition
composition_starters = composition_table.keys.map(&:first)

# every 28th code point from 0xAC00 up to 0xD7A3
hangul_no_trailing = 0xAC00.step(0xD7A3, 28).to_a

# expand decomposition table values
# Replace, in place, every code point that itself has a canonical
# decomposition. The index only advances past code points without one, so
# freshly inserted code points are examined again.
decomposition_table.each do |codepoint, sequence|
  index = 0
  until index >= sequence.length
    replacement = decomposition_table[sequence[index]]
    if replacement
      # work on a copy: avoid overwriting composition_table key
      sequence = sequence.dup
      decomposition_table[codepoint] = sequence
      sequence[index, 1] = replacement
    else
      index += 1
    end
  end
end

# deal with relationship between canonical and kompatibility decompositions
# A character whose canonical decomposition contains a code point with a
# kompatibility decomposition gets a kompatibility entry of its own.
decomposition_table.each do |codepoint, canonical|
  sequence = canonical.dup
  changed = false
  index = 0
  while index < sequence.length
    replacement = kompatible_table[sequence[index]]
    if replacement
      sequence[index, 1] = replacement
      changed = true
    else
      index += 1
    end
  end
  kompatible_table[codepoint] = sequence if changed
end

# Expand kompatibility decompositions until nothing changes any more.
# any? stops at the first entry that was rewritten (the assignment result is
# truthy), after which the scan starts over.
loop do
  rewritten = kompatible_table.any? do |codepoint, sequence|
    expanded = sequence.map {|cp| kompatible_table[cp] || cp}.flatten
    kompatible_table[codepoint] = expanded unless sequence == expanded
  end
  break unless rewritten
end

# generate normalization tables file
%># coding: us-ascii
%# >

# automatically generated by template/unicode_norm_gen.tmpl

module UnicodeNormalize
  UNICODE_VERSION = "<%=unicode_version%>".freeze

  accents = "" \
    "[<% accent_array.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]" \
  "".freeze
  ACCENTS = accents
  REGEXP_D_STRING = "#{''  # composition starters and composition exclusions
    }" \
    "[<% (composition_table.values+composition_exclusions).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]#{accents}*" \
    "|#{''  # characters that can be the result of a composition, except composition starters
    }" \
    "[<% (composition_starters-composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]?#{accents}+" \
    "|#{''  # precomposed Hangul syllables
    }" \
    "[\u{AC00}-\u{D7A4}]" \
  "".freeze
  REGEXP_C_STRING = "#{''  # composition exclusions
    }" \
    "[<% composition_exclusions.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]#{accents}*" \
    "|#{''  # composition starters and characters that can be the result of a composition
    }" \
    "[<% (composition_starters+composition_table.values).each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>]?#{accents}+" \
    "|#{''  # Hangul syllables with separate trailer
    }" \
    "[<% hangul_no_trailing.each_regexp_chars do |rx|%><%=rx%>" \
    "<% end%>][\u11A8-\u11C2]" \
    "|#{''  # decomposed Hangul syllables
    }" \
    "[\u1100-\u1112][\u1161-\u1175][\u11A8-\u11C2]?" \
  "".freeze
  REGEXP_K_STRING = "" \
    "[<% kompatible_table.keys.each_regexp_chars do |rx|%><%=rx%>" \
    "<%end%>]" \
  "".freeze

  class_table = {
% combining_class.each_slice(8)  do |slice|
   <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=><%=value%><%=%>,<% end%>
% end
  }
  class_table.default = 0
  CLASS_TABLE = class_table.freeze

  DECOMPOSITION_TABLE = {
% decomposition_table.each_slice(8) do |slice|
   <% slice.each do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
  }.freeze

  KOMPATIBLE_TABLE = {
% kompatible_table.each_slice(8)  do |slice|
   <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
   }.freeze

  COMPOSITION_TABLE = {
% composition_table.each_slice(8)  do |slice|
   <% slice.each  do |key, value|%> "<%=key.to_UTF8%>"=>"<%=value.to_UTF8%>".freeze<%=%>,<% end%>
% end
   }.freeze
end