case-folding.rb: perfect hash for case folding

* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
  lookup case folding table.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46269 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
nobu 2014-05-30 23:57:45 +00:00
Родитель 88eae35862
Коммит c39e659263
4 изменённых файлов: 1575 добавлений и 27 удалений

Просмотреть файл

@ -1,4 +1,7 @@
Sat May 31 08:55:58 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
Sat May 31 08:57:42 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
lookup case folding table.
* enc/unicode/case-folding.rb (print_table): merge non-locale and
locale tables, and reduce initializing loops.

Просмотреть файл

@ -71,8 +71,6 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
};
#include "enc/unicode/name2ctype.h"
typedef struct {
int n;
OnigCodePoint code[3];
@ -103,8 +101,22 @@ typedef struct {
CodePointList2 to;
} CaseUnfold_13_Type;
/*
 * Return one 7-bit group of C, selected by N:
 *   N == 0 -> bits 14..20, N == 1 -> bits 7..13, N == 2 -> bits 0..6.
 * The result is always in the range 0..127.
 */
static inline int
bits_of(const OnigCodePoint c, const int n)
{
    const int shift = 7 * (2 - n);

    return (c >> shift) & 0x7f;
}
/*
 * Compare two single code points.
 * Returns 1 when X and Y are the same value, 0 otherwise.
 */
static int
code1_equal(const OnigCodePoint x, const OnigCodePoint y)
{
    return x == y;
}
#include "enc/unicode/casefold.h"
#include "enc/unicode/name2ctype.h"
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
#define CODE_RANGES_NUM numberof(CodeRanges)
@ -228,7 +240,6 @@ static const struct st_hash_type type_code3_hash = {
};
static st_table* FoldTable; /* fold-1, fold-2, fold-3 */
static st_table* Unfold1Table;
static st_table* Unfold2Table;
static st_table* Unfold3Table;
@ -240,13 +251,6 @@ static int init_case_fold_table(void)
THREAD_ATOMIC_START;
FoldTable = st_init_numtable_with_size(FOLD_TABLE_SIZE);
if (ONIG_IS_NULL(FoldTable)) return ONIGERR_MEMORY;
for (i = 0; i < numberof(CaseFold_11_Table); i++) {
const CaseFold_11_Type *p = &CaseFold_11_Table[i];
st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to));
}
Unfold1Table = st_init_numtable_with_size(UNFOLD1_TABLE_SIZE);
if (ONIG_IS_NULL(Unfold1Table)) return ONIGERR_MEMORY;
@ -276,15 +280,7 @@ static int init_case_fold_table(void)
return 0;
}
/*
 * Look CODE up in FoldTable.
 * Returns the stored entry viewed as a CodePointList3 pointer when
 * onig_st_lookup() reports a hit (non-zero), and 0 when it does not.
 */
static inline const CodePointList3 *
onigenc_unicode_fold_lookup(OnigCodePoint code)
{
    st_data_t found;

    if (onig_st_lookup(FoldTable, (st_data_t)code, &found) == 0) {
        return 0;
    }
    return (const CodePointList3 *)found;
}
#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
static inline const CodePointList3 *
onigenc_unicode_unfold1_lookup(OnigCodePoint code)

Просмотреть файл

@ -13,21 +13,23 @@ class CaseFolding
end
def print_table_1(dest, data)
for k, v in data.sort
for k, v in data = data.sort
sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
dest.print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
end
data
end
def print_table(dest, type, data)
dest.print("static const #{type}_Type #{type}_Table[] = {\n")
i = 0
data.each do |n, d|
ret = data.inject([]) do |a, (n, d)|
dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
i += d.size
print_table_1(dest, d)
a.concat(print_table_1(dest, d))
end
dest.print("};\n\n")
ret
end
end
@ -76,6 +78,55 @@ class CaseFolding
self
end
# Generate C source for a gperf-based lookup function over +data+.
#
# The method pipes one keyword line per entry of +data+ into an external
# `gperf` process, then rewrites the C text gperf prints so that the hash
# and lookup functions take a code point argument ("const OnigCodePoint
# code") instead of a string, and so that the lookup function returns the
# address of "#{key}_Table[index].to".
#
# key::  name fragment used to build the generated function names
#        ("onigenc_unicode_#{key}_hash" / "..._lookup") and the C table
#        name "#{key}_Table".
# type:: C type name placed in the generated lookup function's return type
#        ("static const #{type} *").
# data:: enumerable of pairs whose first element is an Integer code point;
#        the position of each pair becomes the index stored for that key.
#
# Returns the (rewritten) gperf output as a String.
#
# NOTE(review): every rewrite below uses sub!/gsub!, which leave the text
# untouched when the pattern does not match; presumably the patterns are
# tuned to one gperf output layout -- confirm against the gperf version in use.
def lookup_hash(key, type, data)
hash = "onigenc_unicode_#{key}_hash"
lookup = "onigenc_unicode_#{key}_lookup"
# Command line as an argument array; -H / -N pass the function names above.
gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
argname = "code"
argdecl = "const OnigCodePoint #{argname}"
# Each code point is split into three groups of n bits; m is the n-bit mask.
n = 7
m = (1 << n) - 1
# Smallest and largest code point in data, used for the range check below.
min, max = data.map {|c, *|c}.minmax
src = IO.popen(gperf, "r+") {|f|
# gperf input: declarations section ("short"), then the keyword list.
f << "short\n%%\n"
data.each_with_index {|(k, _), i|
# Encode the three 7-bit groups of k as a 3-character "\xNN" string key.
ks = [(k >> n*2) & m, (k >> n) & m, (k) & m].map {|c| "\\x%.2x" % c}.join("")
# "::::" is a marker placed before the payload (a C comment with the code
# point, then the index i); a later gsub! keys on it to drop the string key.
f.printf "\"%s\", ::::/*0x%.4x*/ %d\n", ks, k, i
}
f << "%%\n"
# Close our writing end so gperf sees end of input, then collect its output.
f.close_write
f.read
}
# Rewrite the hash function: new parameter list, and every
# "(unsigned char)str[N]" access becomes "bits_of(code, N)".
src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
"#{name}(#{argdecl})\n{\n#{body}}"
}
# Rewrite the lookup function (matched together with its "const short *"
# return-type line, which is replaced by "static const #{type} *").
src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
name = $1
body = $2
# Prepend MIN_CODE_VALUE / MAX_CODE_VALUE to the first enum in the body.
body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
# Call the hash function with the single code argument (the literal "code"
# here is the same text as argname).
body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
# Empty wordlist slots become a bare -1.
body.gsub!(/\{"",-1}/, "-1")
# Populated slots: drop the string key and keep only what follows "::::".
body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
# Replace the "if (len ...)" test with a code point range check.
body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
v = nil
# Rewrite the block guarded by the MAX_HASH_VALUE test.
body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
pre = $1
indent = $2
s = $3
# Turn the "const char *NAME = wordlist[...].member" declaration into a short
# holding the table index. "key" inside the single-quoted replacement is
# literal text for the generated C, not this method's +key+ parameter.
s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
# v is the variable name captured by the sub! just above (nil if no match).
v = $1
# Replace the inner condition with an index-validity test plus a comparison
# of the argument against the table entry's "from" field.
s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
"#{pre}{#{s}\n#{indent}}"
}
# Return the address of the matched entry's "to" field.
body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
"static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
}
src
end
def display(dest)
# print the header
dest.print("/* DO NOT EDIT THIS FILE. */\n")
@ -85,7 +136,8 @@ class CaseFolding
# CaseFold + CaseFold_Locale
name = "CaseFold_11"
print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
data = print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
dest.print lookup_hash(name, "CodePointList3", data)
# print unfolding data
@ -102,8 +154,6 @@ class CaseFolding
print_table(dest, name, name=>unfold[2])
# table sizes
fold_table_size = fold.size + fold_locale.size
dest.printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
unfold1_table_size = unfold[0].size + unfold_locale[0].size
dest.printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
unfold2_table_size = unfold[1].size + unfold_locale[1].size

Разница между файлами не показана из-за своего большого размера Загрузить разницу