зеркало из https://github.com/github/ruby.git
case-folding.rb: perfect hash for case folding
* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to lookup case folding table. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@46269 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
Родитель
88eae35862
Коммит
c39e659263
|
@ -1,4 +1,7 @@
|
|||
Sat May 31 08:55:58 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
Sat May 31 08:57:42 2014 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
|
||||
* enc/unicode/case-folding.rb (lookup_hash): make perfect hash to
|
||||
lookup case folding table.
|
||||
|
||||
* enc/unicode/case-folding.rb (print_table): merge non-locale and
|
||||
locale tables, and reduce initializing loops.
|
||||
|
|
|
@ -71,8 +71,6 @@ static const unsigned short EncUNICODE_ISO_8859_1_CtypeTable[256] = {
|
|||
0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2, 0x30e2
|
||||
};
|
||||
|
||||
#include "enc/unicode/name2ctype.h"
|
||||
|
||||
typedef struct {
|
||||
int n;
|
||||
OnigCodePoint code[3];
|
||||
|
@ -103,8 +101,22 @@ typedef struct {
|
|||
CodePointList2 to;
|
||||
} CaseUnfold_13_Type;
|
||||
|
||||
static inline int
|
||||
bits_of(const OnigCodePoint c, const int n)
|
||||
{
|
||||
return (c >> (2 - n) * 7) & 127;
|
||||
}
|
||||
|
||||
static int
|
||||
code1_equal(const OnigCodePoint x, const OnigCodePoint y)
|
||||
{
|
||||
if (x != y) return 0;
|
||||
return 1;
|
||||
}
|
||||
|
||||
#include "enc/unicode/casefold.h"
|
||||
|
||||
#include "enc/unicode/name2ctype.h"
|
||||
|
||||
#define numberof(array) (int)(sizeof(array) / sizeof((array)[0]))
|
||||
#define CODE_RANGES_NUM numberof(CodeRanges)
|
||||
|
@ -228,7 +240,6 @@ static const struct st_hash_type type_code3_hash = {
|
|||
};
|
||||
|
||||
|
||||
static st_table* FoldTable; /* fold-1, fold-2, fold-3 */
|
||||
static st_table* Unfold1Table;
|
||||
static st_table* Unfold2Table;
|
||||
static st_table* Unfold3Table;
|
||||
|
@ -240,13 +251,6 @@ static int init_case_fold_table(void)
|
|||
|
||||
THREAD_ATOMIC_START;
|
||||
|
||||
FoldTable = st_init_numtable_with_size(FOLD_TABLE_SIZE);
|
||||
if (ONIG_IS_NULL(FoldTable)) return ONIGERR_MEMORY;
|
||||
for (i = 0; i < numberof(CaseFold_11_Table); i++) {
|
||||
const CaseFold_11_Type *p = &CaseFold_11_Table[i];
|
||||
st_add_direct(FoldTable, (st_data_t )p->from, (st_data_t )&(p->to));
|
||||
}
|
||||
|
||||
Unfold1Table = st_init_numtable_with_size(UNFOLD1_TABLE_SIZE);
|
||||
if (ONIG_IS_NULL(Unfold1Table)) return ONIGERR_MEMORY;
|
||||
|
||||
|
@ -276,15 +280,7 @@ static int init_case_fold_table(void)
|
|||
return 0;
|
||||
}
|
||||
|
||||
static inline const CodePointList3 *
|
||||
onigenc_unicode_fold_lookup(OnigCodePoint code)
|
||||
{
|
||||
st_data_t to;
|
||||
if (onig_st_lookup(FoldTable, (st_data_t)code, &to) != 0) {
|
||||
return (const CodePointList3 *)to;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
#define onigenc_unicode_fold_lookup onigenc_unicode_CaseFold_11_lookup
|
||||
|
||||
static inline const CodePointList3 *
|
||||
onigenc_unicode_unfold1_lookup(OnigCodePoint code)
|
||||
|
|
|
@ -13,21 +13,23 @@ class CaseFolding
|
|||
end
|
||||
|
||||
def print_table_1(dest, data)
|
||||
for k, v in data.sort
|
||||
for k, v in data = data.sort
|
||||
sk = (Array === k and k.length > 1) ? "{#{hex_seq(k)}}" : ("0x%04x" % k)
|
||||
dest.print(" {#{sk}, {#{v.length}, {#{hex_seq(v)}}}},\n")
|
||||
end
|
||||
data
|
||||
end
|
||||
|
||||
def print_table(dest, type, data)
|
||||
dest.print("static const #{type}_Type #{type}_Table[] = {\n")
|
||||
i = 0
|
||||
data.each do |n, d|
|
||||
ret = data.inject([]) do |a, (n, d)|
|
||||
dest.print("#define #{n} (*(#{type}_Type (*)[#{d.size}])(#{type}_Table+#{i}))\n")
|
||||
i += d.size
|
||||
print_table_1(dest, d)
|
||||
a.concat(print_table_1(dest, d))
|
||||
end
|
||||
dest.print("};\n\n")
|
||||
ret
|
||||
end
|
||||
end
|
||||
|
||||
|
@ -76,6 +78,55 @@ class CaseFolding
|
|||
self
|
||||
end
|
||||
|
||||
def lookup_hash(key, type, data)
|
||||
hash = "onigenc_unicode_#{key}_hash"
|
||||
lookup = "onigenc_unicode_#{key}_lookup"
|
||||
gperf = %W"gperf -7 -k1,2,3 -F,-1 -c -j1 -i1 -t -T -E -C -H #{hash} -N #{lookup}"
|
||||
argname = "code"
|
||||
argdecl = "const OnigCodePoint #{argname}"
|
||||
n = 7
|
||||
m = (1 << n) - 1
|
||||
min, max = data.map {|c, *|c}.minmax
|
||||
src = IO.popen(gperf, "r+") {|f|
|
||||
f << "short\n%%\n"
|
||||
data.each_with_index {|(k, _), i|
|
||||
ks = [(k >> n*2) & m, (k >> n) & m, (k) & m].map {|c| "\\x%.2x" % c}.join("")
|
||||
f.printf "\"%s\", ::::/*0x%.4x*/ %d\n", ks, k, i
|
||||
}
|
||||
f << "%%\n"
|
||||
f.close_write
|
||||
f.read
|
||||
}
|
||||
src.sub!(/^(#{hash})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
|
||||
name = $1
|
||||
body = $2
|
||||
body.gsub!(/\(unsigned char\)str\[(\d+)\]/, "bits_of(#{argname}, \\1)")
|
||||
"#{name}(#{argdecl})\n{\n#{body}}"
|
||||
}
|
||||
src.sub!(/const short *\*\n^(#{lookup})\s*\(.*?\).*?\n\{\n(.*)^\}/m) {
|
||||
name = $1
|
||||
body = $2
|
||||
body.sub!(/\benum\s+\{(\n[ \t]+)/, "\\&MIN_CODE_VALUE = 0x#{min.to_s(16)},\\1""MAX_CODE_VALUE = 0x#{max.to_s(16)},\\1")
|
||||
body.gsub!(/(#{hash})\s*\(.*?\)/, '\1(code)')
|
||||
body.gsub!(/\{"",-1}/, "-1")
|
||||
body.gsub!(/\{"(?:[^"]|\\")+", *::::(.*)\}/, '\1')
|
||||
body.sub!(/(\s+if\s)\(len\b.*\)/) {"#$1(code <= MAX_CODE_VALUE && code >= MIN_CODE_VALUE)"}
|
||||
v = nil
|
||||
body.sub!(/(if\s*\(.*MAX_HASH_VALUE.*\)\n([ \t]*))\{(.*?)\n\2\}/m) {
|
||||
pre = $1
|
||||
indent = $2
|
||||
s = $3
|
||||
s.sub!(/const char *\* *(\w+)( *= *wordlist\[\w+\]).\w+/, 'short \1 = wordlist[key]')
|
||||
v = $1
|
||||
s.sub!(/\bif *\(.*\)/, "if (#{v} >= 0 && code1_equal(#{argname}, #{key}_Table[#{v}].from))")
|
||||
"#{pre}{#{s}\n#{indent}}"
|
||||
}
|
||||
body.sub!(/\b(return\s+&)([^;]+);/, '\1'"#{key}_Table[#{v}].to;")
|
||||
"static const #{type} *\n#{name}(#{argdecl})\n{\n#{body}}"
|
||||
}
|
||||
src
|
||||
end
|
||||
|
||||
def display(dest)
|
||||
# print the header
|
||||
dest.print("/* DO NOT EDIT THIS FILE. */\n")
|
||||
|
@ -85,7 +136,8 @@ class CaseFolding
|
|||
|
||||
# CaseFold + CaseFold_Locale
|
||||
name = "CaseFold_11"
|
||||
print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
|
||||
data = print_table(dest, name, "CaseFold"=>fold, "CaseFold_Locale"=>fold_locale)
|
||||
dest.print lookup_hash(name, "CodePointList3", data)
|
||||
|
||||
# print unfolding data
|
||||
|
||||
|
@ -102,8 +154,6 @@ class CaseFolding
|
|||
print_table(dest, name, name=>unfold[2])
|
||||
|
||||
# table sizes
|
||||
fold_table_size = fold.size + fold_locale.size
|
||||
dest.printf("#define FOLD_TABLE_SIZE\t\t%d\n", (fold_table_size * 1.2))
|
||||
unfold1_table_size = unfold[0].size + unfold_locale[0].size
|
||||
dest.printf("#define UNFOLD1_TABLE_SIZE\t%d\n", (unfold1_table_size * 1.2))
|
||||
unfold2_table_size = unfold[1].size + unfold_locale[1].size
|
||||
|
|
Разница между файлами не показана из-за своего большого размера
Загрузить разницу
Загрузка…
Ссылка в новой задаче