* transcode_data.h (rb_transcoder): from_unit_length field added.
  from_utf8 field removed.

* tool/transcode-tblgen.rb: generate offsets range.
  follow rb_transcoder change.

* transcode.c (transcode_loop): don't use from_utf8.
  delimit the invalid region in units of from_unit_length bytes.

* enc/trans/iso2022.erb.c: follow rb_transcoder and 
  transcode_generate_node change.

* enc/trans/utf_16_32.erb.c: follow rb_transcoder and
  transcode_generate_node change.
  explicit :invalid map removed.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18445 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-08-08 15:48:17 +00:00
Родитель a456f022fc
Коммит 2833d9f95d
7 изменённых файлов: 109 добавлений и 68 удалений

Просмотреть файл

@ -1,3 +1,21 @@
Sat Aug 9 00:42:33 2008 Tanaka Akira <akr@fsij.org>
* transcode_data.h (rb_transcoder): from_unit_length field added.
from_utf8 field removed.
* tool/transcode-tblgen.rb: generate offsets range.
follow rb_transcoder change.
* transcode.c (transcode_loop): don't use from_utf8.
make invalid region from_unit_length wise.
* enc/trans/iso2022.erb.c: follow rb_transcoder and
transcode_generate_node change.
* enc/trans/utf_16_32.erb.c: follow rb_transcoder and
transcode_generate_node change.
explicit :invalid map removed.
Fri Aug 8 23:29:44 2008 Nobuyoshi Nakada <nobu@ruby-lang.org> Fri Aug 8 23:29:44 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* enc/depend (TRANSCSRCS): needs rule_subst to apply. * enc/depend (TRANSCSRCS): needs rule_subst to apply.

Просмотреть файл

@ -12,8 +12,8 @@
map_jisx0208_rest["{21-7e}"] = :func_so map_jisx0208_rest["{21-7e}"] = :func_so
%> %>
<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %> <%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp") %>
<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %> <%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest") %>
static VALUE static VALUE
fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l) fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l)
@ -57,7 +57,7 @@ fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, u
static const rb_transcoder static const rb_transcoder
rb_ISO_2022_JP_to_EUC_JP = { rb_ISO_2022_JP_to_EUC_JP = {
"ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0, "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3,
NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp
}; };
@ -71,7 +71,7 @@ rb_ISO_2022_JP_to_EUC_JP = {
} }
%> %>
<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %> <%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp") %>
static int static int
fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o) fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o)
@ -129,7 +129,7 @@ finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o)
static const rb_transcoder static const rb_transcoder
rb_EUC_JP_to_ISO_2022_JP = { rb_EUC_JP_to_ISO_2022_JP = {
"EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0, "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5,
NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp
}; };

Просмотреть файл

@ -183,14 +183,12 @@ fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned
map = {} map = {}
map["{00-d7,e0-ff}{00-ff}"] = :func_so map["{00-d7,e0-ff}{00-ff}"] = :func_so
map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so
map["{dc-df}{00-ff}"] = :invalid transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE")
map["{d8-db}{00-ff}{00-db,e0-ff}{00-ff}"] = :invalid
transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE", [])
%> %>
static const rb_transcoder static const rb_transcoder
rb_from_UTF_16BE = { rb_from_UTF_16BE = {
"UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0, "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4,
NULL, NULL, NULL, &fun_so_from_utf_16be NULL, NULL, NULL, &fun_so_from_utf_16be
}; };
@ -205,18 +203,13 @@ rb_from_UTF_16BE = {
map["f0{90-bf}{80-bf}{80-bf}"] = :func_so map["f0{90-bf}{80-bf}{80-bf}"] = :func_so
map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so
map["f4{80-8f}{80-bf}{80-bf}"] = :func_so map["f4{80-8f}{80-bf}{80-bf}"] = :func_so
map["{80-c1,f5-ff}"] = :invalid
map["e0{80-9f}"] = :invalid
map["ed{a0-bf}"] = :invalid
map["f0{80-8f}"] = :invalid
map["f4{90-bf}"] = :invalid
am = ActionMap.parse(map) am = ActionMap.parse(map)
transcode_generate_node(am, "to_UTF_16BE", [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf]) transcode_generate_node(am, "to_UTF_16BE")
%> %>
static const rb_transcoder static const rb_transcoder
rb_to_UTF_16BE = { rb_to_UTF_16BE = {
"UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1, "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_16be NULL, NULL, NULL, &fun_so_to_utf_16be
}; };
@ -224,20 +217,18 @@ rb_to_UTF_16BE = {
map = {} map = {}
map["{00-ff}{00-d7,e0-ff}"] = :func_so map["{00-ff}{00-d7,e0-ff}"] = :func_so
map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so
map["{00-ff}{dc-df}"] = :invalid transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE")
map["{00-ff}{d8-db}{00-ff}{00-db,e0-ff}"] = :invalid
transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE", [])
%> %>
static const rb_transcoder static const rb_transcoder
rb_from_UTF_16LE = { rb_from_UTF_16LE = {
"UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0, "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4,
NULL, NULL, NULL, &fun_so_from_utf_16le NULL, NULL, NULL, &fun_so_from_utf_16le
}; };
static const rb_transcoder static const rb_transcoder
rb_to_UTF_16LE = { rb_to_UTF_16LE = {
"UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1, "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_16le NULL, NULL, NULL, &fun_so_to_utf_16le
}; };
@ -245,21 +236,18 @@ rb_to_UTF_16LE = {
map = {} map = {}
map["0000{00-d7,e0-ff}{00-ff}"] = :func_so map["0000{00-d7,e0-ff}{00-ff}"] = :func_so
map["00{01-10}{00-ff}{00-ff}"] = :func_so map["00{01-10}{00-ff}{00-ff}"] = :func_so
map["00{11-ff}{00-ff}{00-ff}"] = :invalid transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE")
map["0000{d8-df}{00-ff}"] = :invalid
map["{01-ff}{00-ff}{00-ff}{00-ff}"] = :invalid
transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE", [])
%> %>
static const rb_transcoder static const rb_transcoder
rb_from_UTF_32BE = { rb_from_UTF_32BE = {
"UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0, "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4,
NULL, NULL, NULL, &fun_so_from_utf_32be NULL, NULL, NULL, &fun_so_from_utf_32be
}; };
static const rb_transcoder static const rb_transcoder
rb_to_UTF_32BE = { rb_to_UTF_32BE = {
"UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1, "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_32be NULL, NULL, NULL, &fun_so_to_utf_32be
}; };
@ -267,21 +255,18 @@ rb_to_UTF_32BE = {
map = {} map = {}
map["{00-ff}{00-d7,e0-ff}0000"] = :func_so map["{00-ff}{00-d7,e0-ff}0000"] = :func_so
map["{00-ff}{00-ff}{01-10}00"] = :func_so map["{00-ff}{00-ff}{01-10}00"] = :func_so
map["{00-ff}{00-ff}{00-ff}{01-ff}"] = :invalid transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE")
map["{00-ff}{00-ff}{11-ff}00"] = :invalid
map["{00-ff}{d8-df}0000"] = :invalid
transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE", [])
%> %>
static const rb_transcoder static const rb_transcoder
rb_from_UTF_32LE = { rb_from_UTF_32LE = {
"UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0, "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4,
NULL, NULL, NULL, &fun_so_from_utf_32le NULL, NULL, NULL, &fun_so_from_utf_32le
}; };
static const rb_transcoder static const rb_transcoder
rb_to_UTF_32LE = { rb_to_UTF_32LE = {
"UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1, "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4,
NULL, NULL, NULL, &fun_so_to_utf_32le NULL, NULL, NULL, &fun_so_to_utf_32le
}; };

Просмотреть файл

@ -267,8 +267,30 @@ class TestTranscode < Test::Unit::TestCase
"\x80".encode("UTF-32BE", "UTF-8", invalid: :replace)) "\x80".encode("UTF-32BE", "UTF-8", invalid: :replace))
assert_equal("\xFD\xFF\x00\x00".force_encoding("UTF-32LE"), assert_equal("\xFD\xFF\x00\x00".force_encoding("UTF-32LE"),
"\x80".encode("UTF-32LE", "UTF-8", invalid: :replace)) "\x80".encode("UTF-32LE", "UTF-8", invalid: :replace))
assert_equal("\uFFFD!", assert_equal("\uFFFD!",
"\x01\x00\x00\x00\x00\x00\x00\x21".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]") "\xdc\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
assert_equal("\uFFFD!",
"\xd8\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace))
assert_equal("\uFFFD!",
"\x00\xdc!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
assert_equal("\uFFFD!",
"\x00\xd8!\x00".encode("utf-8", "utf-16le", :invalid=>:replace))
assert_equal("\uFFFD!",
"\x01\x00\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]")
assert_equal("\uFFFD!",
"\x00\xff\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
assert_equal("\uFFFD!",
"\x00\x00\xd8\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace))
assert_equal("\uFFFD!",
"\xff!".encode("utf-8", "euc-jp", :invalid=>:replace))
assert_equal("\uFFFD!",
"\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
assert_equal("\uFFFD!",
"\x8f\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace))
end end
def test_undef_replace def test_undef_replace

Просмотреть файл

@ -213,13 +213,16 @@ class ActionMap
OffsetsMemo = {} OffsetsMemo = {}
InfosMemo = {} InfosMemo = {}
def format_offsets(offsets) def format_offsets(min, max, offsets)
code = "{\n" offsets = offsets[min..max]
code = "{ %d, %d,\n" % [min, max]
0.step(offsets.length-1,16) {|i| 0.step(offsets.length-1,16) {|i|
code << " " code << " "
code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('') code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('')
code << " " if i+8 < offsets.length
code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('') code << " "
code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('')
end
code << "\n" code << "\n"
} }
code << '}' code << '}'
@ -276,14 +279,22 @@ class ActionMap
offsets = [] offsets = []
infos = [] infos = []
infomap = {} infomap = {}
min = max = nil
table.each_with_index {|action, byte| table.each_with_index {|action, byte|
action ||= :invalid action ||= :invalid
if action != :invalid
min = byte if !min
max = byte
end
unless o = infomap[action] unless o = infomap[action]
infomap[action] = o = infos.length infomap[action] = o = infos.length
infos[o] = action infos[o] = action
end end
offsets[byte] = o offsets[byte] = o
} }
if !min
min = max = 0
end
if n = OffsetsMemo[offsets] if n = OffsetsMemo[offsets]
offsets_name = n offsets_name = n
@ -292,7 +303,7 @@ class ActionMap
offsets_name = "#{name}_offsets" offsets_name = "#{name}_offsets"
offsets_code = <<"End" offsets_code = <<"End"
static const unsigned char static const unsigned char
#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)}; #{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)};
End End
OffsetsMemo[offsets] = offsets_name OffsetsMemo[offsets] = offsets_name
end end
@ -324,24 +335,19 @@ End
PostMemo = {} PostMemo = {}
NextName = "a" NextName = "a"
def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil) def generate_node(code, name_hint=nil, valid_encoding=nil)
ranges = [0x00..0xff] if ranges.empty?
range = ranges.first
if n = PreMemo[[self,valid_encoding]] if n = PreMemo[[self,valid_encoding]]
return n return n
end end
table = Array.new(range.end - range.begin + 1) table = Array.new(0x100, :invalid)
each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding| each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding|
unless range === byte
raise "byte not in range"
end
if a = rest.empty_action if a = rest.empty_action
table[byte-range.begin] = a table[byte] = a
else else
name_hint2 = nil name_hint2 = nil
name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint
table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding) table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding)
end end
} }
@ -386,9 +392,8 @@ def transcode_compile_tree(name, from, map)
valid_encoding = nil valid_encoding = nil
end end
ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : []
code = '' code = ''
defined_name = am.generate_node(code, name, ranges, valid_encoding) defined_name = am.generate_node(code, name, valid_encoding)
return defined_name, code return defined_name, code
end end
@ -409,22 +414,22 @@ def transcode_tblgen(from, to, map)
real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map) real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map)
transcoder_name = "rb_#{tree_name}" transcoder_name = "rb_#{tree_name}"
TRANSCODERS << transcoder_name TRANSCODERS << transcoder_name
from_utf8 = from == 'UTF-8' ? 1 : 0 from_unit_length = UnitLength[from]
max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max
transcoder_code = <<"End" transcoder_code = <<"End"
static const rb_transcoder static const rb_transcoder
#{transcoder_name} = { #{transcoder_name} = {
#{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{max_output}, #{from_utf8}, #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output},
NULL, NULL, NULL, NULL,
}; };
End End
tree_code + "\n" + transcoder_code tree_code + "\n" + transcoder_code
end end
def transcode_generate_node(am, name_hint=nil, ranges=[]) def transcode_generate_node(am, name_hint=nil)
STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE
code = '' code = ''
am.generate_node(code, name_hint, ranges) am.generate_node(code, name_hint)
code code
end end
@ -436,6 +441,14 @@ def transcode_register_code
code code
end end
# Length in bytes of one code unit for each source encoding name.
# transcode_tblgen looks up UnitLength[from] and emits the result as the
# from_unit_length member of the generated rb_transcoder initializer.
# Encodings that are not listed here fall back to 1 via the hash default.
UnitLength = {
'UTF-16BE' => 2,
'UTF-16LE' => 2,
'UTF-32BE' => 4,
'UTF-32LE' => 4,
}
UnitLength.default = 1
ValidEncoding = { ValidEncoding = {
'1byte' => '{00-ff}', '1byte' => '{00-ff}',
'2byte' => '{00-ff}{00-ff}', '2byte' => '{00-ff}{00-ff}',

Просмотреть файл

@ -336,10 +336,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start;
const BYTE_LOOKUP *next_table; const BYTE_LOOKUP *next_table;
const unsigned char *char_start; const unsigned char *char_start;
unsigned int next_offset;
VALUE next_info; VALUE next_info;
unsigned char next_byte; unsigned char next_byte;
int from_utf8 = my_transcoder->from_utf8;
unsigned char *out_s = out_stop - my_transcoder->max_output + 1; unsigned char *out_s = out_stop - my_transcoder->max_output + 1;
rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding); rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding);
@ -355,8 +353,12 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
} }
next_byte = (unsigned char)*in_p++; next_byte = (unsigned char)*in_p++;
follow_byte: follow_byte:
next_offset = next_table->base[next_byte]; if (next_byte < next_table->base[0] || next_table->base[1] < next_byte)
next_info = (VALUE)next_table->info[next_offset]; next_info = INVALID;
else {
unsigned int next_offset = next_table->base[2+next_byte-next_table->base[0]];
next_info = (VALUE)next_table->info[next_offset];
}
follow_info: follow_info:
switch (next_info & 0x1F) { switch (next_info & 0x1F) {
case NOMAP: case NOMAP:
@ -370,14 +372,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
goto invalid; goto invalid;
} }
next_byte = (unsigned char)*in_p++; next_byte = (unsigned char)*in_p++;
if (from_utf8) {
if ((next_byte&0xC0) == 0x80)
next_byte -= 0x80;
else {
in_p--; /* may need to add more code later to revert other things */
goto invalid;
}
}
next_table = (const BYTE_LOOKUP *)next_info; next_table = (const BYTE_LOOKUP *)next_info;
goto follow_byte; goto follow_byte;
/* maybe rewrite the following cases to use fallthrough???? */ /* maybe rewrite the following cases to use fallthrough???? */
@ -411,7 +405,16 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p); out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p);
break; break;
case INVALID: case INVALID:
goto invalid; {
int unitlen = my_transcoder->from_unit_length;
if (in_stop - char_start <= unitlen)
in_p = in_stop;
else if (in_p - char_start <= unitlen)
in_p = char_start + unitlen;
else
in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen;
goto invalid;
}
case UNDEF: case UNDEF:
goto undef; goto undef;
} }

Просмотреть файл

@ -72,8 +72,8 @@ typedef struct rb_transcoder {
const char *from_encoding; const char *from_encoding;
const char *to_encoding; const char *to_encoding;
const BYTE_LOOKUP *conv_tree_start; const BYTE_LOOKUP *conv_tree_start;
int from_unit_length;
int max_output; int max_output;
int from_utf8;
VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */ VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */
VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */ VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */
int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */ int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */