diff --git a/ChangeLog b/ChangeLog index 2781b23428..2c0fe8041d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,21 @@ +Sat Aug 9 00:42:33 2008 Tanaka Akira + + * transcode_data.h (rb_transcoder): from_unit_length field added. + from_utf8 field removed. + + * tool/transcode-tblgen.rb: generate offsets range. + follow rb_transcoder change. + + * transcode.c (transcode_loop): don't use from_utf8. + make invalid region from_unit_length wise. + + * enc/trans/iso2022.erb.c: follow rb_transcoder and + transcode_generate_node change. + + * enc/trans/utf_16_32.erb.c: follow rb_transcoder and + transcode_generate_node change. + explicit :invalid map removed. + Fri Aug 8 23:29:44 2008 Nobuyoshi Nakada * enc/depend (TRANSCSRCS): needs rule_subst to apply. diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c index c3f6be693c..72553f4054 100644 --- a/enc/trans/iso2022.erb.c +++ b/enc/trans/iso2022.erb.c @@ -12,8 +12,8 @@ map_jisx0208_rest["{21-7e}"] = :func_so %> -<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp", []) %> -<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest", []) %> +<%= transcode_generate_node(ActionMap.parse(map), "iso2022jp_to_eucjp") %> +<%= transcode_generate_node(ActionMap.parse(map_jisx0208_rest), "iso2022jp_to_eucjp_jisx0208_rest") %> static VALUE fun_si_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l) @@ -57,7 +57,7 @@ fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, u static const rb_transcoder rb_ISO_2022_JP_to_EUC_JP = { - "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 3, 0, + "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3, NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp }; @@ -71,7 +71,7 @@ rb_ISO_2022_JP_to_EUC_JP = { } %> -<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp", []) %> +<%= transcode_generate_node(ActionMap.parse(map_eucjp), "eucjp_to_iso2022jp") %> static int fun_so_eucjp_to_iso2022jp(rb_transcoding *t, const unsigned char *s, size_t l, unsigned char *o) @@ -129,7 +129,7 @@ finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o) static const rb_transcoder rb_EUC_JP_to_ISO_2022_JP = { - "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 5, 0, + "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5, NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp }; diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c index 67f84e74bf..2cf7560b4e 100644 --- a/enc/trans/utf_16_32.erb.c +++ b/enc/trans/utf_16_32.erb.c @@ -183,14 +183,12 @@ fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned map = {} map["{00-d7,e0-ff}{00-ff}"] = :func_so map["{d8-db}{00-ff}{dc-df}{00-ff}"] = :func_so - map["{dc-df}{00-ff}"] = :invalid - map["{d8-db}{00-ff}{00-db,e0-ff}{00-ff}"] = :invalid - transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE", []) + transcode_generate_node(ActionMap.parse(map), "from_UTF_16BE") %> static const rb_transcoder rb_from_UTF_16BE = { - "UTF-16BE", "UTF-8", &from_UTF_16BE, 4, 0, + "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4, NULL, NULL, NULL, &fun_so_from_utf_16be }; @@ -205,18 +203,13 @@ rb_from_UTF_16BE = { map["f0{90-bf}{80-bf}{80-bf}"] = :func_so map["{f1-f3}{80-bf}{80-bf}{80-bf}"] = :func_so map["f4{80-8f}{80-bf}{80-bf}"] = :func_so - map["{80-c1,f5-ff}"] = :invalid - map["e0{80-9f}"] = :invalid - map["ed{a0-bf}"] = :invalid - map["f0{80-8f}"] = :invalid - map["f4{90-bf}"] = :invalid am = ActionMap.parse(map) - transcode_generate_node(am, "to_UTF_16BE", [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf]) + transcode_generate_node(am, "to_UTF_16BE") %> static const rb_transcoder rb_to_UTF_16BE = { - "UTF-8", "UTF-16BE", &to_UTF_16BE, 4, 1, + "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4, NULL, NULL, NULL, &fun_so_to_utf_16be }; @@ -224,20 +217,18 @@ rb_to_UTF_16BE = { map = {} map["{00-ff}{00-d7,e0-ff}"] = :func_so map["{00-ff}{d8-db}{00-ff}{dc-df}"] = :func_so - map["{00-ff}{dc-df}"] = :invalid - map["{00-ff}{d8-db}{00-ff}{00-db,e0-ff}"] = :invalid - transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE", []) + transcode_generate_node(ActionMap.parse(map), "from_UTF_16LE") %> static const rb_transcoder rb_from_UTF_16LE = { - "UTF-16LE", "UTF-8", &from_UTF_16LE, 4, 0, + "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4, NULL, NULL, NULL, &fun_so_from_utf_16le }; static const rb_transcoder rb_to_UTF_16LE = { - "UTF-8", "UTF-16LE", &to_UTF_16BE, 4, 1, + "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4, NULL, NULL, NULL, &fun_so_to_utf_16le }; @@ -245,21 +236,18 @@ rb_to_UTF_16LE = { map = {} map["0000{00-d7,e0-ff}{00-ff}"] = :func_so map["00{01-10}{00-ff}{00-ff}"] = :func_so - map["00{11-ff}{00-ff}{00-ff}"] = :invalid - map["0000{d8-df}{00-ff}"] = :invalid - map["{01-ff}{00-ff}{00-ff}{00-ff}"] = :invalid - transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE", []) + transcode_generate_node(ActionMap.parse(map), "from_UTF_32BE") %> static const rb_transcoder rb_from_UTF_32BE = { - "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 0, + "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4, NULL, NULL, NULL, &fun_so_from_utf_32be }; static const rb_transcoder rb_to_UTF_32BE = { - "UTF-8", "UTF-32BE", &to_UTF_16BE, 4, 1, + "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4, NULL, NULL, NULL, &fun_so_to_utf_32be }; @@ -267,21 +255,18 @@ rb_to_UTF_32BE = { map = {} map["{00-ff}{00-d7,e0-ff}0000"] = :func_so map["{00-ff}{00-ff}{01-10}00"] = :func_so - map["{00-ff}{00-ff}{00-ff}{01-ff}"] = :invalid - map["{00-ff}{00-ff}{11-ff}00"] = :invalid - map["{00-ff}{d8-df}0000"] = :invalid - transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE", []) + transcode_generate_node(ActionMap.parse(map), "from_UTF_32LE") %> static const rb_transcoder rb_from_UTF_32LE = { - "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 0, + "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4, NULL, NULL, NULL, &fun_so_from_utf_32le }; static const rb_transcoder rb_to_UTF_32LE = { - "UTF-8", "UTF-32LE", &to_UTF_16BE, 4, 1, + "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4, NULL, NULL, NULL, &fun_so_to_utf_32le }; diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb index 26e56ffb0c..095028e42a 100644 --- a/test/ruby/test_transcode.rb +++ b/test/ruby/test_transcode.rb @@ -267,8 +267,30 @@ class TestTranscode < Test::Unit::TestCase "\x80".encode("UTF-32BE", "UTF-8", invalid: :replace)) assert_equal("\xFD\xFF\x00\x00".force_encoding("UTF-32LE"), "\x80".encode("UTF-32LE", "UTF-8", invalid: :replace)) + assert_equal("\uFFFD!", - "\x01\x00\x00\x00\x00\x00\x00\x21".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]") + "\xdc\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace)) + assert_equal("\uFFFD!", + "\xd8\x00\x00!".encode("utf-8", "utf-16be", :invalid=>:replace)) + + assert_equal("\uFFFD!", + "\x00\xdc!\x00".encode("utf-8", "utf-16le", :invalid=>:replace)) + assert_equal("\uFFFD!", + "\x00\xd8!\x00".encode("utf-8", "utf-16le", :invalid=>:replace)) + + assert_equal("\uFFFD!", + "\x01\x00\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace), "[ruby-dev:35726]") + assert_equal("\uFFFD!", + "\x00\xff\x00\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace)) + assert_equal("\uFFFD!", + "\x00\x00\xd8\x00\x00\x00\x00!".encode("utf-8", "utf-32be", :invalid=>:replace)) + + assert_equal("\uFFFD!", + "\xff!".encode("utf-8", "euc-jp", :invalid=>:replace)) + assert_equal("\uFFFD!", + "\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace)) + assert_equal("\uFFFD!", + "\x8f\xa1!".encode("utf-8", "euc-jp", :invalid=>:replace)) end def test_undef_replace diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb index 767ea0bbf3..3a20b3f0b1 100644 --- a/tool/transcode-tblgen.rb +++ b/tool/transcode-tblgen.rb @@ -213,13 +213,16 @@ class ActionMap OffsetsMemo = {} InfosMemo = {} - def format_offsets(offsets) - code = "{\n" + def format_offsets(min, max, offsets) + offsets = offsets[min..max] + code = "{ %d, %d,\n" % [min, max] 0.step(offsets.length-1,16) {|i| code << " " code << offsets[i,8].map {|off| "%3d," % off.to_s }.join('') - code << " " - code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('') + if i+8 < offsets.length + code << " " + code << offsets[i+8,8].map {|off| "%3d," % off.to_s }.join('') + end code << "\n" } code << '}' @@ -276,14 +279,22 @@ class ActionMap offsets = [] infos = [] infomap = {} + min = max = nil table.each_with_index {|action, byte| action ||= :invalid + if action != :invalid + min = byte if !min + max = byte + end unless o = infomap[action] infomap[action] = o = infos.length infos[o] = action end offsets[byte] = o } + if !min + min = max = 0 + end if n = OffsetsMemo[offsets] offsets_name = n @@ -292,7 +303,7 @@ class ActionMap offsets_name = "#{name}_offsets" offsets_code = <<"End" static const unsigned char -#{offsets_name}[#{offsets.length}] = #{format_offsets(offsets)}; +#{offsets_name}[#{2+max-min+1}] = #{format_offsets(min,max,offsets)}; End OffsetsMemo[offsets] = offsets_name end @@ -324,24 +335,19 @@ End PostMemo = {} NextName = "a" - def generate_node(code, name_hint=nil, ranges=[], valid_encoding=nil) - ranges = [0x00..0xff] if ranges.empty? - range = ranges.first + def generate_node(code, name_hint=nil, valid_encoding=nil) if n = PreMemo[[self,valid_encoding]] return n end - table = Array.new(range.end - range.begin + 1) + table = Array.new(0x100, :invalid) each_firstbyte(valid_encoding) {|byte, rest, rest_valid_encoding| - unless range === byte - raise "byte not in range" - end if a = rest.empty_action - table[byte-range.begin] = a + table[byte] = a else name_hint2 = nil name_hint2 = "#{name_hint}_#{'%02X' % byte}" if name_hint - table[byte-range.begin] = "&" + rest.generate_node(code, name_hint2, ranges[1..-1], rest_valid_encoding) + table[byte] = "&" + rest.generate_node(code, name_hint2, rest_valid_encoding) end } @@ -386,9 +392,8 @@ def transcode_compile_tree(name, from, map) valid_encoding = nil end - ranges = from == "UTF-8" ? [0x00..0xff, 0x80..0xbf, 0x80..0xbf, 0x80..0xbf] : [] code = '' - defined_name = am.generate_node(code, name, ranges, valid_encoding) + defined_name = am.generate_node(code, name, valid_encoding) return defined_name, code end @@ -409,22 +414,22 @@ def transcode_tblgen(from, to, map) real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map) transcoder_name = "rb_#{tree_name}" TRANSCODERS << transcoder_name - from_utf8 = from == 'UTF-8' ? 1 : 0 + from_unit_length = UnitLength[from] max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max transcoder_code = <<"End" static const rb_transcoder #{transcoder_name} = { - #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{max_output}, #{from_utf8}, + #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output}, NULL, NULL, }; End tree_code + "\n" + transcoder_code end -def transcode_generate_node(am, name_hint=nil, ranges=[]) +def transcode_generate_node(am, name_hint=nil) STDERR.puts "converter for #{name_hint}" if VERBOSE_MODE code = '' - am.generate_node(code, name_hint, ranges) + am.generate_node(code, name_hint) code end @@ -436,6 +441,14 @@ def transcode_register_code code end +UnitLength = { + 'UTF-16BE' => 2, + 'UTF-16LE' => 2, + 'UTF-32BE' => 4, + 'UTF-32LE' => 4, +} +UnitLength.default = 1 + ValidEncoding = { '1byte' => '{00-ff}', '2byte' => '{00-ff}{00-ff}', diff --git a/transcode.c b/transcode.c index 3a1ab70a81..75a802572c 100644 --- a/transcode.c +++ b/transcode.c @@ -336,10 +336,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; const BYTE_LOOKUP *next_table; const unsigned char *char_start; - unsigned int next_offset; VALUE next_info; unsigned char next_byte; - int from_utf8 = my_transcoder->from_utf8; unsigned char *out_s = out_stop - my_transcoder->max_output + 1; rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding); @@ -355,8 +353,12 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, } next_byte = (unsigned char)*in_p++; follow_byte: - next_offset = next_table->base[next_byte]; - next_info = (VALUE)next_table->info[next_offset]; + if (next_byte < next_table->base[0] || next_table->base[1] < next_byte) + next_info = INVALID; + else { + unsigned int next_offset = next_table->base[2+next_byte-next_table->base[0]]; + next_info = (VALUE)next_table->info[next_offset]; + } follow_info: switch (next_info & 0x1F) { case NOMAP: @@ -370,14 +372,6 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, goto invalid; } next_byte = (unsigned char)*in_p++; - if (from_utf8) { - if ((next_byte&0xC0) == 0x80) - next_byte -= 0x80; - else { - in_p--; /* may need to add more code later to revert other things */ - goto invalid; - } - } next_table = (const BYTE_LOOKUP *)next_info; goto follow_byte; /* maybe rewrite the following cases to use fallthrough???? */ @@ -411,7 +405,16 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p); break; case INVALID: - goto invalid; + { + int unitlen = my_transcoder->from_unit_length; + if (in_stop - char_start <= unitlen) + in_p = in_stop; + else if (in_p - char_start <= unitlen) + in_p = char_start + unitlen; + else + in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen; + goto invalid; + } case UNDEF: goto undef; } diff --git a/transcode_data.h b/transcode_data.h index 92f8ade436..ba2e6e99b3 100644 --- a/transcode_data.h +++ b/transcode_data.h @@ -72,8 +72,8 @@ typedef struct rb_transcoder { const char *from_encoding; const char *to_encoding; const BYTE_LOOKUP *conv_tree_start; + int from_unit_length; int max_output; - int from_utf8; VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */ VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */ int (*func_io)(rb_transcoding*, VALUE, const unsigned char*); /* info -> output */