From 139234e1a091ac3167d3bebdfcd29b3952665334 Mon Sep 17 00:00:00 2001 From: akr Date: Sat, 9 Aug 2008 06:02:01 +0000 Subject: [PATCH] * transcode_data.h (rb_transcoding): add fields for restartable transcoding. (rb_transcoder): add max_input field. from_unit_length field is renamed to input_unit_length. * tool/transcode-tblgen.rb: generate max_input field. * enc/trans/iso2022.erb.c: follow rb_transcoder change. * enc/trans/utf_16_32.erb.c: ditto. * transcode.c (PARTIAL_INPUT): new constant. (transcode_char_start): new function. (transcode_result_t): new type. (transcode_restartable): new function. (more_output_buffer): new function. (transcode_loop): use transcode_restartable. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18452 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 20 ++ enc/trans/iso2022.erb.c | 10 +- enc/trans/utf_16_32.erb.c | 40 +++- tool/transcode-tblgen.rb | 35 +++- transcode.c | 401 +++++++++++++++++++++++++++++++------- transcode_data.h | 15 +- 6 files changed, 439 insertions(+), 82 deletions(-) diff --git a/ChangeLog b/ChangeLog index 40f67df545..902bd6831b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,23 @@ +Sat Aug 9 14:39:34 2008 Tanaka Akira + + * transcode_data.h (rb_transcoding): add fields for restartable + transcoding. + (rb_transcoder): add max_input field. + from_unit_length field is renamed to input_unit_length. + + * tool/transcode-tblgen.rb: generate max_input field. + + * enc/trans/iso2022.erb.c: follow rb_transcoder change. + + * enc/trans/utf_16_32.erb.c: ditto. + + * transcode.c (PARTIAL_INPUT): new constant. + (transcode_char_start): new function. + (transcode_result_t): new type. + (transcode_restartable): new function. + (more_output_buffer): new function. + (transcode_loop): use transcode_restartable. + Sat Aug 9 13:35:08 2008 Nobuyoshi Nakada * stable/ext/socket/socket.c (NI_MAXHOST, NI_MAXSERV): fixed invalid diff --git a/enc/trans/iso2022.erb.c b/enc/trans/iso2022.erb.c index 72553f4054..3209fad163 100644 --- a/enc/trans/iso2022.erb.c +++ b/enc/trans/iso2022.erb.c @@ -57,7 +57,10 @@ fun_so_iso2022jp_to_eucjp(rb_transcoding* t, const unsigned char* s, size_t l, u static const rb_transcoder rb_ISO_2022_JP_to_EUC_JP = { - "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, 1, 3, + "ISO-2022-JP", "EUC-JP", &iso2022jp_to_eucjp, + 1, /* input_unit_length */ + 3, /* max_input */ + 3, /* max_output */ NULL, fun_si_iso2022jp_to_eucjp, NULL, fun_so_iso2022jp_to_eucjp }; @@ -129,7 +132,10 @@ finish_eucjp_to_iso2022jp(rb_transcoding *t, unsigned char *o) static const rb_transcoder rb_EUC_JP_to_ISO_2022_JP = { - "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, 1, 5, + "EUC-JP", "ISO-2022-JP", &eucjp_to_iso2022jp, + 1, /* input_unit_length */ + 3, /* max_input */ + 5, /* max_output */ NULL, NULL, NULL, fun_so_eucjp_to_iso2022jp, finish_eucjp_to_iso2022jp }; diff --git a/enc/trans/utf_16_32.erb.c b/enc/trans/utf_16_32.erb.c index 86f1ed4727..aea2ab50a8 100644 --- a/enc/trans/utf_16_32.erb.c +++ b/enc/trans/utf_16_32.erb.c @@ -231,7 +231,10 @@ fun_so_to_utf_32le(rb_transcoding* t, const unsigned char* s, size_t l, unsigned static const rb_transcoder rb_from_UTF_16BE = { - "UTF-16BE", "UTF-8", &from_UTF_16BE, 2, 4, + "UTF-16BE", "UTF-8", &from_UTF_16BE, + 2, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_from_utf_16be }; @@ -252,7 +255,10 @@ rb_from_UTF_16BE = { static const rb_transcoder rb_to_UTF_16BE = { - "UTF-8", "UTF-16BE", &to_UTF_16BE, 1, 4, + "UTF-8", "UTF-16BE", &to_UTF_16BE, + 1, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_to_utf_16be }; @@ -265,13 +271,19 @@ rb_to_UTF_16BE = { static const rb_transcoder rb_from_UTF_16LE = { - "UTF-16LE", "UTF-8", &from_UTF_16LE, 2, 4, + "UTF-16LE", "UTF-8", &from_UTF_16LE, + 2, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_from_utf_16le }; static const rb_transcoder rb_to_UTF_16LE = { - "UTF-8", "UTF-16LE", &to_UTF_16BE, 1, 4, + "UTF-8", "UTF-16LE", &to_UTF_16BE, + 1, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_to_utf_16le }; @@ -284,13 +296,19 @@ rb_to_UTF_16LE = { static const rb_transcoder rb_from_UTF_32BE = { - "UTF-32BE", "UTF-8", &from_UTF_32BE, 4, 4, + "UTF-32BE", "UTF-8", &from_UTF_32BE, + 4, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_from_utf_32be }; static const rb_transcoder rb_to_UTF_32BE = { - "UTF-8", "UTF-32BE", &to_UTF_16BE, 1, 4, + "UTF-8", "UTF-32BE", &to_UTF_16BE, + 1, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_to_utf_32be }; @@ -303,13 +321,19 @@ rb_to_UTF_32BE = { static const rb_transcoder rb_from_UTF_32LE = { - "UTF-32LE", "UTF-8", &from_UTF_32LE, 4, 4, + "UTF-32LE", "UTF-8", &from_UTF_32LE, + 4, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_from_utf_32le }; static const rb_transcoder rb_to_UTF_32LE = { - "UTF-8", "UTF-32LE", &to_UTF_16BE, 1, 4, + "UTF-8", "UTF-32LE", &to_UTF_16BE, + 1, /* input_unit_length */ + 4, /* max_input */ + 4, /* max_output */ NULL, NULL, NULL, &fun_so_to_utf_32le }; diff --git a/tool/transcode-tblgen.rb b/tool/transcode-tblgen.rb index 3a20b3f0b1..119fa0d1cb 100644 --- a/tool/transcode-tblgen.rb +++ b/tool/transcode-tblgen.rb @@ -101,6 +101,22 @@ class StrSet "\#<#{self.class}: #{self.to_s}>" end + def min_length + if @pat.empty? + nil + else + @pat.map {|seq| seq.length }.min + end + end + + def max_length + if @pat.empty? + nil + else + @pat.map {|seq| seq.length }.max + end + end + def emptyable? @pat.any? {|seq| seq.empty? @@ -170,6 +186,10 @@ class ActionMap ">" end + def max_input_length + @map.keys.map {|k| k.max_length }.max + end + def empty_action @map.each {|ss, action| return action if ss.emptyable? @@ -386,6 +406,8 @@ def transcode_compile_tree(name, from, map) } am = ActionMap.parse(h) + max_input = am.max_input_length + if ValidEncoding[from] valid_encoding = StrSet.parse(ValidEncoding[from]) else @@ -394,7 +416,7 @@ def transcode_compile_tree(name, from, map) code = '' defined_name = am.generate_node(code, name, valid_encoding) - return defined_name, code + return defined_name, code, max_input end TRANSCODERS = [] @@ -411,16 +433,19 @@ def transcode_tblgen(from, to, map) tree_name = "from_#{id_from}_to_#{id_to}" end map = encode_utf8(map) - real_tree_name, tree_code = transcode_compile_tree(tree_name, from, map) + real_tree_name, tree_code, max_input = transcode_compile_tree(tree_name, from, map) transcoder_name = "rb_#{tree_name}" TRANSCODERS << transcoder_name - from_unit_length = UnitLength[from] + input_unit_length = UnitLength[from] max_output = map.map {|k,v| String === v ? v.length/2 : 1 }.max transcoder_code = <<"End" static const rb_transcoder #{transcoder_name} = { - #{c_esc from}, #{c_esc to}, &#{real_tree_name}, #{from_unit_length}, #{max_output}, - NULL, NULL, + #{c_esc from}, #{c_esc to}, &#{real_tree_name}, + #{input_unit_length}, /* input_unit_length */ + #{max_input}, /* max_input */ + #{max_output}, /* max_output */ + NULL, NULL, NULL, NULL, NULL }; End tree_code + "\n" + transcoder_code diff --git a/transcode.c b/transcode.c index 75a802572c..4c979e8c41 100644 --- a/transcode.c +++ b/transcode.c @@ -20,6 +20,7 @@ static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace; #define INVALID_REPLACE 0x2 #define UNDEF_IGNORE 0x10 #define UNDEF_REPLACE 0x20 +#define PARTIAL_INPUT 0x100 /* * Dispatch data and logic @@ -324,34 +325,117 @@ output_replacement_character(unsigned char **out_pp, rb_encoding *enc) /* * Transcoding engine logic */ -static void -transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, - const unsigned char *in_stop, unsigned char *out_stop, - const rb_transcoder *my_transcoder, - rb_transcoding *my_transcoding, - const int opt) -{ - const unsigned char *in_p = *in_pos; - unsigned char *out_p = *out_pos; - const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; - const BYTE_LOOKUP *next_table; - const unsigned char *char_start; - VALUE next_info; - unsigned char next_byte; - unsigned char *out_s = out_stop - my_transcoder->max_output + 1; - rb_encoding *to_encoding = rb_enc_find(my_transcoder->to_encoding); - while (in_p < in_stop) { - char_start = in_p; - next_table = conv_tree_start; - if (out_p >= out_s) { - int len = (out_p - *out_pos); - int new_len = (len + my_transcoder->max_output) * 2; - *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); - out_p = *out_pos + len; - out_s = *out_pos + new_len - my_transcoder->max_output; - } +static const unsigned char * +transcode_char_start(rb_transcoding *my_transcoding, + const unsigned char **in_pos, + const unsigned char *in_p, + int readlen) +{ + const unsigned char *ptr; + if (in_p - *in_pos < readlen) { + int restlen = readlen - my_transcoding->readlen; + MEMCPY(TRANSCODING_READBUF(my_transcoding) + my_transcoding->readlen, + in_p - restlen, unsigned char, restlen); + my_transcoding->readlen = readlen; + ptr = TRANSCODING_READBUF(my_transcoding); + } + else { + ptr = in_p - readlen; + } + return ptr; +} + +typedef enum { + transcode_invalid_input, + transcode_undefined_conversion, + transcode_obuf_full, + transcode_ibuf_empty, + transcode_finished, +} transcode_result_t; + +static transcode_result_t +transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos, + const unsigned char *in_stop, unsigned char *out_stop, + const rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding, + const int opt) + +{ + int unitlen = my_transcoder->input_unit_length; + + const unsigned char *in_p; + unsigned char *out_p; + int readlen; + const BYTE_LOOKUP *next_table; + + unsigned char empty_buf; + unsigned char *empty_ptr = &empty_buf; + + if (!in_pos) { + in_pos = (const unsigned char **)&empty_ptr; + in_stop = empty_ptr; + } + + if (!out_pos) { + out_pos = &empty_ptr; + out_stop = empty_ptr; + } + + in_p = *in_pos; + out_p = *out_pos; + readlen = my_transcoding->readlen; + next_table = my_transcoding->next_table; + +#define SUSPEND(ret, num) \ + do { \ + my_transcoding->resume_position = (num); \ + if (my_transcoding->readlen < readlen) \ + MEMCPY(TRANSCODING_READBUF(my_transcoding)+my_transcoding->readlen, \ + in_p - (readlen-my_transcoding->readlen), \ + unsigned char, \ + readlen-my_transcoding->readlen); \ + *in_pos = in_p; \ + *out_pos = out_p; \ + my_transcoding->readlen = readlen; \ + my_transcoding->next_table = next_table; \ + return ret; \ + resume_label ## num:; \ + } while (0) + + switch (my_transcoding->resume_position) { + case 0: break; + case 1: goto resume_label1; + case 2: goto resume_label2; + case 3: goto resume_label3; + case 4: goto resume_label4; + case 5: goto resume_label5; + case 6: goto resume_label6; + case 7: goto resume_label7; + case 8: goto resume_label8; + case 9: goto resume_label9; + case 10: goto resume_label10; + case 11: goto resume_label11; + case 12: goto resume_label12; + case 13: goto resume_label13; + case 14: goto resume_label14; + } + + while (1) { + unsigned char next_byte; + VALUE next_info; + + if (in_stop <= in_p) { + if (!(opt & PARTIAL_INPUT)) + break; + SUSPEND(transcode_ibuf_empty, 7); + continue; + } + + my_transcoding->readlen = readlen = 0; + next_table = my_transcoder->conv_tree_start; next_byte = (unsigned char)*in_p++; + readlen++; follow_byte: if (next_byte < next_table->base[0] || next_table->base[1] < next_byte) next_info = INVALID; @@ -361,32 +445,42 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, } follow_info: switch (next_info & 0x1F) { - case NOMAP: + case NOMAP: /* xxx: copy last byte only? */ + while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 3); } *out_p++ = next_byte; continue; case 0x00: case 0x04: case 0x08: case 0x0C: case 0x10: case 0x14: case 0x18: case 0x1C: - if (in_p >= in_stop) { - /* todo: deal with the case of backtracking */ - /* todo: deal with incomplete input (streaming) */ - goto invalid; + while (in_p >= in_stop) { + if (!(opt & PARTIAL_INPUT)) + goto invalid; + SUSPEND(transcode_ibuf_empty, 5); } next_byte = (unsigned char)*in_p++; + readlen++; next_table = (const BYTE_LOOKUP *)next_info; goto follow_byte; /* maybe rewrite the following cases to use fallthrough???? */ case ZERObt: /* drop input */ continue; case ONEbt: + while (out_stop - out_p < 1) { SUSPEND(transcode_obuf_full, 9); } *out_p++ = getBT1(next_info); continue; case TWObt: + while (out_stop - out_p < 2) { SUSPEND(transcode_obuf_full, 10); } *out_p++ = getBT1(next_info); *out_p++ = getBT2(next_info); continue; + case THREEbt: + while (out_stop - out_p < 3) { SUSPEND(transcode_obuf_full, 11); } + *out_p++ = getBT1(next_info); + *out_p++ = getBT2(next_info); + *out_p++ = getBT3(next_info); + continue; case FOURbt: + while (out_stop - out_p < 4) { SUSPEND(transcode_obuf_full, 12); } *out_p++ = getBT0(next_info); - case THREEbt: /* fall through */ *out_p++ = getBT1(next_info); *out_p++ = getBT2(next_info); *out_p++ = getBT3(next_info); @@ -395,70 +489,245 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, next_info = (VALUE)(*my_transcoder->func_ii)(my_transcoding, next_info); goto follow_info; case FUNsi: - next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, char_start, (size_t)(in_p-char_start)); - goto follow_info; - break; + { + const unsigned char *char_start; + char_start = transcode_char_start(my_transcoding, in_pos, in_p, readlen); + next_info = (VALUE)(*my_transcoder->func_si)(my_transcoding, char_start, (size_t)readlen); + break; + } case FUNio: + while (out_stop - out_p < my_transcoder->max_output) { SUSPEND(transcode_obuf_full, 13); } out_p += (VALUE)(*my_transcoder->func_io)(my_transcoding, next_info, out_p); break; case FUNso: - out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)(in_p-char_start), out_p); - break; + { + const unsigned char *char_start; + while (out_stop - out_p < my_transcoder->max_output) { SUSPEND(transcode_obuf_full, 14); } + char_start = transcode_char_start(my_transcoding, in_pos, in_p, readlen); + out_p += (VALUE)(*my_transcoder->func_so)(my_transcoding, char_start, (size_t)readlen, out_p); + break; + } case INVALID: { - int unitlen = my_transcoder->from_unit_length; - if (in_stop - char_start <= unitlen) - in_p = in_stop; - else if (in_p - char_start <= unitlen) - in_p = char_start + unitlen; - else - in_p = char_start + ((in_p - char_start - 1) / unitlen) * unitlen; + if (readlen <= unitlen) { + while ((opt & PARTIAL_INPUT) && readlen + (in_stop - in_p) < unitlen) { + readlen += in_stop - in_p; + in_p = in_stop; + SUSPEND(transcode_ibuf_empty, 8); + } + if (readlen + (in_stop - in_p) <= unitlen) + in_p = in_stop; + else + in_p += unitlen - readlen; + } + else { + /* xxx: possibly in_p is lesser than *in_pos + * caller may want to access readbuf. */ + in_p += ((readlen - 1) / unitlen) * unitlen - readlen; + } goto invalid; } case UNDEF: goto undef; } continue; + invalid: + SUSPEND(transcode_invalid_input, 1); + continue; + + undef: + SUSPEND(transcode_undefined_conversion, 2); + continue; + } + + /* cleanup */ + if (my_transcoder->finish_func) { + while (out_stop - out_p < my_transcoder->max_output) { + SUSPEND(transcode_obuf_full, 4); + } + out_p += my_transcoder->finish_func(my_transcoding, out_p); + } + while (1) + SUSPEND(transcode_finished, 6); +#undef SUSPEND +} + +static void +more_output_buffer( + rb_transcoding *my_transcoding, + unsigned char **out_start_ptr, + unsigned char **out_pos, + unsigned char **out_stop_ptr) +{ + size_t len = (*out_pos - *out_start_ptr); + size_t new_len = (len + my_transcoding->transcoder->max_output) * 2; + *out_start_ptr = (*my_transcoding->flush_func)(my_transcoding, len, new_len); + *out_pos = *out_start_ptr + len; + *out_stop_ptr = *out_start_ptr + new_len; +} + +#if 1 +static void +transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, + const unsigned char *in_stop, unsigned char *out_stop, + const rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding, + const int opt) +{ + transcode_result_t ret; + unsigned char *out_start = *out_pos; + + my_transcoding->resume_position = 0; + my_transcoding->readlen = 0; + + if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) { + my_transcoding->readbuf.ptr = xmalloc(my_transcoder->max_input); + } +#define CLEANUP \ + do { \ + if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) \ + xfree(my_transcoding->readbuf.ptr); \ + } while(0) + +resume: + ret = transcode_restartable(in_pos, out_pos, in_stop, out_stop, my_transcoder, my_transcoding, opt); + if (ret == transcode_invalid_input) { /* deal with invalid byte sequence */ /* todo: add more alternative behaviors */ if (opt&INVALID_IGNORE) { - continue; + goto resume; } else if (opt&INVALID_REPLACE) { - output_replacement_character(&out_p, to_encoding); - continue; + if (out_stop - *out_pos < my_transcoder->max_output) + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding)); + goto resume; } + CLEANUP; rb_raise(TRANSCODE_ERROR, "invalid byte sequence"); - continue; - undef: + } + if (ret == transcode_undefined_conversion) { /* valid character in from encoding * but no related character(s) in to encoding */ /* todo: add more alternative behaviors */ if (opt&UNDEF_IGNORE) { - continue; + goto resume; } else if (opt&UNDEF_REPLACE) { - output_replacement_character(&out_p, to_encoding); - continue; + if (out_stop - *out_pos < my_transcoder->max_output) + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding)); + goto resume; } - rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)"); - continue; + CLEANUP; + rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)"); } - /* cleanup */ - if (my_transcoder->finish_func) { - if (out_p >= out_s) { - int len = (out_p - *out_pos); - int new_len = (len + my_transcoder->max_output) * 2; - *out_pos = (*my_transcoding->flush_func)(my_transcoding, len, new_len); - out_p = *out_pos + len; - out_s = *out_pos + new_len - my_transcoder->max_output; - } - out_p += my_transcoder->finish_func(my_transcoding, out_p); + if (ret == transcode_obuf_full) { + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + goto resume; } - *in_pos = in_p; - *out_pos = out_p; + + CLEANUP; + return; +#undef CLEANUP } +#else +/* sample transcode_loop implementation in byte-by-byte stream style */ +static void +transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, + const unsigned char *in_stop, unsigned char *out_stop, + const rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding, + const int opt) +{ + transcode_result_t ret; + unsigned char *out_start = *out_pos; + const unsigned char *ptr; + + my_transcoding->resume_position = 0; + my_transcoding->readlen = 0; + + if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) { + my_transcoding->readbuf.ptr = xmalloc(my_transcoder->max_input); + } +#define CLEANUP \ + do { \ + if (sizeof(my_transcoding->readbuf.ary) < my_transcoder->max_input) \ + xfree(my_transcoding->readbuf.ptr); \ + } while(0) + + ret = transcode_ibuf_empty; + ptr = *in_pos; + while (ret != transcode_finished) { + unsigned char input_byte; + const unsigned char *p = &input_byte; + + if (ret == transcode_ibuf_empty) { + if (ptr < in_stop) { + input_byte = *ptr; + ret = transcode_restartable(&p, out_pos, p+1, out_stop, my_transcoder, my_transcoding, opt|PARTIAL_INPUT); + } + else { + ret = transcode_restartable(NULL, out_pos, NULL, out_stop, my_transcoder, my_transcoding, opt); + } + } + else { + ret = transcode_restartable(NULL, out_pos, NULL, out_stop, my_transcoder, my_transcoding, opt|PARTIAL_INPUT); + } + if (&input_byte != p) + ptr += p - &input_byte; + switch (ret) { + case transcode_invalid_input: + /* deal with invalid byte sequence */ + /* todo: add more alternative behaviors */ + if (opt&INVALID_IGNORE) { + break; + } + else if (opt&INVALID_REPLACE) { + if (out_stop - *out_pos < my_transcoder->max_output) + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding)); + break; + } + CLEANUP; + rb_raise(TRANSCODE_ERROR, "invalid byte sequence"); + break; + + case transcode_undefined_conversion: + /* valid character in from encoding + * but no related character(s) in to encoding */ + /* todo: add more alternative behaviors */ + if (opt&UNDEF_IGNORE) { + break; + } + else if (opt&UNDEF_REPLACE) { + if (out_stop - *out_pos < my_transcoder->max_output) + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + output_replacement_character(out_pos, rb_enc_find(my_transcoder->to_encoding)); + break; + } + CLEANUP; + rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)"); + break; + + case transcode_obuf_full: + more_output_buffer(my_transcoding, &out_start, out_pos, &out_stop); + break; + + case transcode_ibuf_empty: + break; + + case transcode_finished: + break; + } + } + CLEANUP; + *in_pos = in_stop; + return; +#undef CLEANUP +} +#endif /* diff --git a/transcode_data.h b/transcode_data.h index ba2e6e99b3..3801c38ec8 100644 --- a/transcode_data.h +++ b/transcode_data.h @@ -64,15 +64,28 @@ typedef struct rb_transcoding { or NULL if something else is being converted */ unsigned char *(*flush_func)(struct rb_transcoding*, int, int); + int resume_position; + const BYTE_LOOKUP *next_table; + int readlen; + union { + unsigned char ary[8]; /* max_input <= sizeof(ary) */ + unsigned char *ptr; /* length is max_input */ + } readbuf; + unsigned char stateful[256]; /* opaque data for stateful encoding */ } rb_transcoding; +#define TRANSCODING_READBUF(tc) \ + ((tc)->transcoder->max_input <= sizeof((tc)->readbuf.ary) ? \ + (tc)->readbuf.ary : \ + (tc)->readbuf.ptr) /* static structure, one per supported encoding pair */ typedef struct rb_transcoder { const char *from_encoding; const char *to_encoding; const BYTE_LOOKUP *conv_tree_start; - int from_unit_length; + int input_unit_length; + int max_input; int max_output; VALUE (*func_ii)(rb_transcoding*, VALUE); /* info -> info */ VALUE (*func_si)(rb_transcoding*, const unsigned char*, size_t); /* start -> info */