* transcode_data.h (rb_trans_result_t): new type.

  (rb_trans_elem_t): new type.
  (rb_trans_t): new type.

* transcode.c (transcode_dispatch_cb): removed.
  (transcode_dispatch): removed.
  (rb_transcoding_result_t): moved to rb_trans_result_t in
  transcode_data.h.
  (transcode_restartable0): goto follow_info when FUNsi.
  (rb_transcoding_open): use get_transcoder_entry.
  (rb_trans_open): new function.
  (rb_trans_conv): ditto.
  (rb_trans_close): ditto.
  (trans_open_i): ditto.
  (trans_sweep): ditto.
  (more_output_buffer): take rb_trans_t instead of rb_transcoding as
  an argument.
  (transcode_loop): take from_encoding and to_encoding instead of tr
  as arguments.  use rb_trans_open/rb_trans_conv/rb_trans_close.
  (str_transcode): don't use transcode_dispatch.



git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18498 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-08-11 15:50:42 +00:00
Родитель 96765d153f
Коммит a2901a7c75
4 изменённых файлов: 297 добавлений и 127 удалений

Просмотреть файл

@ -1,3 +1,26 @@
Tue Aug 12 00:43:44 2008 Tanaka Akira <akr@fsij.org>
* transcode_data.h (rb_trans_result_t): new type.
(rb_trans_elem_t): new type.
(rb_trans_t): new type.
* transcode.c (transcode_dispatch_cb): removed.
(transcode_dispatch): removed.
(rb_transcoding_result_t): moved to rb_trans_result_t in
transcode_data.h.
(transcode_restartable0): goto follow_info when FUNsi.
(rb_transcoding_open): use get_transcoder_entry.
(rb_trans_open): new function.
(rb_trans_conv): ditto.
(rb_trans_close): ditto.
(trans_open_i): ditto.
(trans_sweep): ditto.
(more_output_buffer): take rb_trans_t instead of rb_transcoding as
an argument.
(transcode_loop): take from_encoding and to_encoding instead of tr
as arguments. use rb_trans_open/rb_trans_conv/rb_trans_close.
(str_transcode): don't use transcode_dispatch.
Mon Aug 11 20:37:05 2008 Yukihiro Matsumoto <matz@ruby-lang.org>
* iseq.c (rb_iseq_clone): should preserve cref_stack link.

Просмотреть файл

@ -357,6 +357,10 @@ class TestTranscode < Test::Unit::TestCase
assert_raise(RuntimeError) { "\u9299".encode("iso-2022-jp") }
assert_raise(RuntimeError) { "\uff71\uff72\uff73\uff74\uff75".encode("iso-2022-jp") }
assert_raise(RuntimeError) { "\x1b(I12345\x1b(B".encode("utf-8", "iso-2022-jp") }
assert_equal("\xA1\xA1".force_encoding("euc-jp"),
"\e$B!!\e(B".encode("EUC-JP", "ISO-2022-JP"))
assert_equal("\e$B!!\e(B".force_encoding("ISO-2022-JP"),
"\xA1\xA1".encode("ISO-2022-JP", "EUC-JP"))
end
def test_iso_2022_jp_1

Просмотреть файл

@ -255,47 +255,6 @@ load_transcoder(transcoder_entry_t *entry)
return NULL;
}
/*
 * Path-search callback used by transcode_dispatch().
 *
 * arg points at a `const rb_transcoder *` slot.  A NULL slot means an
 * earlier step already failed, so the call is a no-op.  Otherwise the
 * entry for (from, to) is looked up and loaded on demand; if it cannot
 * be resolved the slot is cleared, and if this is the step at depth 0
 * the slot receives that step's transcoder.
 */
static void
transcode_dispatch_cb(const char *from, const char *to, int depth, void *arg)
{
    const rb_transcoder **slot = (const rb_transcoder **)arg;
    transcoder_entry_t *step;

    /* a previous step has already marked the whole path as unusable */
    if (!*slot)
        return;

    step = get_transcoder_entry(from, to);

    /* only attempt to load when a library is declared for the entry */
    if (step && !step->transcoder && step->lib)
        load_transcoder(step);

    if (!step || !step->transcoder) {
        *slot = NULL;
    }
    else if (depth == 0) {
        *slot = step->transcoder;
    }
}
static const rb_transcoder *
transcode_dispatch(const char *from_encoding, const char *to_encoding)
{
const rb_transcoder *first_transcoder = (rb_transcoder *)1;
if (transcode_search_path(from_encoding, to_encoding, transcode_dispatch_cb, (void *)&first_transcoder)) {
return first_transcoder;
}
return NULL;
}
static void
output_replacement_character(unsigned char **out_pp, rb_encoding *enc)
{
@ -364,15 +323,7 @@ transcode_char_start(rb_transcoding *tc,
return ptr;
}
typedef enum {
transcode_invalid_input,
transcode_undefined_conversion,
transcode_obuf_full,
transcode_ibuf_empty,
transcode_finished,
} rb_transcoding_result_t;
static rb_transcoding_result_t
static rb_trans_result_t
transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
const unsigned char *in_stop, unsigned char *out_stop,
rb_transcoding *tc,
@ -517,7 +468,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
size_t char_len;
char_start = transcode_char_start(tc, *in_pos, inchar_start, in_p, &char_len);
next_info = (VALUE)(*tr->func_si)(tc, char_start, (size_t)char_len);
break;
goto follow_info;
}
case FUNio:
while (out_stop - out_p < tr->max_output) { SUSPEND(transcode_obuf_full, 13); }
@ -579,7 +530,7 @@ transcode_restartable0(const unsigned char **in_pos, unsigned char **out_pos,
#undef SUSPEND
}
static rb_transcoding_result_t
static rb_trans_result_t
transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
const unsigned char *in_stop, unsigned char *out_stop,
rb_transcoding *tc,
@ -589,7 +540,7 @@ transcode_restartable(const unsigned char **in_pos, unsigned char **out_pos,
unsigned char *readagain_buf = ALLOCA_N(unsigned char, tc->readagain_len);
const unsigned char *readagain_pos = readagain_buf;
const unsigned char *readagain_stop = readagain_buf + tc->readagain_len;
rb_transcoding_result_t res;
rb_trans_result_t res;
MEMCPY(readagain_buf, TRANSCODING_READBUF(tc) + tc->recognized_len,
unsigned char, tc->readagain_len);
@ -611,21 +562,12 @@ rb_transcoding_open(const char *from, const char *to, int flags)
rb_transcoding *tc;
const rb_transcoder *tr;
st_data_t val;
st_table *table2;
transcoder_entry_t *entry;
/* xxx: support multistep conversion */
if (!st_lookup(transcoder_table, (st_data_t)from, &val)) {
entry = get_transcoder_entry(from, to);
if (!entry)
return NULL;
}
table2 = (st_table *)val;
if (!st_lookup(table2, (st_data_t)to, &val)) {
return NULL;
}
entry = (transcoder_entry_t *)val;
tr = load_transcoder(entry);
if (!tr)
return NULL;
@ -643,7 +585,7 @@ rb_transcoding_open(const char *from, const char *to, int flags)
return tc;
}
static rb_transcoding_result_t
static rb_trans_result_t
rb_transcoding_convert(rb_transcoding *tc,
const unsigned char **input_ptr, const unsigned char *input_stop,
unsigned char **output_ptr, unsigned char *output_stop,
@ -664,17 +606,199 @@ rb_transcoding_close(rb_transcoding *tc)
xfree(tc);
}
static void
trans_open_i(const char *from, const char *to, int depth, void *arg)
{
rb_trans_t **tsp = (rb_trans_t **)arg;
rb_trans_t *ts;
int i;
if (!*tsp) {
ts = *tsp = ALLOC(rb_trans_t);
ts->num_trans = depth+1;
ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans);
ts->num_finished = 0;
for (i = 0; i < ts->num_trans; i++) {
ts->elems[i].tc = NULL;
ts->elems[i].out_buf_start = NULL;
ts->elems[i].out_data_start = NULL;
ts->elems[i].out_data_end = NULL;
ts->elems[i].out_buf_end = NULL;
ts->elems[i].last_result = transcode_ibuf_empty;
}
}
else {
ts = *tsp;
}
ts->elems[depth].tc = rb_transcoding_open(from, to, 0);
if (depth < ts->num_trans-1) {
int bufsize = 4096;
unsigned char *p;
p = xmalloc(bufsize);
ts->elems[depth].out_buf_start = p;
ts->elems[depth].out_buf_end = p + bufsize;
ts->elems[depth].out_data_start = p;
ts->elems[depth].out_data_end = p;
}
}
static rb_trans_t *
rb_trans_open(const char *from, const char *to, int flags)
{
rb_trans_t *ts = NULL;
transcode_search_path(from, to, trans_open_i, (void *)&ts);
if (!ts)
return NULL;
return ts;
}
/*
 * Run the transcoders of the chain from index `start` to the last one,
 * repeating the whole pass for as long as at least one step consumed
 * input or produced output during the previous pass.
 *
 * Step 0 reads from the caller's input range; every later step reads the
 * pending output of the step before it.  A step without an intermediate
 * buffer writes to the caller's output range; the others write into
 * their own buffer.
 *
 * Returns the index of the step that reported transcode_invalid_input or
 * transcode_undefined_conversion, or -1 once a pass makes no progress.
 * The result of each step is also recorded in its last_result field.
 */
static int
trans_sweep(rb_trans_t *ts,
const unsigned char **input_ptr, const unsigned char *input_stop,
unsigned char **output_ptr, unsigned char *output_stop,
int flags,
int start)
{
int try;
int i, f;
const unsigned char **ipp, *is, *iold;
unsigned char **opp, *os, *oold;
rb_trans_result_t res;
try = 1;
while (try) {
try = 0;
for (i = start; i < ts->num_trans; i++) {
rb_trans_elem_t *te = &ts->elems[i];
/* select the input range for this step */
if (i == 0) {
ipp = input_ptr;
is = input_stop;
}
else {
rb_trans_elem_t *prev_te = &ts->elems[i-1];
ipp = (const unsigned char **)&prev_te->out_data_start;
is = prev_te->out_data_end;
}
/* select the output range for this step */
if (!te->out_buf_start) {
opp = output_ptr;
os = output_stop;
}
else {
/* move still-unconsumed data to the front of the buffer to make room at the end */
if (te->out_buf_start != te->out_data_start) {
int len = te->out_data_end - te->out_data_start;
int off = te->out_data_start - te->out_buf_start;
MEMMOVE(te->out_buf_start, te->out_data_start, unsigned char, len);
te->out_data_start = te->out_buf_start;
te->out_data_end -= off;
}
opp = &te->out_data_end;
os = te->out_buf_end;
}
f = flags;
/* PARTIAL_INPUT is added unless num_finished equals this step's index
 * (presumably: unless every earlier step has finished -- confirm) */
if (ts->num_finished != i)
f |= PARTIAL_INPUT;
iold = *ipp;
oold = *opp;
te->last_result = res = rb_transcoding_convert(te->tc, ipp, is, opp, os, f);
/* any movement of either pointer counts as progress and schedules another pass */
if (iold != *ipp || oold != *opp)
try = 1;
switch (res) {
case transcode_invalid_input:
case transcode_undefined_conversion:
return i;
case transcode_obuf_full:
case transcode_ibuf_empty:
break;
case transcode_finished:
ts->num_finished = i+1;
break;
}
}
}
return -1;
}
/*
 * Convert through the whole chain `ts`.
 *
 * input_ptr / output_ptr may be NULL; each is then replaced by an empty
 * range located on a local dummy byte.
 *
 * If an intermediate step (any step but the last) recorded an error
 * result on a previous call, sweeping starts at the step after it.
 * trans_sweep() is then repeated, each time starting after the step
 * that just reported an error, until it returns -1 or the error comes
 * from the last step.
 *
 * Returns the last step's last_result when the last step reported the
 * error or when the final sweep started at step 0; otherwise the
 * last_result of the step just before the final sweep's start, i.e. the
 * intermediate step that reported the error.
 */
static rb_trans_result_t
rb_trans_conv(rb_trans_t *ts,
const unsigned char **input_ptr, const unsigned char *input_stop,
unsigned char **output_ptr, unsigned char *output_stop,
int flags)
{
int i;
int start, err_index;
/* only the address of empty_buf is used; its value is never read here */
unsigned char empty_buf;
unsigned char *empty_ptr = &empty_buf;
if (!input_ptr) {
input_ptr = (const unsigned char **)&empty_ptr;
input_stop = empty_ptr;
}
if (!output_ptr) {
output_ptr = &empty_ptr;
output_stop = empty_ptr;
}
/* look, from the second-to-last step downwards, for a step whose previous result was an error */
err_index = -1;
for (i = ts->num_trans-2; 0 <= i; i--) {
if (ts->elems[i].last_result == transcode_invalid_input ||
ts->elems[i].last_result == transcode_undefined_conversion) {
err_index = i;
break;
}
}
do {
start = err_index + 1;
err_index = trans_sweep(ts, input_ptr, input_stop, output_ptr, output_stop, flags, start);
} while (err_index != -1 && err_index != ts->num_trans-1);
if (err_index == ts->num_trans-1)
return ts->elems[ts->num_trans-1].last_result;
else if (start == 0)
return ts->elems[ts->num_trans-1].last_result;
else
return ts->elems[start-1].last_result;
}
/*
 * Release a transcoding chain: close the transcoding of every step,
 * free each intermediate buffer that was allocated, then free the step
 * array and the chain itself.
 */
static void
rb_trans_close(rb_trans_t *ts)
{
    rb_trans_elem_t *step = ts->elems;
    int remaining = ts->num_trans;

    while (remaining-- > 0) {
        rb_transcoding_close(step->tc);
        if (step->out_buf_start)
            xfree(step->out_buf_start);
        step++;
    }

    xfree(ts->elems);
    xfree(ts);
}
static void
more_output_buffer(
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
rb_transcoding *tc,
rb_trans_t *ts,
unsigned char **out_start_ptr,
unsigned char **out_pos,
unsigned char **out_stop_ptr)
{
size_t len = (*out_pos - *out_start_ptr);
size_t new_len = (len + tc->transcoder->max_output) * 2;
size_t new_len = (len + ts->elems[ts->num_trans-1].tc->transcoder->max_output) * 2;
*out_start_ptr = resize_destination(destination, len, new_len);
*out_pos = *out_start_ptr + len;
*out_stop_ptr = *out_start_ptr + new_len;
@ -686,17 +810,23 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const unsigned char *in_stop, unsigned char *out_stop,
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
const rb_transcoder *tr,
const char *from_encoding,
const char *to_encoding,
const int opt)
{
rb_transcoding *tc;
rb_transcoding_result_t ret;
rb_trans_t *ts;
rb_trans_result_t ret;
unsigned char *out_start = *out_pos;
int max_output;
tc = rb_transcoding_open(tr->from_encoding, tr->to_encoding, 0);
ts = rb_trans_open(from_encoding, to_encoding, 0);
if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
resume:
ret = rb_transcoding_convert(tc, in_pos, in_stop, out_pos, out_stop, opt);
ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt);
if (ret == transcode_invalid_input) {
/* deal with invalid byte sequence */
/* todo: add more alternative behaviors */
@ -704,12 +834,12 @@ resume:
goto resume;
}
else if (opt&INVALID_REPLACE) {
if (out_stop - *out_pos < tr->max_output)
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(tr->to_encoding));
if (out_stop - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(to_encoding));
goto resume;
}
rb_transcoding_close(tc);
rb_trans_close(ts);
rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
}
if (ret == transcode_undefined_conversion) {
@ -720,20 +850,20 @@ resume:
goto resume;
}
else if (opt&UNDEF_REPLACE) {
if (out_stop - *out_pos < tr->max_output)
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(tr->to_encoding));
if (out_stop - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(to_encoding));
goto resume;
}
rb_transcoding_close(tc);
rb_trans_close(ts);
rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
}
if (ret == transcode_obuf_full) {
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
goto resume;
}
rb_transcoding_close(tc);
rb_trans_close(ts);
return;
}
#else
@ -743,15 +873,21 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const unsigned char *in_stop, unsigned char *out_stop,
VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int),
const rb_transcoder *tr,
const char *from_encoding,
const char *to_encoding,
const int opt)
{
rb_transcoding *tc;
rb_transcoding_result_t ret;
rb_trans_t *ts;
rb_trans_result_t ret;
unsigned char *out_start = *out_pos;
const unsigned char *ptr;
int max_output;
tc = rb_transcoding_open(tr->from_encoding, tr->to_encoding, 0);
ts = rb_trans_open(from_encoding, to_encoding, 0);
if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
ret = transcode_ibuf_empty;
ptr = *in_pos;
@ -762,14 +898,14 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
if (ret == transcode_ibuf_empty) {
if (ptr < in_stop) {
input_byte = *ptr;
ret = rb_transcoding_convert(tc, &p, p+1, out_pos, out_stop, PARTIAL_INPUT);
ret = rb_trans_conv(ts, &p, p+1, out_pos, out_stop, PARTIAL_INPUT);
}
else {
ret = rb_transcoding_convert(tc, NULL, NULL, out_pos, out_stop, 0);
ret = rb_trans_conv(ts, NULL, NULL, out_pos, out_stop, 0);
}
}
else {
ret = rb_transcoding_convert(tc, NULL, NULL, out_pos, out_stop, PARTIAL_INPUT);
ret = rb_trans_conv(ts, NULL, NULL, out_pos, out_stop, PARTIAL_INPUT);
}
if (&input_byte != p)
ptr += p - &input_byte;
@ -781,12 +917,12 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&INVALID_REPLACE) {
if (out_stop - *out_pos < tr->max_output)
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(tr->to_encoding));
if (out_stop - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(to_encoding));
break;
}
rb_transcoding_close(tc);
rb_trans_close(ts);
rb_raise(TRANSCODE_ERROR, "invalid byte sequence");
break;
@ -798,17 +934,17 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
else if (opt&UNDEF_REPLACE) {
if (out_stop - *out_pos < tr->max_output)
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(tr->to_encoding));
if (out_stop - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
output_replacement_character(out_pos, rb_enc_find(to_encoding));
break;
}
rb_transcoding_close(tc);
rb_trans_close(ts);
rb_raise(TRANSCODE_ERROR, "conversion undefined for byte sequence (maybe invalid byte sequence)");
break;
case transcode_obuf_full:
more_output_buffer(destination, resize_destination, tc, &out_start, out_pos, &out_stop);
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop);
break;
case transcode_ibuf_empty:
@ -818,7 +954,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break;
}
}
rb_transcoding_close(tc);
rb_trans_close(ts);
*in_pos = in_stop;
return;
}
@ -848,8 +984,6 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
const char *from_e, *to_e;
int from_encidx, to_encidx;
VALUE from_encval, to_encval;
const rb_transcoder *tr;
int final_encoding = 0;
VALUE opt;
int options = 0;
@ -922,19 +1056,13 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
return -1;
}
do { /* loop for multistep transcoding */
/* later, maybe use smaller intermediate strings for very long strings */
if (!(tr = transcode_dispatch(from_e, to_e))) {
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
}
fromp = sp = (unsigned char *)RSTRING_PTR(str);
slen = RSTRING_LEN(str);
blen = slen + 30; /* len + margin */
dest = rb_str_tmp_new(blen);
bp = (unsigned char *)RSTRING_PTR(dest);
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, tr, options);
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), dest, str_transcoding_resize, from_e, to_e, options);
if (fromp != sp+slen) {
rb_raise(rb_eArgError, "not fully converted, %"PRIdPTRDIFF" bytes left", sp+slen-fromp);
}
@ -942,14 +1070,6 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
*bp = '\0';
rb_str_set_len(dest, bp - buf);
if (encoding_equal(tr->to_encoding, to_e)) {
final_encoding = 1;
}
else {
from_e = tr->to_encoding;
str = dest;
}
} while (!final_encoding);
/* set encoding */
if (!to_enc) {
to_encidx = rb_define_dummy_encoding(to_e);

Просмотреть файл

@ -96,6 +96,29 @@ typedef struct rb_transcoder {
int (*finish_func)(rb_transcoding*, unsigned char*); /* -> output */
} rb_transcoder;
typedef enum {
transcode_invalid_input,
transcode_undefined_conversion,
transcode_obuf_full,
transcode_ibuf_empty,
transcode_finished,
} rb_trans_result_t;
/* One step of a multi-step transcoding chain. */
typedef struct {
/* transcoding state of this step */
rb_transcoding *tc;
/* start of this step's intermediate output buffer; NULL when the step has none
 * and writes directly to the caller's output */
unsigned char *out_buf_start;
/* start of the buffered output not yet consumed by the next step */
unsigned char *out_data_start;
/* end of the buffered output; new output is appended here */
unsigned char *out_data_end;
/* end of the intermediate output buffer */
unsigned char *out_buf_end;
/* result of the most recent conversion call for this step */
rb_trans_result_t last_result;
} rb_trans_elem_t;
/* A chain of transcoding steps that together convert one encoding to another. */
typedef struct {
/* array of num_trans steps, in conversion order */
rb_trans_elem_t *elems;
/* number of steps in elems */
int num_trans;
/* set to i+1 when step i reports transcode_finished */
int num_finished;
} rb_trans_t;
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);
void rb_register_transcoder(const rb_transcoder *);