* enc/trans/newline.trans: new file.

* transcode_data.h (rb_trans_t): add last_tc field.

* transcode.c (UNIVERSAL_NEWLINE): defined.
  (CRLF_NEWLINE): defined.
  (CR_NEWLINE): defined.
  (rb_trans_open_by_transcoder_entries): initialize last_tc.
  (trans_open_i): allocate one more room for newline converter.
  (rb_trans_open): universal newline implemented.
  (more_output_buffer): take max_output argument instead of ts.
  (output_replacement_character): take tc argument instead of ts.
  (transcode_loop): use last_tc field.
  (econv_init): add flags argument for rb_trans_open.
  (Init_transcode): Encoding::Converter::UNIVERSAL_NEWLINE defined.



git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18556 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2008-08-13 05:30:42 +00:00
Родитель ab0ee1d5e9
Коммит 74a2a7bdbf
5 изменённых файлов: 137 добавлений и 23 удалений

Просмотреть файл

@ -1,3 +1,21 @@
Wed Aug 13 14:22:16 2008 Tanaka Akira <akr@fsij.org>
* enc/trans/newline.trans: new file.
* transcode_data.h (rb_trans_t): add last_tc field.
* transcode.c (UNIVERSAL_NEWLINE): defined.
(CRLF_NEWLINE): defined.
(CR_NEWLINE): defined.
(rb_trans_open_by_transcoder_entries): initialize last_tc.
(trans_open_i): allocate one more room for newline converter.
(rb_trans_open): universal newline implemented.
(more_output_buffer): take max_output argument instead of ts.
(output_replacement_character): take tc argument instead of ts.
(transcode_loop): use last_tc field.
(econv_init): add flags argument for rb_trans_open.
(Init_transcode): Encoding::Converter::UNIVERSAL_NEWLINE defined.
Wed Aug 13 14:00:19 2008 Nobuyoshi Nakada <nobu@ruby-lang.org> Wed Aug 13 14:00:19 2008 Nobuyoshi Nakada <nobu@ruby-lang.org>
* common.mk (parse.c): generates parse.h together. * common.mk (parse.c): generates parse.h together.

56
enc/trans/newline.trans Normal file
Просмотреть файл

@ -0,0 +1,56 @@
#include "transcode_data.h"
<%
# Build the action map for the newline converter: the whole byte range
# 00-ff is mapped to the :func_so action.
# NOTE(review): :func_so presumably makes the generated table call
# fun_so_universal_newline for each byte -- confirm against the table
# generator that processes this template.
# The node generated from this map is named "universal_newline"; the
# rb_universal_newline descriptor refers to it as &universal_newline.
map_normalize = {}
map_normalize["{00-ff}"] = :func_so
%>
<%= transcode_generate_node(ActionMap.parse(map_normalize), "universal_newline") %>
/*
 * Per-byte handler of the universal newline converter.
 *
 * Looks at the single input byte s[0] and stores at most one byte in o[0].
 * Returns the number of bytes stored (0 or 1).
 *
 * t->stateful[0] remembers whether the previous byte was '\r':
 *   0 : normal
 *   1 : just after '\r'
 *
 * '\r' always produces '\n'; a '\n' that directly follows '\r' produces
 * nothing, so "\r\n", "\r" and "\n" each end up as one '\n'.
 * The parameter l is not used.
 */
static int
fun_so_universal_newline(rb_transcoding* t, const unsigned char* s, size_t l, unsigned char* o)
{
    const unsigned char c = s[0];
    int written = 1;

    switch (c) {
      case '\n':
        if (t->stateful[0] == 0)
            o[0] = '\n';
        else
            written = 0;        /* tail of "\r\n": the '\n' was already emitted */
        t->stateful[0] = 0;
        break;

      case '\r':
        o[0] = '\n';
        t->stateful[0] = 1;     /* a following '\n' must be swallowed */
        break;

      default:
        o[0] = c;
        t->stateful[0] = 0;
        break;
    }

    return written;
}
/*
 * Transcoder descriptor for the universal newline converter; it is handed
 * to rb_register_transcoder() by Init_newline().
 * NOTE(review): the first two strings appear to be the source and
 * destination names ("universal_newline" -> "") -- confirm the field order
 * against the rb_transcoder declaration in transcode_data.h.
 */
static const rb_transcoder
rb_universal_newline = {
"universal_newline", "", &universal_newline, /* names, then the node generated by the template above */
1, /* input_unit_length */
1, /* max_input */
1, /* max_output */
/* NOTE(review): the three NULL slots are presumably unused optional
   callbacks and the last slot the per-byte output function -- confirm
   against rb_transcoder. */
NULL, NULL, NULL, fun_so_universal_newline
};
void
Init_newline(void)
{
rb_register_transcoder(&rb_universal_newline);
}

Просмотреть файл

@ -4,7 +4,7 @@ class TestEncodingConverter < Test::Unit::TestCase
def assert_econv(ret_expected, dst_expected, src_expected, to, from, src, opt={}) def assert_econv(ret_expected, dst_expected, src_expected, to, from, src, opt={})
opt[:obuf_len] ||= 100 opt[:obuf_len] ||= 100
src = src.dup src = src.dup
ec = Encoding::Converter.new(from, to) ec = Encoding::Converter.new(from, to, 0)
dst = '' dst = ''
while true while true
ret = ec.primitive_convert(src, dst2="", opt[:obuf_len], 0) ret = ec.primitive_convert(src, dst2="", opt[:obuf_len], 0)
@ -35,7 +35,7 @@ class TestEncodingConverter < Test::Unit::TestCase
end end
def test_errors def test_errors
ec = Encoding::Converter.new("UTF-16BE", "EUC-JP") ec = Encoding::Converter.new("UTF-16BE", "EUC-JP", 0)
src = "\xFF\xFE\x00A\xDC\x00" src = "\xFF\xFE\x00A\xDC\x00"
ret = ec.primitive_convert(src, dst="", 10, 0) ret = ec.primitive_convert(src, dst="", 10, 0)
assert_equal("", src) assert_equal("", src)
@ -50,4 +50,18 @@ class TestEncodingConverter < Test::Unit::TestCase
assert_equal("", dst) assert_equal("", dst)
assert_equal(:finished, ret) assert_equal(:finished, ret)
end end
# Feeds one converter a sequence of chunks with PARTIAL_INPUT and checks
# that "\r\n", "\n" and "\r" each come out as a single "\n", including a
# "\r\n" pair that is split across two consecutive chunks.
def test_universal_newline
  ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::UNIVERSAL_NEWLINE)
  # [input chunk, expected output]; order matters because the converter
  # keeps state between calls.
  steps = [
    ["abc\r\ndef", "abc\ndef"],
    ["ghi\njkl",   "ghi\njkl"],
    ["mno\rpqr",   "mno\npqr"],
    ["stu\r",      "stu\n"],
    ["\nvwx",      "vwx"],
  ]
  steps.each do |chunk, expected|
    src = chunk
    dst = ""
    ret = ec.primitive_convert(src, dst, 50, Encoding::Converter::PARTIAL_INPUT)
    assert_equal([:ibuf_empty, "", expected], [ret, src, dst])
  end
end
end end

Просмотреть файл

@ -25,7 +25,10 @@ static VALUE sym_invalid, sym_undef, sym_ignore, sym_replace;
#define INVALID_REPLACE 0x2 #define INVALID_REPLACE 0x2
#define UNDEF_IGNORE 0x10 #define UNDEF_IGNORE 0x10
#define UNDEF_REPLACE 0x20 #define UNDEF_REPLACE 0x20
#define PARTIAL_INPUT 0x100 #define PARTIAL_INPUT 0x100
#define UNIVERSAL_NEWLINE 0x200
#define CRLF_NEWLINE 0x400
#define CR_NEWLINE 0x800
/* /*
* Dispatch data and logic * Dispatch data and logic
@ -646,6 +649,7 @@ rb_trans_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
ts->num_trans = n; ts->num_trans = n;
ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans); ts->elems = ALLOC_N(rb_trans_elem_t, ts->num_trans);
ts->num_finished = 0; ts->num_finished = 0;
ts->last_tc = NULL;
for (i = 0; i < ts->num_trans; i++) { for (i = 0; i < ts->num_trans; i++) {
const rb_transcoder *tr = load_transcoder_entry(entries[i]); const rb_transcoder *tr = load_transcoder_entry(entries[i]);
ts->elems[i].from = tr->from_encoding; ts->elems[i].from = tr->from_encoding;
@ -657,6 +661,7 @@ rb_trans_open_by_transcoder_entries(int n, transcoder_entry_t **entries)
ts->elems[i].out_buf_end = NULL; ts->elems[i].out_buf_end = NULL;
ts->elems[i].last_result = transcode_ibuf_empty; ts->elems[i].last_result = transcode_ibuf_empty;
} }
ts->last_tc = ts->elems[ts->num_trans-1].tc;
for (i = 0; i < ts->num_trans-1; i++) { for (i = 0; i < ts->num_trans-1; i++) {
int bufsize = 4096; int bufsize = 4096;
@ -678,7 +683,7 @@ trans_open_i(const char *from, const char *to, int depth, void *arg)
transcoder_entry_t **entries; transcoder_entry_t **entries;
if (!*entries_ptr) { if (!*entries_ptr) {
entries = ALLOC_N(transcoder_entry_t *, depth+1); entries = ALLOC_N(transcoder_entry_t *, depth+1+1);
*entries_ptr = entries; *entries_ptr = entries;
} }
else { else {
@ -699,7 +704,19 @@ rb_trans_open(const char *from, const char *to, int flags)
if (num_trans < 0 || !entries) if (num_trans < 0 || !entries)
return NULL; return NULL;
if (flags & UNIVERSAL_NEWLINE) {
transcoder_entry_t *e = get_transcoder_entry("universal_newline", "");
if (!e)
return NULL;
entries[num_trans++] = e;
}
ts = rb_trans_open_by_transcoder_entries(num_trans, entries); ts = rb_trans_open_by_transcoder_entries(num_trans, entries);
if (flags & UNIVERSAL_NEWLINE) {
ts->last_tc = ts->elems[ts->num_trans-2].tc;
}
return ts; return ts;
} }
@ -840,13 +857,13 @@ static void
more_output_buffer( more_output_buffer(
VALUE destination, VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int), unsigned char *(*resize_destination)(VALUE, int, int),
rb_trans_t *ts, int max_output,
unsigned char **out_start_ptr, unsigned char **out_start_ptr,
unsigned char **out_pos, unsigned char **out_pos,
unsigned char **out_stop_ptr) unsigned char **out_stop_ptr)
{ {
size_t len = (*out_pos - *out_start_ptr); size_t len = (*out_pos - *out_start_ptr);
size_t new_len = (len + ts->elems[ts->num_trans-1].tc->transcoder->max_output) * 2; size_t new_len = (len + max_output) * 2;
*out_start_ptr = resize_destination(destination, len, new_len); *out_start_ptr = resize_destination(destination, len, new_len);
*out_pos = *out_start_ptr + len; *out_pos = *out_start_ptr + len;
*out_stop_ptr = *out_start_ptr + new_len; *out_stop_ptr = *out_start_ptr + new_len;
@ -856,20 +873,18 @@ static void
output_replacement_character( output_replacement_character(
VALUE destination, VALUE destination,
unsigned char *(*resize_destination)(VALUE, int, int), unsigned char *(*resize_destination)(VALUE, int, int),
rb_trans_t *ts, rb_transcoding *tc,
unsigned char **out_start_ptr, unsigned char **out_start_ptr,
unsigned char **out_pos, unsigned char **out_pos,
unsigned char **out_stop_ptr) unsigned char **out_stop_ptr)
{ {
rb_transcoding *tc;
const rb_transcoder *tr; const rb_transcoder *tr;
int max_output; int max_output;
rb_encoding *enc; rb_encoding *enc;
const char *replacement; const char *replacement;
int len; int len;
tc = ts->elems[ts->num_trans-1].tc;
tr = tc->transcoder; tr = tc->transcoder;
max_output = tr->max_output; max_output = tr->max_output;
enc = rb_enc_find(tr->to_encoding); enc = rb_enc_find(tr->to_encoding);
@ -893,12 +908,12 @@ output_replacement_character(
if (tr->resetstate_func) { if (tr->resetstate_func) {
if (*out_stop_ptr - *out_pos < max_output) if (*out_stop_ptr - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr); more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
*out_pos += tr->resetstate_func(tc, *out_pos); *out_pos += tr->resetstate_func(tc, *out_pos);
} }
if (*out_stop_ptr - *out_pos < max_output) if (*out_stop_ptr - *out_pos < max_output)
more_output_buffer(destination, resize_destination, ts, out_start_ptr, out_pos, out_stop_ptr); more_output_buffer(destination, resize_destination, max_output, out_start_ptr, out_pos, out_stop_ptr);
replacement = get_replacement_character(enc, &len); replacement = get_replacement_character(enc, &len);
@ -919,6 +934,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const int opt) const int opt)
{ {
rb_trans_t *ts; rb_trans_t *ts;
rb_transcoding *last_tc;
rb_trans_result_t ret; rb_trans_result_t ret;
unsigned char *out_start = *out_pos; unsigned char *out_start = *out_pos;
int max_output; int max_output;
@ -927,7 +943,8 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
if (!ts) if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output; last_tc = ts->last_tc;
max_output = last_tc->transcoder->max_output;
resume: resume:
ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt); ret = rb_trans_conv(ts, in_pos, in_stop, out_pos, out_stop, opt);
@ -938,7 +955,7 @@ resume:
goto resume; goto resume;
} }
else if (opt&INVALID_REPLACE) { else if (opt&INVALID_REPLACE) {
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
goto resume; goto resume;
} }
rb_trans_close(ts); rb_trans_close(ts);
@ -952,14 +969,14 @@ resume:
goto resume; goto resume;
} }
else if (opt&UNDEF_REPLACE) { else if (opt&UNDEF_REPLACE) {
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
goto resume; goto resume;
} }
rb_trans_close(ts); rb_trans_close(ts);
rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)"); rb_raise(rb_eConversionUndefined, "conversion undefined for byte sequence (maybe invalid byte sequence)");
} }
if (ret == transcode_obuf_full) { if (ret == transcode_obuf_full) {
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
goto resume; goto resume;
} }
@ -978,6 +995,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
const int opt) const int opt)
{ {
rb_trans_t *ts; rb_trans_t *ts;
rb_transcoding *last_tc;
rb_trans_result_t ret; rb_trans_result_t ret;
unsigned char *out_start = *out_pos; unsigned char *out_start = *out_pos;
const unsigned char *ptr; const unsigned char *ptr;
@ -987,6 +1005,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
if (!ts) if (!ts)
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding);
last_tc = ts->last_tc;
max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output; max_output = ts->elems[ts->num_trans-1].tc->transcoder->max_output;
ret = transcode_ibuf_empty; ret = transcode_ibuf_empty;
@ -1017,7 +1036,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break; break;
} }
else if (opt&INVALID_REPLACE) { else if (opt&INVALID_REPLACE) {
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
break; break;
} }
rb_trans_close(ts); rb_trans_close(ts);
@ -1032,7 +1051,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break; break;
} }
else if (opt&UNDEF_REPLACE) { else if (opt&UNDEF_REPLACE) {
output_replacement_character(destination, resize_destination, ts, &out_start, out_pos, &out_stop); output_replacement_character(destination, resize_destination, last_tc, &out_start, out_pos, &out_stop);
break; break;
} }
rb_trans_close(ts); rb_trans_close(ts);
@ -1040,7 +1059,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos,
break; break;
case transcode_obuf_full: case transcode_obuf_full:
more_output_buffer(destination, resize_destination, ts, &out_start, out_pos, &out_stop); more_output_buffer(destination, resize_destination, max_output, &out_start, out_pos, &out_stop);
break; break;
case transcode_ibuf_empty: case transcode_ibuf_empty:
@ -1261,19 +1280,24 @@ econv_s_allocate(VALUE klass)
} }
static VALUE static VALUE
econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding) econv_init(VALUE self, VALUE from_encoding, VALUE to_encoding, VALUE flags_v)
{ {
const char *from_e, *to_e; const char *from_e, *to_e;
rb_trans_t *ts; rb_trans_t *ts;
int flags;
from_e = StringValueCStr(from_encoding); StringValue(from_encoding);
to_e = StringValueCStr(to_encoding); StringValue(to_encoding);
flags = NUM2INT(flags_v);
from_e = RSTRING_PTR(from_encoding);
to_e = RSTRING_PTR(to_encoding);
if (DATA_PTR(self)) { if (DATA_PTR(self)) {
rb_raise(rb_eTypeError, "already initialized"); rb_raise(rb_eTypeError, "already initialized");
} }
ts = rb_trans_open(from_e, to_e, 0); ts = rb_trans_open(from_e, to_e, flags);
if (!ts) { if (!ts) {
rb_raise(rb_eArgError, "encoding convewrter not supported (from %s to %s)", from_e, to_e); rb_raise(rb_eArgError, "encoding convewrter not supported (from %s to %s)", from_e, to_e);
} }
@ -1363,8 +1387,9 @@ Init_transcode(void)
rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData); rb_cEncodingConverter = rb_define_class_under(rb_cEncoding, "Converter", rb_cData);
rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate); rb_define_alloc_func(rb_cEncodingConverter, econv_s_allocate);
rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 2); rb_define_method(rb_cEncodingConverter, "initialize", econv_init, 3);
rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, 4); rb_define_method(rb_cEncodingConverter, "primitive_convert", econv_primitive_convert, 4);
rb_define_method(rb_cEncodingConverter, "max_output", econv_max_output, 0); rb_define_method(rb_cEncodingConverter, "max_output", econv_max_output, 0);
rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(PARTIAL_INPUT)); rb_define_const(rb_cEncodingConverter, "PARTIAL_INPUT", INT2FIX(PARTIAL_INPUT));
rb_define_const(rb_cEncodingConverter, "UNIVERSAL_NEWLINE", INT2FIX(UNIVERSAL_NEWLINE));
} }

Просмотреть файл

@ -122,6 +122,7 @@ typedef struct {
rb_trans_elem_t *elems; rb_trans_elem_t *elems;
int num_trans; int num_trans;
int num_finished; int num_finished;
rb_transcoding *last_tc;
} rb_trans_t; } rb_trans_t;
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib); void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);