From c0d3881e0e67b2fe9eeaa98c6d912ee15ea430b8 Mon Sep 17 00:00:00 2001 From: akr Date: Fri, 22 Aug 2008 16:44:00 +0000 Subject: [PATCH] * include/ruby/io.h (FMODE_TEXTMODE): defined. * include/ruby/encoding.h (rb_econv_t): new field: flags. (rb_econv_binmode): declared. * io.c (io_unread): text mode hack removed. (NEED_NEWLINE_DECODER): defined. (NEED_NEWLINE_ENCODER): defined. (NEED_READCONV): defined. (NEED_WRITECONV): defined. (TEXTMODE_NEWLINE_ENCODER): defined for windows. (make_writeconv): setup converter with TEXTMODE_NEWLINE_ENCODER for text mode. (io_fwrite): use NEED_WRITECONV. character code conversion is disabled if fptr->writeconv_stateless is nil. (make_readconv): setup converter with ECONV_UNIVERSAL_NEWLINE_DECODER for text mode. (read_all): use NEED_READCONV. (appendline): use NEED_READCONV. (rb_io_getline_1): use NEED_READCONV. (io_getc): use NEED_READCONV. (rb_io_ungetc): use NEED_READCONV. (rb_io_binmode): OS-level text mode test removed. call rb_econv_binmode. (rb_io_binmode_m): call rb_io_binmode with write_io as well. (rb_io_flags_mode): return mode string including "t". (rb_io_mode_flags): detect "t" for text mode. (rb_sysopen): always specify O_BINARY. * transcode.c (rb_econv_open_by_transcoder_entries): initialize flags. (rb_econv_open): if source and destination encodings are both empty strings, open newline converter. last_tc will be NULL in this case. (rb_econv_encoding_to_insert_output): last_tc may be NULL now. (rb_econv_string): ditto. (output_replacement_character): ditto. (transcode_loop): ditto. (econv_init): ditto. (econv_inspect): ditto. (rb_econv_binmode): new function. 
git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@18780 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 43 ++++++++++ enc/trans/newline.trans | 2 +- include/ruby/encoding.h | 3 + include/ruby/io.h | 1 + io.c | 139 ++++++++++++++++++------------ test/ruby/test_econv.rb | 21 +++++ test/ruby/test_io_m17n.rb | 174 ++++++++++++++++++++++++++++++++++++++ transcode.c | 117 +++++++++++++++++++------ 8 files changed, 420 insertions(+), 80 deletions(-) diff --git a/ChangeLog b/ChangeLog index 9fe566865d..af9ae4a6d7 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,46 @@ +Sat Aug 23 01:42:22 2008 Tanaka Akira + + * include/ruby/io.h (FMODE_TEXTMODE): defined. + + * include/ruby/encoding.h (rb_econv_t): new field: flags. + (rb_econv_binmode): declared. + + * io.c (io_unread): text mode hack removed. + (NEED_NEWLINE_DECODER): defined. + (NEED_NEWLINE_ENCODER): defined. + (NEED_READCONV): defined. + (NEED_WRITECONV): defined. + (TEXTMODE_NEWLINE_ENCODER): defined for windows. + (make_writeconv): setup converter with TEXTMODE_NEWLINE_ENCODER for + text mode. + (io_fwrite): use NEED_WRITECONV. character code conversion is + disabled if fptr->writeconv_stateless is nil. + (make_readconv): setup converter with + ECONV_UNIVERSAL_NEWLINE_DECODER for text mode. + (read_all): use NEED_READCONV. + (appendline): use NEED_READCONV. + (rb_io_getline_1): use NEED_READCONV. + (io_getc): use NEED_READCONV. + (rb_io_ungetc): use NEED_READCONV. + (rb_io_binmode): OS-level text mode test removed. call + rb_econv_binmode. + (rb_io_binmode_m): call rb_io_binmode with write_io as well. + (rb_io_flags_mode): return mode string including "t". + (rb_io_mode_flags): detect "t" for text mode. + (rb_sysopen): always specify O_BINARY. + + * transcode.c (rb_econv_open_by_transcoder_entries): initialize flags. + (rb_econv_open): if source and destination encodings are + both empty strings, open newline converter. last_tc will be NULL in + this case. 
+ (rb_econv_encoding_to_insert_output): last_tc may be NULL now. + (rb_econv_string): ditto. + (output_replacement_character): ditto. + (transcode_loop): ditto. + (econv_init): ditto. + (econv_inspect): ditto. + (rb_econv_binmode): new function. + Fri Aug 22 21:18:40 2008 Tadayoshi Funaba * complex.c (nucomp_div): now behaves as quo. diff --git a/enc/trans/newline.trans b/enc/trans/newline.trans index 5fbf3f6ed9..409da1dc33 100644 --- a/enc/trans/newline.trans +++ b/enc/trans/newline.trans @@ -44,7 +44,7 @@ rb_universal_newline = { 1, /* input_unit_length */ 1, /* max_input */ 1, /* max_output */ - stateless_converter, /* stateful_type */ + stateful_decoder, /* stateful_type */ NULL, NULL, NULL, fun_so_universal_newline }; diff --git a/include/ruby/encoding.h b/include/ruby/encoding.h index 6c443d6f0d..03aac871fe 100644 --- a/include/ruby/encoding.h +++ b/include/ruby/encoding.h @@ -217,6 +217,7 @@ typedef struct { } rb_econv_elem_t; typedef struct { + int flags; const char *source_encoding_name; const char *destination_encoding_name; @@ -273,6 +274,8 @@ const char *rb_econv_stateless_encoding(const char *stateful_enc); VALUE rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int flags); +void rb_econv_binmode(rb_econv_t *ec); + /* flags for rb_econv_open */ #define ECONV_UNIVERSAL_NEWLINE_DECODER 0x100 #define ECONV_CRLF_NEWLINE_ENCODER 0x200 diff --git a/include/ruby/io.h b/include/ruby/io.h index 7312f4e9ab..d42cadc058 100644 --- a/include/ruby/io.h +++ b/include/ruby/io.h @@ -85,6 +85,7 @@ typedef struct rb_io_t { #define FMODE_WSPLIT 0x0200 #define FMODE_WSPLIT_INITIALIZED 0x0400 #define FMODE_TRUNC 0x0800 +#define FMODE_TEXTMODE 0x1000 /* #define FMODE_PREP 0x10000 */ #define GetOpenFile(obj,fp) rb_io_check_closed((fp) = RFILE(rb_io_taint_check(obj))->fptr) diff --git a/io.c b/io.c index 3ce2d2a21e..d4792c8c35 100644 --- a/io.c +++ b/io.c @@ -300,17 +300,6 @@ io_unread(rb_io_t *fptr) if (fptr->rbuf_len == 0 || fptr->mode & FMODE_DUPLEX) 
return; /* xxx: target position may be negative if buffer is filled by ungetc */ -#if defined(_WIN32) || defined(DJGPP) || defined(__CYGWIN__) || defined(__human68k__) || defined(__EMX__) - if (!(fptr->mode & FMODE_BINMODE)) { - int len = fptr->rbuf_len; - while (fptr->rbuf_len-- > 0) { - if (fptr->rbuf[fptr->rbuf_len] == '\n') - ++len; - } - r = lseek(fptr->fd, -len, SEEK_CUR); - } - else -#endif r = lseek(fptr->fd, -fptr->rbuf_len, SEEK_CUR); if (r < 0) { if (errno == ESPIPE) @@ -681,12 +670,41 @@ rb_io_wait_writable(int f) } } +/* xxx: better way to determine the newline of the platform? */ +#if defined(O_BINARY) && O_BINARY != 0 +/* Windows */ +# define NEED_NEWLINE_DECODER(fptr) (!(fptr->mode & FMODE_BINMODE)) +# define NEED_NEWLINE_ENCODER(fptr) (!(fptr->mode & FMODE_BINMODE)) +# define TEXTMODE_NEWLINE_ENCODER ECONV_CRLF_NEWLINE_ENCODER +#else +/* Unix */ +# define NEED_NEWLINE_DECODER(fptr) (fptr->mode & FMODE_TEXTMODE) +# define NEED_NEWLINE_ENCODER(fptr) 0 +#endif +#define NEED_READCONV(fptr) (fptr->enc2 != NULL || NEED_NEWLINE_DECODER(fptr)) +#define NEED_WRITECONV(fptr) (fptr->enc != NULL || NEED_NEWLINE_ENCODER(fptr)) + static void make_writeconv(rb_io_t *fptr) { if (!fptr->writeconv_initialized) { const char *senc, *denc; rb_encoding *enc; + int ecflags; + + fptr->writeconv_initialized = 1; + + ecflags = 0; +#ifdef TEXTMODE_NEWLINE_ENCODER + if (NEED_NEWLINE_ENCODER(fptr)) + ecflags |= TEXTMODE_NEWLINE_ENCODER; + + if (!fptr->enc) { + fptr->writeconv = rb_econv_open("", "", ecflags); + fptr->writeconv_stateless = Qnil; + return; + } +#endif enc = fptr->enc2 ? 
fptr->enc2 : fptr->enc; senc = rb_econv_stateless_encoding(enc->name); @@ -699,14 +717,13 @@ make_writeconv(rb_io_t *fptr) fptr->writeconv_stateless = Qnil; } if (senc) { - fptr->writeconv = rb_econv_open(senc, denc, 0); + fptr->writeconv = rb_econv_open(senc, denc, ecflags); if (!fptr->writeconv) rb_raise(rb_eIOError, "code converter open failed (%s to %s)", senc, denc); } else { fptr->writeconv = NULL; } - fptr->writeconv_initialized = 1; } } @@ -716,14 +733,12 @@ io_fwrite(VALUE str, rb_io_t *fptr) { long len, n, r, l, offset = 0; - /* - * If an external encoding was specified and it differs from - * the strings encoding then we must transcode before writing. - */ - if (fptr->enc) { + if (NEED_WRITECONV(fptr)) { make_writeconv(fptr); if (fptr->writeconv) { - str = rb_str_transcode(str, fptr->writeconv_stateless); + if (!NIL_P(fptr->writeconv_stateless)) { + str = rb_str_transcode(str, fptr->writeconv_stateless); + } str = rb_econv_string(fptr->writeconv, str, 0, RSTRING_LEN(str), Qnil, ECONV_PARTIAL_INPUT); } else { @@ -1411,9 +1426,20 @@ static void make_readconv(rb_io_t *fptr) { if (!fptr->readconv) { - fptr->readconv = rb_econv_open(fptr->enc2->name, fptr->enc->name, 0); + int ecflags = 0; + const char *sname, *dname; + if (NEED_NEWLINE_DECODER(fptr)) + ecflags |= ECONV_UNIVERSAL_NEWLINE_DECODER; + if (fptr->enc2) { + sname = fptr->enc2->name; + dname = fptr->enc->name; + } + else { + sname = dname = ""; + } + fptr->readconv = rb_econv_open(sname, dname, ecflags); if (!fptr->readconv) - rb_raise(rb_eIOError, "code converter open failed (%s to %s)", fptr->enc2->name, fptr->enc->name); + rb_raise(rb_eIOError, "code converter open failed (%s to %s)", sname, dname); fptr->crbuf_off = 0; fptr->crbuf_len = 0; fptr->crbuf_capa = 1024; @@ -1519,7 +1545,7 @@ read_all(rb_io_t *fptr, long siz, VALUE str) rb_encoding *enc; int cr; - if (fptr->enc2) { + if (NEED_READCONV(fptr)) { VALUE str = rb_str_new(NULL, 0); make_readconv(fptr); while (1) { @@ -1873,7 +1899,7 @@ 
appendline(rb_io_t *fptr, int delim, VALUE *strp, long *lp) VALUE str = *strp; long limit = *lp; - if (fptr->enc2) { + if (NEED_READCONV(fptr)) { make_readconv(fptr); while (1) { const char *p, *e; @@ -2084,7 +2110,7 @@ rb_io_getline_1(VALUE rs, long limit, VALUE io) else if (limit == 0) { return rb_enc_str_new(0, 0, io_read_encoding(fptr)); } - else if (rs == rb_default_rs && limit < 0 && !fptr->enc2 && + else if (rs == rb_default_rs && limit < 0 && !NEED_READCONV(fptr) && rb_enc_asciicompat(enc = io_read_encoding(fptr))) { return rb_io_getline_fast(fptr, enc); } @@ -2409,18 +2435,19 @@ io_getc(rb_io_t *fptr, rb_encoding *enc) int r, n, cr = 0; VALUE str; - if (fptr->enc2) { + if (NEED_READCONV(fptr)) { VALUE str = Qnil; - if (!fptr->readconv) { - make_readconv(fptr); - } + make_readconv(fptr); while (1) { if (fptr->crbuf_len) { - r = rb_enc_precise_mbclen(fptr->crbuf+fptr->crbuf_off, - fptr->crbuf+fptr->crbuf_off+fptr->crbuf_len, - fptr->enc); + if (fptr->enc) + r = rb_enc_precise_mbclen(fptr->crbuf+fptr->crbuf_off, + fptr->crbuf+fptr->crbuf_off+fptr->crbuf_len, + fptr->enc); + else + r = ONIGENC_CONSTRUCT_MBCLEN_CHARFOUND(1); if (!MBCLEN_NEEDMORE_P(r)) break; if (fptr->crbuf_len == fptr->crbuf_capa) { @@ -2776,7 +2803,7 @@ rb_io_ungetc(VALUE io, VALUE c) else { SafeStringValue(c); } - if (fptr->enc2) { + if (NEED_READCONV(fptr)) { make_readconv(fptr); len = RSTRING_LEN(c); if (fptr->crbuf_capa - fptr->crbuf_len < len) @@ -3462,14 +3489,12 @@ rb_io_binmode(VALUE io) rb_io_t *fptr; GetOpenFile(io, fptr); -#if defined(_WIN32) || defined(DJGPP) || defined(__CYGWIN__) || defined(__human68k__) || defined(__EMX__) - if (!(fptr->mode & FMODE_BINMODE) && READ_DATA_BUFFERED(fptr)) { - rb_raise(rb_eIOError, "buffer already filled with text-mode content"); - } - if (0 <= fptr->fd && setmode(fptr->fd, O_BINARY) == -1) - rb_sys_fail(fptr->path); -#endif + if (fptr->readconv) + rb_econv_binmode(fptr->readconv); + if (fptr->writeconv) + rb_econv_binmode(fptr->writeconv); 
fptr->mode |= FMODE_BINMODE; + fptr->mode &= ~FMODE_TEXTMODE; return io; } @@ -3485,17 +3510,13 @@ rb_io_binmode(VALUE io) static VALUE rb_io_binmode_m(VALUE io) { -#if defined(_WIN32) || defined(DJGPP) || defined(__CYGWIN__) || defined(__human68k__) || defined(__EMX__) VALUE write_io; -#endif rb_io_binmode(io); -#if defined(_WIN32) || defined(DJGPP) || defined(__CYGWIN__) || defined(__human68k__) || defined(__EMX__) write_io = GetWriteIO(io); if (write_io != io) rb_io_binmode(write_io); -#endif return io; } @@ -3516,27 +3537,24 @@ rb_io_binmode_p(VALUE io) static const char* rb_io_flags_mode(int flags) { -#ifdef O_BINARY -# define MODE_BINMODE(a,b) ((flags & FMODE_BINMODE) ? (b) : (a)) -#else -# define MODE_BINMODE(a,b) (a) -#endif +# define MODE_BTMODE(a,b,c) ((flags & FMODE_BINMODE) ? (b) : \ + (flags & FMODE_TEXTMODE) ? (c) : (a)) if (flags & FMODE_APPEND) { if ((flags & FMODE_READWRITE) == FMODE_READWRITE) { - return MODE_BINMODE("a+", "ab+"); + return MODE_BTMODE("a+", "ab+", "at+"); } - return MODE_BINMODE("a", "ab"); + return MODE_BTMODE("a", "ab", "at"); } switch (flags & FMODE_READWRITE) { case FMODE_READABLE: - return MODE_BINMODE("r", "rb"); + return MODE_BTMODE("r", "rb", "rt"); case FMODE_WRITABLE: - return MODE_BINMODE("w", "wb"); + return MODE_BTMODE("w", "wb", "wt"); case FMODE_READWRITE: if (flags & FMODE_CREATE) { - return MODE_BINMODE("w+", "wb+"); + return MODE_BTMODE("w+", "wb+", "wt+"); } - return MODE_BINMODE("r+", "rb+"); + return MODE_BTMODE("r+", "rb+", "rt+"); } rb_raise(rb_eArgError, "invalid access modenum 0x%x", flags); return NULL; /* not reached */ @@ -3568,16 +3586,23 @@ rb_io_mode_flags(const char *mode) case 'b': flags |= FMODE_BINMODE; break; + case 't': + flags |= FMODE_TEXTMODE; + break; case '+': flags |= FMODE_READWRITE; break; default: goto error; case ':': - return flags; + goto finished; } } +finished: + if ((flags & FMODE_BINMODE) && (flags & FMODE_TEXTMODE)) + goto error; + return flags; } @@ -3887,6 +3912,10 @@ 
rb_sysopen(char *fname, int flags, mode_t mode) { int fd; +#ifdef O_BINARY + flags |= O_BINARY; +#endif + fd = rb_sysopen_internal(fname, flags, mode); if (fd < 0) { if (errno == EMFILE || errno == ENFILE) { diff --git a/test/ruby/test_econv.rb b/test/ruby/test_econv.rb index 9ba5fcab29..b8d9df7639 100644 --- a/test/ruby/test_econv.rb +++ b/test/ruby/test_econv.rb @@ -305,16 +305,37 @@ class TestEncodingConverter < Test::Unit::TestCase src << "\nyz"; check_ec("abc\ndefghi\njklmno\npqrstu\nvwx\nyz", "", :source_buffer_empty, *a) end + def test_universal_newline2 + ec = Encoding::Converter.new("", "", Encoding::Converter::UNIVERSAL_NEWLINE_DECODER) + a = ["", src="", ec, nil, 50, Encoding::Converter::PARTIAL_INPUT] + src << "abc\r\ndef"; check_ec("abc\ndef", "", :source_buffer_empty, *a) + src << "ghi\njkl"; check_ec("abc\ndefghi\njkl", "", :source_buffer_empty, *a) + src << "mno\rpqr"; check_ec("abc\ndefghi\njklmno\npqr", "", :source_buffer_empty, *a) + src << "stu\r"; check_ec("abc\ndefghi\njklmno\npqrstu\n", "", :source_buffer_empty, *a) + src << "\nvwx"; check_ec("abc\ndefghi\njklmno\npqrstu\nvwx", "", :source_buffer_empty, *a) + src << "\nyz"; check_ec("abc\ndefghi\njklmno\npqrstu\nvwx\nyz", "", :source_buffer_empty, *a) + end + def test_crlf_newline ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CRLF_NEWLINE_ENCODER) assert_econv("abc\r\ndef", :finished, 50, ec, "abc\ndef", "") end + def test_crlf_newline2 + ec = Encoding::Converter.new("", "", Encoding::Converter::CRLF_NEWLINE_ENCODER) + assert_econv("abc\r\ndef", :finished, 50, ec, "abc\ndef", "") + end + def test_cr_newline ec = Encoding::Converter.new("UTF-8", "EUC-JP", Encoding::Converter::CR_NEWLINE_ENCODER) assert_econv("abc\rdef", :finished, 50, ec, "abc\ndef", "") end + def test_cr_newline2 + ec = Encoding::Converter.new("", "", Encoding::Converter::CR_NEWLINE_ENCODER) + assert_econv("abc\rdef", :finished, 50, ec, "abc\ndef", "") + end + def test_output_followed_by_input ec = 
Encoding::Converter.new("UTF-8", "EUC-JP") a = ["", "abc\u{3042}def", ec, nil, 100, Encoding::Converter::OUTPUT_FOLLOWED_BY_INPUT] diff --git a/test/ruby/test_io_m17n.rb b/test/ruby/test_io_m17n.rb index d39e03075f..9fb3c63e2a 100644 --- a/test/ruby/test_io_m17n.rb +++ b/test/ruby/test_io_m17n.rb @@ -979,5 +979,179 @@ EOT } end + def test_textmode_decode_universal_newline_read + with_tmpdir { + generate_file("t.crlf", "a\r\nb\r\nc\r\n") + assert_equal("a\nb\nc\n", File.read("t.crlf", mode:"rt:euc-jp:utf-8")) + assert_equal("a\nb\nc\n", File.read("t.crlf", mode:"rt")) + + generate_file("t.cr", "a\rb\rc\r") + assert_equal("a\nb\nc\n", File.read("t.cr", mode:"rt:euc-jp:utf-8")) + assert_equal("a\nb\nc\n", File.read("t.cr", mode:"rt")) + + generate_file("t.lf", "a\nb\nc\n") + assert_equal("a\nb\nc\n", File.read("t.cr", mode:"rt:euc-jp:utf-8")) + assert_equal("a\nb\nc\n", File.read("t.cr", mode:"rt")) + } + end + + def test_textmode_decode_universal_newline_getc + with_tmpdir { + generate_file("t.crlf", "a\r\nb\r\nc\r\n") + open("t.crlf", "rt") {|f| + assert_equal("a", f.getc) + assert_equal("\n", f.getc) + assert_equal("b", f.getc) + assert_equal("\n", f.getc) + assert_equal("c", f.getc) + assert_equal("\n", f.getc) + assert_equal(nil, f.getc) + } + + generate_file("t.cr", "a\rb\rc\r") + open("t.cr", "rt") {|f| + assert_equal("a", f.getc) + assert_equal("\n", f.getc) + assert_equal("b", f.getc) + assert_equal("\n", f.getc) + assert_equal("c", f.getc) + assert_equal("\n", f.getc) + assert_equal(nil, f.getc) + } + + generate_file("t.lf", "a\nb\nc\n") + open("t.lf", "rt") {|f| + assert_equal("a", f.getc) + assert_equal("\n", f.getc) + assert_equal("b", f.getc) + assert_equal("\n", f.getc) + assert_equal("c", f.getc) + assert_equal("\n", f.getc) + assert_equal(nil, f.getc) + } + } + end + + def test_textmode_decode_universal_newline_gets + with_tmpdir { + generate_file("t.crlf", "a\r\nb\r\nc\r\n") + open("t.crlf", "rt") {|f| + assert_equal("a\n", f.gets) + 
assert_equal("b\n", f.gets) + assert_equal("c\n", f.gets) + assert_equal(nil, f.gets) + } + + generate_file("t.cr", "a\rb\rc\r") + open("t.cr", "rt") {|f| + assert_equal("a\n", f.gets) + assert_equal("b\n", f.gets) + assert_equal("c\n", f.gets) + assert_equal(nil, f.gets) + } + + generate_file("t.lf", "a\nb\nc\n") + open("t.lf", "rt") {|f| + assert_equal("a\n", f.gets) + assert_equal("b\n", f.gets) + assert_equal("c\n", f.gets) + assert_equal(nil, f.gets) + } + } + end + + def test_textmode_decode_universal_newline_utf16 + with_tmpdir { + generate_file("t.utf16be.crlf", "\0a\0\r\0\n\0b\0\r\0\n\0c\0\r\0\n") + assert_equal("a\nb\nc\n", File.read("t.utf16be.crlf", mode:"rt:utf-16be:utf-8")) + + generate_file("t.utf16le.crlf", "a\0\r\0\n\0b\0\r\0\n\0c\0\r\0\n\0") + assert_equal("a\nb\nc\n", File.read("t.utf16le.crlf", mode:"rt:utf-16le:utf-8")) + + generate_file("t.utf16be.cr", "\0a\0\r\0b\0\r\0c\0\r") + assert_equal("a\nb\nc\n", File.read("t.utf16be.cr", mode:"rt:utf-16be:utf-8")) + + generate_file("t.utf16le.cr", "a\0\r\0b\0\r\0c\0\r\0") + assert_equal("a\nb\nc\n", File.read("t.utf16le.cr", mode:"rt:utf-16le:utf-8")) + + generate_file("t.utf16be.lf", "\0a\0\n\0b\0\n\0c\0\n") + assert_equal("a\nb\nc\n", File.read("t.utf16be.lf", mode:"rt:utf-16be:utf-8")) + + generate_file("t.utf16le.lf", "a\0\n\0b\0\n\0c\0\n\0") + assert_equal("a\nb\nc\n", File.read("t.utf16le.lf", mode:"rt:utf-16le:utf-8")) + } + end + + def system_newline + File::BINARY == 0 ? 
"\n" : "\r\n" + end + + def test_textmode_encode_newline + with_tmpdir { + open("t.txt", "wt") {|f| + f.puts "abc" + f.puts "def" + } + content = File.read("t.txt", :mode=>"rb") + nl = system_newline + assert_equal("abc#{nl}def#{nl}", content) + } + end + + def test_binary + with_tmpdir { + src = "a\nb\rc\r\nd\n" + generate_file("t.txt", src) + open("t.txt", "rb") {|f| + assert_equal(src, f.read) + } + if File::BINARY == 0 + open("t.txt", "r") {|f| + assert_equal(src, f.read) + } + end + } + end + + def test_binmode + with_tmpdir { + src = "a\r\nb\r\nc\r\n" + generate_file("t.txt", src) + open("t.txt", "rt") {|f| + assert_equal("a", f.getc) + assert_equal("\n", f.getc) + f.binmode + assert_equal("\n", f.getc) + assert_equal("b", f.getc) + assert_equal("\r", f.getc) + assert_equal("\n", f.getc) + assert_equal("c", f.getc) + assert_equal("\r", f.getc) + assert_equal("\n", f.getc) + assert_equal(nil, f.getc) + } + } + end + + def test_binmode2 + with_tmpdir { + src = "a\r\nb\r\nc\r\n" + generate_file("t.txt", src) + open("t.txt", "rt:euc-jp:utf-8") {|f| + assert_equal("a", f.getc) + assert_equal("\n", f.getc) + f.binmode + assert_equal("\n", f.getc) + assert_equal("b", f.getc) + assert_equal("\r", f.getc) + assert_equal("\n", f.getc) + assert_equal("c", f.getc) + assert_equal("\r", f.getc) + assert_equal("\n", f.getc) + assert_equal(nil, f.getc) + } + } + end + end diff --git a/transcode.c b/transcode.c index 44fd7e51ca..55f3281559 100644 --- a/transcode.c +++ b/transcode.c @@ -680,6 +680,7 @@ rb_econv_open_by_transcoder_entries(int n, transcoder_entry_t **entries) } ec = ALLOC(rb_econv_t); + ec->flags = 0; ec->source_encoding_name = NULL; ec->destination_encoding_name = NULL; ec->in_buf_start = NULL; @@ -741,7 +742,13 @@ rb_econv_open(const char *from, const char *to, int flags) int num_trans; static rb_econv_t *ec; - num_trans = transcode_search_path(from, to, trans_open_i, (void *)&entries); + if (*from == '\0' && *to == '\0') { + num_trans = 0; + entries = 
ALLOC_N(transcoder_entry_t *, 1+2); + } + else { + num_trans = transcode_search_path(from, to, trans_open_i, (void *)&entries); + } if (num_trans < 0 || !entries) { xfree(entries); @@ -751,6 +758,10 @@ rb_econv_open(const char *from, const char *to, int flags) if (flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER)) { const char *name = (flags & ECONV_CRLF_NEWLINE_ENCODER) ? "crlf_newline" : "cr_newline"; transcoder_entry_t *e = get_transcoder_entry("", name); + if (flags & ECONV_CRLF_NEWLINE_ENCODER) + flags &= ~ECONV_CR_NEWLINE_ENCODER; + else + flags &= ~ECONV_CRLF_NEWLINE_ENCODER; if (!e) { xfree(entries); return NULL; @@ -774,12 +785,19 @@ rb_econv_open(const char *from, const char *to, int flags) if (!ec) rb_raise(rb_eArgError, "encoding conversion not supported (from %s to %s)", from, to); + ec->flags = flags; ec->source_encoding_name = from; ec->destination_encoding_name = to; if (flags & ECONV_UNIVERSAL_NEWLINE_DECODER) { - ec->last_tc = ec->elems[ec->num_trans-2].tc; - ec->last_trans_index = ec->num_trans-2; + if (ec->num_trans == 1) { + ec->last_tc = NULL; + ec->last_trans_index = -1; + } + else { + ec->last_tc = ec->elems[ec->num_trans-2].tc; + ec->last_trans_index = ec->num_trans-2; + } } return ec; @@ -1037,7 +1055,12 @@ const char * rb_econv_encoding_to_insert_output(rb_econv_t *ec) { rb_transcoding *tc = ec->last_tc; - const rb_transcoder *tr = tc->transcoder; + const rb_transcoder *tr; + + if (tc == NULL) + return ""; + + tr = tc->transcoder; if (tr->stateful_type == stateful_encoder) return tr->from_encoding; @@ -1103,7 +1126,6 @@ rb_econv_insert_output(rb_econv_t *ec, size_t insert_len; rb_transcoding *tc; - const rb_transcoder *tr; unsigned char **buf_start_p; unsigned char **data_start_p; @@ -1125,11 +1147,16 @@ rb_econv_insert_output(rb_econv_t *ec, return -1; } - tc = ec->last_tc; - tr = tc->transcoder; - need = insert_len; - if (tr->stateful_type == stateful_encoder) { + + tc = ec->last_tc; + if (!tc) { + buf_start_p = 
&ec->in_buf_start; + data_start_p = &ec->in_data_start; + data_end_p = &ec->in_data_end; + buf_end_p = &ec->in_buf_end; + } + else if (tc->transcoder->stateful_type == stateful_encoder) { need += tc->readagain_len; if (need < insert_len) goto fail; @@ -1179,7 +1206,7 @@ rb_econv_insert_output(rb_econv_t *ec, } } - if (tr->stateful_type == stateful_encoder) { + if (tc && tc->transcoder->stateful_type == stateful_encoder) { memcpy(*data_end_p, TRANSCODING_READBUF(tc)+tc->recognized_len, tc->readagain_len); *data_end_p += tc->readagain_len; tc->readagain_len = 0; @@ -1267,15 +1294,20 @@ rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int fl unsigned const char *ss, *sp, *se; unsigned char *ds, *dp, *de; rb_econv_result_t res; + int max_output; if (NIL_P(dst)) { dst = rb_str_buf_new(len); } + if (ec->last_tc) + max_output = ec->last_tc->transcoder->max_output; + else + max_output = 1; + res = econv_destination_buffer_full; while (res == econv_destination_buffer_full) { long dlen = RSTRING_LEN(dst); - int max_output = ec->last_tc->transcoder->max_output; if (rb_str_capacity(dst) - dlen < (size_t)len + max_output) { unsigned long new_capa = (unsigned long)dlen + len + max_output; if (LONG_MAX < new_capa) @@ -1297,6 +1329,27 @@ rb_econv_string(rb_econv_t *ec, VALUE src, long off, long len, VALUE dst, int fl return dst; } +void +rb_econv_binmode(rb_econv_t *ec) +{ + if (ec->flags & ECONV_UNIVERSAL_NEWLINE_DECODER) { + int i = ec->num_trans-1; + rb_transcoding_close(ec->elems[i].tc); + xfree(ec->elems[i].out_buf_start); + ec->elems[i].tc = NULL; + ec->elems[i].out_buf_start = NULL; + ec->elems[i].out_data_start = NULL; + ec->elems[i].out_data_end = NULL; + ec->elems[i].out_buf_end = NULL; + ec->num_trans--; + } + if (ec->flags & (ECONV_CRLF_NEWLINE_ENCODER|ECONV_CR_NEWLINE_ENCODER)) { + rb_transcoding_close(ec->elems[0].tc); + xfree(ec->elems[0].out_buf_start); + MEMMOVE(&ec->elems[0], &ec->elems[1], rb_econv_elem_t, ec->num_trans-1); + 
ec->num_trans--; + } +} static VALUE make_econv_exception(rb_econv_t *ec) @@ -1358,7 +1411,7 @@ more_output_buffer( static int output_replacement_character(rb_econv_t *ec) { - rb_transcoding *tc = ec->last_tc; + rb_transcoding *tc; const rb_transcoder *tr; rb_encoding *enc; const unsigned char *replacement; @@ -1366,10 +1419,17 @@ output_replacement_character(rb_econv_t *ec) int len; int ret; - tr = tc->transcoder; - enc = rb_enc_find(tr->to_encoding); - - replacement = (const unsigned char *)get_replacement_character(enc, &len, &repl_enc); + tc = ec->last_tc; + if (tc) { + tr = tc->transcoder; + enc = rb_enc_find(tr->to_encoding); + replacement = (const unsigned char *)get_replacement_character(enc, &len, &repl_enc); + } + else { + replacement = (unsigned char *)"?"; + len = 1; + repl_enc = ""; + } ret = rb_econv_insert_output(ec, replacement, len, repl_enc); if (ret == -1) @@ -1400,7 +1460,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); last_tc = ec->last_tc; - max_output = last_tc->transcoder->max_output; + max_output = last_tc ? last_tc->transcoder->max_output : 1; resume: ret = rb_econv_convert(ec, in_pos, in_stop, out_pos, out_stop, opt); @@ -1465,7 +1525,7 @@ transcode_loop(const unsigned char **in_pos, unsigned char **out_pos, rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_encoding, to_encoding); last_tc = ec->last_tc; - max_output = ec->elems[ec->num_trans-1].tc->transcoder->max_output; + max_output = last_tc ? 
last_tc->transcoder->max_output : 1; ret = econv_source_buffer_empty; ptr = *in_pos; @@ -1835,8 +1895,14 @@ econv_init(int argc, VALUE *argv, VALUE self) ec->source_encoding = senc; ec->destination_encoding = denc; - ec->source_encoding_name = ec->elems[0].tc->transcoder->from_encoding; - ec->destination_encoding_name = ec->last_tc->transcoder->to_encoding; + if (ec->last_tc) { + ec->source_encoding_name = ec->elems[0].tc->transcoder->from_encoding; + ec->destination_encoding_name = ec->last_tc->transcoder->to_encoding; + } + else { + ec->source_encoding_name = ""; + ec->destination_encoding_name = ""; + } DATA_PTR(self) = ec; @@ -1851,10 +1917,13 @@ econv_inspect(VALUE self) if (!ec) return rb_sprintf("#<%s: uninitialized>", cname); - else - return rb_sprintf("#<%s: %s to %s>", cname, - ec->source_encoding_name, - ec->destination_encoding_name); + else { + const char *sname = ec->source_encoding_name; + const char *dname = ec->destination_encoding_name; + if (*sname == '\0') sname = "(none)"; + if (*dname == '\0') dname = "(none)"; + return rb_sprintf("#<%s: %s to %s>", cname, sname, dname); + } } #define IS_ECONV(obj) (RDATA(obj)->dfree == (RUBY_DATA_FUNC)econv_free)