From 77ae7b2e8353c963404c31c6d910fe77286cbd81 Mon Sep 17 00:00:00 2001 From: nobu Date: Wed, 15 Jan 2014 05:03:49 +0000 Subject: [PATCH] string.c: use actual encodings * string.c (get_actual_encoding): get actual encoding according to the BOM if exists. * string.c (rb_str_inspect): use according encoding, instead of pseudo encodings, UTF-{16,32}. [ruby-core:59757] [Bug #8940] git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@44605 b2dd03c8-39d4-4d8f-98ff-823fe69b080e --- ChangeLog | 8 ++++++ encoding.c | 6 +++++ string.c | 61 ++++++++++++++++++++++++++---------------- test/ruby/test_m17n.rb | 4 +++ 4 files changed, 56 insertions(+), 23 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0161f2a77a..48f993766d 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,11 @@ +Wed Jan 15 14:03:47 2014 Nobuyoshi Nakada + + * string.c (get_actual_encoding): get actual encoding according to + the BOM if exists. + + * string.c (rb_str_inspect): use according encoding, instead of + pseudo encodings, UTF-{16,32}. [ruby-core:59757] [Bug #8940] + Tue Jan 14 21:07:22 2014 Masaki Matsushita * ext/thread/thread.c (rb_szqueue_clear): notify SZQUEUE_WAITERS diff --git a/encoding.c b/encoding.c index ab03806dce..f102524379 100644 --- a/encoding.c +++ b/encoding.c @@ -598,6 +598,12 @@ rb_enc_from_index(int index) return enc_table.list[index].enc; } +rb_encoding * +rb_enc_get_from_index(int index) +{ + return must_encindex(index); +} + int rb_enc_registered(const char *name) { diff --git a/string.c b/string.c index 900f900f16..50b050bb5d 100644 --- a/string.c +++ b/string.c @@ -123,6 +123,38 @@ VALUE rb_cSymbol; #define STR_ENC_GET(str) rb_enc_from_index(ENCODING_GET(str)) +rb_encoding *rb_enc_get_from_index(int index); + +static rb_encoding * +get_actual_encoding(const int encidx, VALUE str) +{ + const unsigned char *q; + + switch (encidx) { + case ENCINDEX_UTF_16: + if (RSTRING_LEN(str) < 2) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0xFE && q[1] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_16BE); + } + if (q[0] == 0xFF && q[1] == 0xFE) { + return rb_enc_get_from_index(ENCINDEX_UTF_16LE); + } + return rb_ascii8bit_encoding(); + case ENCINDEX_UTF_32: + if (RSTRING_LEN(str) < 4) break; + q = (const unsigned char *)RSTRING_PTR(str); + if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32BE); + } + if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) { + return rb_enc_get_from_index(ENCINDEX_UTF_32LE); + } + return rb_ascii8bit_encoding(); + } + return rb_enc_from_index(encidx); +} + static int fstring_cmp(VALUE a, VALUE b); static st_table* frozen_strings; @@ -4749,8 +4781,8 @@ rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p) VALUE rb_str_inspect(VALUE str) { - rb_encoding *enc = STR_ENC_GET(str); - int encidx = rb_enc_to_index(enc); + int encidx = ENCODING_GET(str); + rb_encoding *enc = rb_enc_from_index(encidx), *actenc; const char *p, *pend, *prev; char buf[CHAR_ESC_LEN + 1]; VALUE result = rb_str_buf_new(0); @@ -4765,27 +4797,10 @@ rb_str_inspect(VALUE str) p = RSTRING_PTR(str); pend = RSTRING_END(str); prev = p; - if (encidx == ENCINDEX_UTF_16 && p + 2 <= pend) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0xFE && q[1] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_16BE); - else if (q[0] == 0xFF && q[1] == 0xFE) - enc = rb_enc_from_index(ENCINDEX_UTF_16LE); - else { - enc = rb_ascii8bit_encoding(); - unicode_p = 0; - } - } - else if (encidx == ENCINDEX_UTF_32 && p + 4 <= pend) { - const unsigned char *q = (const unsigned char *)p; - if (q[0] == 0 && q[1] == 0 && q[2] == 0xFE && q[3] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_32BE); - else if (q[3] == 0 && q[2] == 0 && q[1] == 0xFE && q[0] == 0xFF) - enc = rb_enc_from_index(ENCINDEX_UTF_32LE); - else { - enc = rb_ascii8bit_encoding(); - unicode_p = 0; - } + actenc = get_actual_encoding(encidx, str); + if (actenc != enc) { + enc = actenc; + if (unicode_p) unicode_p = rb_enc_unicode_p(enc); } while (p < pend) { unsigned int c, cc; diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb index 4557cd6c3c..f5f23c09f1 100644 --- a/test/ruby/test_m17n.rb +++ b/test/ruby/test_m17n.rb @@ -228,14 +228,18 @@ class TestM17N < Test::Unit::TestCase STR_WITHOUT_BOM = "\u3042".freeze STR_WITH_BOM = "\uFEFF\u3042".freeze + bug8940 = '[ruby-core:59757] [Bug #8940]' %w/UTF-16 UTF-32/.each do |enc| %w/BE LE/.each do |endian| + bom = "\uFEFF".encode("#{enc}#{endian}").force_encoding(enc) + define_method("test_utf_16_32_inspect(#{enc}#{endian})") do s = STR_WITHOUT_BOM.encode(enc + endian) # When a UTF-16/32 string doesn't have a BOM, # inspect as a dummy encoding string. assert_equal(s.dup.force_encoding("ISO-2022-JP").inspect, s.dup.force_encoding(enc).inspect) + assert_normal_exit("#{bom.b.dump}.force_encoding('#{enc}').inspect", bug8940) end define_method("test_utf_16_32_inspect(#{enc}#{endian}-BOM)") do