From 38b482be8c3b53809fa44e35cb0eded148b132c8 Mon Sep 17 00:00:00 2001
From: naruse
Date: Wed, 24 Nov 2010 00:08:04 +0000
Subject: [PATCH] * enc/trans/utf_16_32.trans: add the UTF-32 converter.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@29895 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
---
 ChangeLog                   |  8 +++-
 enc/trans/utf_16_32.trans   | 87 +++++++++++++++++++++++++++++++++++++
 test/ruby/test_transcode.rb |  7 +++
 3 files changed, 101 insertions(+), 1 deletion(-)

Review notes (commentary only; the diff payload below is unchanged):

* from_UTF_32 table: every 4-byte sequence {00-ff}{00-ff}{00-ff}{00-ff} is
  mapped to :func_si.  rb_from_UTF_32 supplies fun_si_from_utf_32 and
  fun_so_from_utf_32 as its callbacks.

* fun_si_from_utf_32(statep, s, l): reads one state byte through statep and
  inspects s[0]..s[3].  The parameter l is not read.
    state 0:  bytes 00 00 FE FF store BE in the state byte, bytes
              FF FE 00 00 store LE; both cases return ZERObt.  Any other
              first unit falls through to INVALID, so the first unit has
              to be a byte order mark.
    state BE: returns FUNso when s[0] == 0 and either 0 < s[1] <= 0x10, or
              s[1] == 0 and s[2] lies outside 0xD8..0xDF.
    state LE: the same test on the mirrored bytes (s[3], s[2], s[1]).
    Everything else returns INVALID.

* fun_so_from_utf_32: forwards to fun_so_from_utf_32be or
  fun_so_from_utf_32le according to the state byte and returns 0 for any
  other state value.  Both helpers are defined outside this patch.

* fun_so_to_utf_32: while the state byte is 0 it writes 00 00 FE FF to o,
  sets the state byte to 1 and returns 4 plus the result of
  fun_so_to_utf_32be; every later call only forwards to fun_so_to_utf_32be.
  NOTE(review): osize is passed on unchanged although o has already been
  advanced by 4, and rb_to_UTF_32 declares max_output as 4 while this first
  call returns 4 + the big-endian encoder's result.  Confirm both against
  the buffer contract of the transcoder engine.

* rb_from_UTF_32 / rb_to_UTF_32: both declare state_size 1 with state_init
  and no state_fini.  state_init is defined outside this patch; presumably
  it clears the state byte that the functions above compare with 0 - verify.

* test_utf_32_bom: decodes little-endian input that starts with FF FE 00 00
  and expects the three characters without a BOM character, round-trips
  big-endian input that starts with 00 00 FE FF through check_both_ways, and
  expects Encoding::InvalidByteSequenceError for the unit 00 11 00 00
  (s[1] == 0x11 exceeds the 0x10 limit checked in fun_si_from_utf_32).

diff --git a/ChangeLog b/ChangeLog
index d7b917fe66..0152c3e62a 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+Wed Nov 24 06:35:32 2010 NARUSE, Yui
+
+	* enc/trans/utf_16_32.trans: add the UTF-32 converter.
+
+Wed Nov 24 05:40:33 2010 NARUSE, Yui
+
 Wed Nov 24 06:13:32 2010 Nobuyoshi Nakada
 
 	* win32/win32.c (filecp, wstr_to_mbstr, mbstr_to_wstr):
@@ -5,7 +11,7 @@ Wed Nov 24 06:13:32 2010 Nobuyoshi Nakada
 
 Wed Nov 24 05:40:33 2010 NARUSE, Yui
 
-	* enc/trans/utf_16_32.trans: add a convert from UTF-8 to UTF-16.
+	* enc/trans/utf_16_32.trans: add a converter from UTF-8 to UTF-16.
 
 Wed Nov 24 03:21:35 2010 NARUSE, Yui
 
diff --git a/enc/trans/utf_16_32.trans b/enc/trans/utf_16_32.trans
index 01caffe02c..c841df035f 100644
--- a/enc/trans/utf_16_32.trans
+++ b/enc/trans/utf_16_32.trans
@@ -25,6 +25,10 @@
   map["{00-ff}{00-ff}"] = :func_si
   transcode_generate_node(ActionMap.parse(map), "from_UTF_16")
 
+  map = {}
+  map["{00-ff}{00-ff}{00-ff}{00-ff}"] = :func_si
+  transcode_generate_node(ActionMap.parse(map), "from_UTF_32")
+
   map = {}
   map["{00-7f}"] = :func_so
   map["{c2-df}{80-bf}"] = :func_so
@@ -321,6 +325,48 @@ fun_so_from_utf_16(void *statep, const unsigned char *s, size_t l, unsigned char
     return 0;
 }
 
+static VALUE
+fun_si_from_utf_32(void *statep, const unsigned char *s, size_t l)
+{
+    unsigned char *sp = statep;
+    switch (*sp) {
+      case 0:
+        if (s[0] == 0 && s[1] == 0 && s[2] == 0xFE && s[3] == 0xFF) {
+            *sp = BE;
+            return ZERObt;
+        }
+        else if (s[0] == 0xFF && s[1] == 0xFE && s[2] == 0 && s[3] == 0) {
+            *sp = LE;
+            return ZERObt;
+        }
+        break;
+      case BE:
+        if (s[0] == 0 && ((0 < s[1] && s[1] <= 0x10) ||
+                          (s[1] == 0 && (s[2] < 0xD8 || 0xDF < s[2]))))
+            return (VALUE)FUNso;
+        break;
+      case LE:
+        if (s[3] == 0 && ((0 < s[2] && s[2] <= 0x10) ||
+                          (s[2] == 0 && (s[1] < 0xD8 || 0xDF < s[1]))))
+            return (VALUE)FUNso;
+        break;
+    }
+    return (VALUE)INVALID;
+}
+
+static ssize_t
+fun_so_from_utf_32(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
+{
+    unsigned char *sp = statep;
+    switch (*sp) {
+      case BE:
+        return fun_so_from_utf_32be(statep, s, l, o, osize);
+      case LE:
+        return fun_so_from_utf_32le(statep, s, l, o, osize);
+    }
+    return 0;
+}
+
 static ssize_t
 fun_so_to_utf_16(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
 {
@@ -334,6 +380,21 @@ fun_so_to_utf_16(void *statep, const unsigned char *s, size_t l, unsigned char *
     return fun_so_to_utf_16be(statep, s, l, o, osize);
 }
 
+static ssize_t
+fun_so_to_utf_32(void *statep, const unsigned char *s, size_t l, unsigned char *o, size_t osize)
+{
+    unsigned char *sp = statep;
+    if (*sp == 0) {
+        *o++ = 0x00;
+        *o++ = 0x00;
+        *o++ = 0xFE;
+        *o++ = 0xFF;
+        *sp = 1;
+        return 4 + fun_so_to_utf_32be(statep, s, l, o, osize);
+    }
+    return fun_so_to_utf_32be(statep, s, l, o, osize);
+}
+
 static const rb_transcoder
 rb_from_UTF_16BE = {
     "UTF-16BE", "UTF-8", from_UTF_16BE,
@@ -442,6 +503,18 @@ rb_from_UTF_16 = {
     NULL, fun_si_from_utf_16, NULL, fun_so_from_utf_16
 };
 
+static const rb_transcoder
+rb_from_UTF_32 = {
+    "UTF-32", "UTF-8", from_UTF_32,
+    TRANSCODE_TABLE_INFO,
+    4, /* input_unit_length */
+    4, /* max_input */
+    4, /* max_output */
+    asciicompat_decoder, /* asciicompat_type */
+    1, state_init, NULL, /* state_size, state_init, state_fini */
+    NULL, fun_si_from_utf_32, NULL, fun_so_from_utf_32
+};
+
 static const rb_transcoder
 rb_to_UTF_16 = {
     "UTF-8", "UTF-16", from_UTF_8,
@@ -454,6 +527,18 @@ rb_to_UTF_16 = {
     NULL, NULL, NULL, fun_so_to_utf_16
 };
 
+static const rb_transcoder
+rb_to_UTF_32 = {
+    "UTF-8", "UTF-32", from_UTF_8,
+    TRANSCODE_TABLE_INFO,
+    1, /* input_unit_length */
+    4, /* max_input */
+    4, /* max_output */
+    asciicompat_encoder, /* asciicompat_type */
+    1, state_init, NULL, /* state_size, state_init, state_fini */
+    NULL, NULL, NULL, fun_so_to_utf_32
+};
+
 void
 Init_utf_16_32(void)
 {
@@ -467,4 +552,6 @@ Init_utf_16_32(void)
     rb_register_transcoder(&rb_to_UTF_32LE);
     rb_register_transcoder(&rb_from_UTF_16);
     rb_register_transcoder(&rb_to_UTF_16);
+    rb_register_transcoder(&rb_from_UTF_32);
+    rb_register_transcoder(&rb_to_UTF_32);
 }
diff --git a/test/ruby/test_transcode.rb b/test/ruby/test_transcode.rb
index a313037896..9a3776273f 100644
--- a/test/ruby/test_transcode.rb
+++ b/test/ruby/test_transcode.rb
@@ -1027,6 +1027,13 @@ class TestTranscode < Test::Unit::TestCase
     assert_raise(Encoding::InvalidByteSequenceError){%w/fffeb7df/.pack("H*").encode("UTF-8","UTF-16")}
   end
 
+  def test_utf_32_bom
+    expected = "\u{3042}\u{3044}\u{20bb7}"
+    assert_equal(expected, %w/fffe00004230000044300000b70b0200/.pack("H*").encode("UTF-8","UTF-32"))
+    check_both_ways(expected, %w/0000feff000030420000304400020bb7/.pack("H*"), "UTF-32")
+    assert_raise(Encoding::InvalidByteSequenceError){%w/0000feff00110000/.pack("H*").encode("UTF-8","UTF-32")}
+  end
+
   def check_utf_32_both_ways(utf8, raw)
     copy = raw.dup
     0.step(copy.length-1, 4) do |i|