diff --git a/ext/bert/c/decode.c b/ext/bert/c/decode.c index dee9926..3b610f9 100644 --- a/ext/bert/c/decode.c +++ b/ext/bert/c/decode.c @@ -1,4 +1,5 @@ #include "ruby.h" +#include "ruby/encoding.h" #include #include @@ -14,9 +15,17 @@ #define ERL_BIN 109 #define ERL_SMALL_BIGNUM 110 #define ERL_LARGE_BIGNUM 111 -#define ERL_VERSION 131 -#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM) +/* These two types are specific to version 2 of the protocol. They diverge + * from Erlang, but allow us to pass string encodings across the wire. */ +#define ERLEXT_ENC_STRING 112 +#define ERLEXT_UNICODE_STRING 113 + +/* Protocol version constants. */ +#define ERL_VERSION 131 +#define ERL_VERSION2 132 + +#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERLEXT_UNICODE_STRING) #define BERT_TYPE_OFFSET (ERL_SMALL_INT) static VALUE rb_mBERT; @@ -40,6 +49,8 @@ static VALUE bert_read_nil(struct bert_buf *buf); static VALUE bert_read_string(struct bert_buf *buf); static VALUE bert_read_list(struct bert_buf *buf); static VALUE bert_read_bin(struct bert_buf *buf); +static VALUE bert_read_enc_string(struct bert_buf *buf); +static VALUE bert_read_unicode_string(struct bert_buf *buf); static VALUE bert_read_sbignum(struct bert_buf *buf); static VALUE bert_read_lbignum(struct bert_buf *buf); @@ -59,7 +70,9 @@ static bert_ptr bert_callbacks[] = { &bert_read_list, &bert_read_bin, &bert_read_sbignum, - &bert_read_lbignum + &bert_read_lbignum, + &bert_read_enc_string, + &bert_read_unicode_string }; static inline uint8_t bert_buf_read8(struct bert_buf *buf) @@ -293,6 +306,34 @@ static VALUE bert_read_bin(struct bert_buf *buf) return rb_bin; } +static VALUE bert_read_unicode_string(struct bert_buf *buf) +{ + VALUE rb_str; + + rb_str = bert_read_bin(buf); + rb_enc_associate(rb_str, rb_utf8_encoding()); + + return rb_str; +} + +static VALUE bert_read_enc_string(struct bert_buf *buf) +{ + uint8_t type; + VALUE rb_bin, enc; + + rb_bin = bert_read_bin(buf); + + bert_buf_ensure(buf, 1); + type = bert_buf_read8(buf); + if (ERL_BIN != type) + rb_raise(rb_eRuntimeError, "Invalid tag '%d' for term", type); + + enc = bert_read_bin(buf); + rb_enc_associate(rb_bin, rb_find_encoding(enc)); + + return rb_bin; +} + static VALUE bert_read_string(struct bert_buf *buf) { uint16_t i, length; @@ -467,6 +508,7 @@ static VALUE bert_read_invalid(struct bert_buf *buf) static VALUE rb_bert_decode(VALUE klass, VALUE rb_string) { struct bert_buf buf; + uint8_t proto_version; Check_Type(rb_string, T_STRING); buf.data = (uint8_t *)RSTRING_PTR(rb_string); @@ -474,10 +516,12 @@ static VALUE rb_bert_decode(VALUE klass, VALUE rb_string) bert_buf_ensure(&buf, 1); - if (bert_buf_read8(&buf) != ERL_VERSION) - rb_raise(rb_eTypeError, "Invalid magic value for BERT string"); - - return bert_read(&buf); + proto_version = bert_buf_read8(&buf); + if (proto_version == ERL_VERSION || proto_version == ERL_VERSION2) { + return bert_read(&buf); + } else { + rb_raise(rb_eTypeError, "Invalid magic value for BERT string"); + } } static VALUE rb_bert_impl(VALUE klass) diff --git a/lib/bert.rb b/lib/bert.rb index 527113b..08b3016 100644 --- a/lib/bert.rb +++ b/lib/bert.rb @@ -22,4 +22,4 @@ require 'bert/decoder' # Global method for specifying that an array should be encoded as a tuple. def t BERT::Tuple -end \ No newline at end of file +end diff --git a/lib/bert/decode.rb b/lib/bert/decode.rb index 8fe056e..b8aeeb2 100644 --- a/lib/bert/decode.rb +++ b/lib/bert/decode.rb @@ -10,7 +10,13 @@ module BERT def self.decode(string) io = StringIO.new(string) io.set_encoding('binary') if io.respond_to?(:set_encoding) - new(io).read_any + header = io.getbyte + case header + when MAGIC, VERSION_2 + new(io).read_any + else + fail("Bad Magic") + end end def initialize(ins) @@ -19,7 +25,6 @@ module BERT end def read_any - fail("Bad Magic") unless read_1 == MAGIC read_any_raw end @@ -37,6 +42,8 @@ module BERT when STRING then read_erl_string when LIST then read_list when BIN then read_bin + when ENC_STRING then read_enc_string + when UNICODE_STRING then read_unicode_string else fail("Unknown term tag: #{peek_1}") end @@ -223,6 +230,14 @@ module BERT [] end + def read_unicode_string + fail("Invalid Type, not a unicode string") unless read_1 == UNICODE_STRING + length = read_4 + str = read_string(length) + str.force_encoding "UTF-8" + str + end + def read_erl_string fail("Invalid Type, not an erlang string") unless read_1 == STRING length = read_2 @@ -246,5 +261,18 @@ module BERT def fail(str) raise str end + + private + + def read_enc_string + fail("Invalid Type, not an erlang binary") unless read_1 == ENC_STRING + length = read_4 + x = read_string(length) + + fail("Invalid Type, not an erlang binary") unless read_1 == BIN + length = read_4 + x.force_encoding read_string(length) + x + end end end diff --git a/lib/bert/encode.rb b/lib/bert/encode.rb index ddcc895..4c610d7 100644 --- a/lib/bert/encode.rb +++ b/lib/bert/encode.rb @@ -2,6 +2,47 @@ module BERT class Encode include Types + class V2 < Encode + def write_binary(data) + enc = data.encoding + case enc + when ::Encoding::UTF_8, ::Encoding::US_ASCII + write_unicode_string data + when ::Encoding::ASCII_8BIT + super + else + write_enc_string data + end + end + + private + + def write_unicode_string(data) + write_1 UNICODE_STRING + write_4 data.bytesize + write_string data + end + + def write_enc_string(data) + write_1 ENC_STRING + write_4 data.bytesize + write_string data + enc = data.encoding.name + write_1 BIN + write_4 enc.bytesize + write_string enc + end + + def version_header + VERSION_2 + end + end + + class << self + attr_accessor :version + end + self.version = :v1 + attr_accessor :out def initialize(out) @@ -11,12 +52,18 @@ module BERT def self.encode(data) io = StringIO.new io.set_encoding('binary') if io.respond_to?(:set_encoding) - self.new(io).write_any(data) + + if version == :v2 + Encode::V2.new(io).write_any(data) + else + new(io).write_any(data) + end + io.string end def write_any obj - write_1 MAGIC + write_1 version_header write_any_raw obj end @@ -132,6 +179,10 @@ module BERT private + def version_header + MAGIC + end + def fail(obj) raise "Cannot encode to erlang external format: #{obj.inspect}" end diff --git a/lib/bert/types.rb b/lib/bert/types.rb index 09bcd00..0b74abb 100644 --- a/lib/bert/types.rb +++ b/lib/bert/types.rb @@ -12,10 +12,12 @@ module BERT STRING = 107 LIST = 108 BIN = 109 + ENC_STRING = 112 + UNICODE_STRING = 113 FUN = 117 - NEW_FUN = 112 MAGIC = 131 + VERSION_2 = 132 MAX_INT = (1 << 27) -1 MIN_INT = -(1 << 27) end -end \ No newline at end of file +end diff --git a/test/bert_test.rb b/test/bert_test.rb index 10d3f46..04e608e 100644 --- a/test/bert_test.rb +++ b/test/bert_test.rb @@ -5,20 +5,55 @@ class BertTest < Test::Unit::TestCase setup do time = Time.at(1254976067) @ruby = t[:user, {:name => 'TPW'}, [/cat/i, 9.9], time, nil, true, false, :true, :false] - @bert = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false" - @ebin = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>" + @bert_old = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false".b + @ebin_old = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>" end - should "encode" do - assert_equal @bert, BERT.encode(@ruby) + context "v2 encoder" do + setup do + @old_version = BERT::Encode.version + BERT::Encode.version = :v2 + @bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b + @ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>" + end + + teardown do + BERT::Encode.version = @old_version + end + + should "decode new format" do + assert_equal @ruby, BERT.decode(@bert) + end + + should "roundtrip string and maintain encoding" do + str = "日本語".encode 'EUC-JP' + round = BERT.decode(BERT.encode(str)) + assert_equal str, round + assert_equal str.encoding, round.encoding + end + + should "roundtrip binary string" do + str = "日本語".b + round = BERT.decode(BERT.encode(str)) + assert_equal str, round + assert_equal str.encoding, round.encoding + end + + should "encode" do + assert_equal @bert, BERT.encode(@ruby) + end + + should "ebin" do + assert_equal @ebin, BERT.ebin(@bert) + end end - should "decode" do - assert_equal @ruby, BERT.decode(@bert) + should "decode the old format" do + assert_equal @ruby, BERT.decode(@bert_old) end should "ebin" do - assert_equal @ebin, BERT.ebin(@bert) + assert_equal @ebin_old, BERT.ebin(@bert_old) end should "do roundtrips" do diff --git a/test/encoder_test.rb b/test/encoder_test.rb index 75290a7..7d4c62c 100644 --- a/test/encoder_test.rb +++ b/test/encoder_test.rb @@ -82,6 +82,7 @@ class EncoderTest < Test::Unit::TestCase end should 'handle utf8 strings' do + str = "été".encode 'UTF-8' bert = [131, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*') assert_equal bert, BERT::Encoder.encode("été") end @@ -99,6 +100,36 @@ class EncoderTest < Test::Unit::TestCase assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000) end + context "v2" do + setup do + @old_version = BERT::Encode.version + BERT::Encode.version = :v2 + end + + teardown do + BERT::Encode.version = @old_version + end + + should 'handle utf8 strings' do + str = "été".encode 'UTF-8' + bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*') + assert_equal bert, BERT::Encoder.encode("été") + end + + should 'handle utf8 symbols' do + bert = [132, 100, 0, 5, 195, 169, 116, 195, 169].pack('C*') + assert_equal bert, BERT::Encoder.encode(:'été') + end + + should "handle bignums" do + bert = [132,110,8,0,0,0,232,137,4,35,199,138].pack('c*') + assert_equal bert, BERT::Encoder.encode(10_000_000_000_000_000_000) + + bert = [132,110,8,1,0,0,232,137,4,35,199,138].pack('c*') + assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000) + end + end + should "leave other stuff alone" do before = [1, 2.0, [:foo, 'bar']] assert_equal before, BERT::Encoder.convert(before)