Merge pull request #1 from github/encoding

Make BERT encoding aware
This commit is contained in:
Aaron Patterson 2016-04-18 08:36:50 -07:00
Родитель c2abcc4868 3113c6f395
Коммит fd47af65eb
7 изменённых файлов: 212 добавлений и 21 удалений

Просмотреть файл

@ -1,4 +1,5 @@
#include "ruby.h"
#include "ruby/encoding.h"
#include <stdint.h>
#include <netinet/in.h>
@ -14,9 +15,17 @@
#define ERL_BIN 109
#define ERL_SMALL_BIGNUM 110
#define ERL_LARGE_BIGNUM 111
#define ERL_VERSION 131
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERL_LARGE_BIGNUM)
/* These two types are specific to version 2 of the protocol. They diverge
* from Erlang, but allow us to pass string encodings across the wire. */
#define ERLEXT_ENC_STRING 112
#define ERLEXT_UNICODE_STRING 113
/* Protocol version constants. */
#define ERL_VERSION 131
#define ERL_VERSION2 132
#define BERT_VALID_TYPE(t) ((t) >= ERL_SMALL_INT && (t) <= ERLEXT_UNICODE_STRING)
#define BERT_TYPE_OFFSET (ERL_SMALL_INT)
static VALUE rb_mBERT;
@ -40,6 +49,8 @@ static VALUE bert_read_nil(struct bert_buf *buf);
static VALUE bert_read_string(struct bert_buf *buf);
static VALUE bert_read_list(struct bert_buf *buf);
static VALUE bert_read_bin(struct bert_buf *buf);
static VALUE bert_read_enc_string(struct bert_buf *buf);
static VALUE bert_read_unicode_string(struct bert_buf *buf);
static VALUE bert_read_sbignum(struct bert_buf *buf);
static VALUE bert_read_lbignum(struct bert_buf *buf);
@ -59,7 +70,9 @@ static bert_ptr bert_callbacks[] = {
&bert_read_list,
&bert_read_bin,
&bert_read_sbignum,
&bert_read_lbignum
&bert_read_lbignum,
&bert_read_enc_string,
&bert_read_unicode_string
};
static inline uint8_t bert_buf_read8(struct bert_buf *buf)
@ -293,6 +306,34 @@ static VALUE bert_read_bin(struct bert_buf *buf)
return rb_bin;
}
/*
 * Reader for the ERLEXT_UNICODE_STRING term.
 *
 * The payload is read with bert_read_bin() and the resulting Ruby string is
 * then associated with the UTF-8 encoding before being returned.
 *
 * NOTE(review): presumably the term tag has already been consumed by the
 * dispatcher before this reader runs -- confirm against bert_read().
 */
static VALUE bert_read_unicode_string(struct bert_buf *buf)
{
	VALUE rb_utf8_str = bert_read_bin(buf);

	rb_enc_associate(rb_utf8_str, rb_utf8_encoding());
	return rb_utf8_str;
}
/*
 * Reader for the ERLEXT_ENC_STRING term.
 *
 * Two pieces are read from the buffer, in order:
 *   1. the string payload, via bert_read_bin();
 *   2. a one-byte tag that must be ERL_BIN, followed by a second
 *      bert_read_bin() payload holding the encoding name.
 *
 * The first string is associated with the encoding looked up from that name
 * and returned. A RuntimeError is raised when the byte after the payload is
 * not ERL_BIN.
 *
 * NOTE(review): what happens for an encoding name Ruby does not recognise
 * depends on rb_find_encoding()/rb_enc_associate() -- confirm it is the
 * behaviour we want for untrusted input.
 */
static VALUE bert_read_enc_string(struct bert_buf *buf)
{
	VALUE rb_payload, rb_enc_name;
	uint8_t name_tag;

	rb_payload = bert_read_bin(buf);

	/* The encoding name must itself be encoded as a plain binary term. */
	bert_buf_ensure(buf, 1);
	name_tag = bert_buf_read8(buf);
	if (name_tag != ERL_BIN)
		rb_raise(rb_eRuntimeError, "Invalid tag '%d' for term", name_tag);

	rb_enc_name = bert_read_bin(buf);
	rb_enc_associate(rb_payload, rb_find_encoding(rb_enc_name));

	return rb_payload;
}
static VALUE bert_read_string(struct bert_buf *buf)
{
uint16_t i, length;
@ -467,6 +508,7 @@ static VALUE bert_read_invalid(struct bert_buf *buf)
static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
{
struct bert_buf buf;
uint8_t proto_version;
Check_Type(rb_string, T_STRING);
buf.data = (uint8_t *)RSTRING_PTR(rb_string);
@ -474,10 +516,12 @@ static VALUE rb_bert_decode(VALUE klass, VALUE rb_string)
bert_buf_ensure(&buf, 1);
if (bert_buf_read8(&buf) != ERL_VERSION)
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");
proto_version = bert_buf_read8(&buf);
if (proto_version == ERL_VERSION || proto_version == ERL_VERSION2) {
return bert_read(&buf);
} else {
rb_raise(rb_eTypeError, "Invalid magic value for BERT string");
}
}
static VALUE rb_bert_impl(VALUE klass)

Просмотреть файл

@ -10,7 +10,13 @@ module BERT
# Decodes a serialized BERT document.
#
# The input is wrapped in a StringIO (switched to binary where supported) and
# its first byte is read as the header. When that byte is MAGIC or VERSION_2,
# the rest of the stream is handed to a new reader and the result of its
# read_any is returned.
#
# Raises RuntimeError ("Bad Magic") for any other header byte, including an
# empty input.
def self.decode(string)
  io = StringIO.new(string)
  io.set_encoding('binary') if io.respond_to?(:set_encoding)

  version_byte = io.getbyte
  fail("Bad Magic") unless [MAGIC, VERSION_2].include?(version_byte)

  new(io).read_any
end
def initialize(ins)
@ -19,7 +25,6 @@ module BERT
end
def read_any
fail("Bad Magic") unless read_1 == MAGIC
read_any_raw
end
@ -37,6 +42,8 @@ module BERT
when STRING then read_erl_string
when LIST then read_list
when BIN then read_bin
when ENC_STRING then read_enc_string
when UNICODE_STRING then read_unicode_string
else
fail("Unknown term tag: #{peek_1}")
end
@ -223,6 +230,14 @@ module BERT
[]
end
# Reads a UNICODE_STRING term: one tag byte, a 4-byte length, then that many
# bytes of payload. The payload is returned tagged as UTF-8.
#
# Raises RuntimeError when the tag byte is not UNICODE_STRING.
def read_unicode_string
  tag = read_1
  fail("Invalid Type, not a unicode string") unless tag == UNICODE_STRING

  byte_count = read_4
  # force_encoding returns its receiver, so this is the string just read.
  read_string(byte_count).force_encoding("UTF-8")
end
def read_erl_string
fail("Invalid Type, not an erlang string") unless read_1 == STRING
length = read_2
@ -246,5 +261,18 @@ module BERT
def fail(str)
raise str
end
private
# Reads an ENC_STRING term and returns its payload tagged with the encoding
# named on the wire.
#
# Layout read from the stream, in order:
#   ENC_STRING tag, 4-byte length, payload bytes,
#   BIN tag, 4-byte length, encoding name bytes.
#
# Raises RuntimeError when either tag byte is not the expected one. Whatever
# String#force_encoding raises for the supplied encoding name propagates to
# the caller.
def read_enc_string
  fail("Invalid Type, not an encoded string") unless read_1 == ENC_STRING
  payload = read_string(read_4)

  # The encoding name travels as an ordinary binary term after the payload.
  fail("Invalid Type, not an erlang binary") unless read_1 == BIN
  encoding_name = read_string(read_4)

  payload.force_encoding(encoding_name)
end
end
end

Просмотреть файл

@ -2,6 +2,47 @@ module BERT
class Encode
include Types
# Encoder variant that emits the version-2 header and encoding-aware string
# terms. Everything not overridden here is inherited from Encode.
class V2 < Encode
  # Writes a string term, picking the wire representation from the string's
  # encoding:
  #   UTF-8 or US-ASCII -> UNICODE_STRING term
  #   ASCII-8BIT        -> whatever the superclass writes
  #   anything else     -> ENC_STRING term followed by the encoding name
  def write_binary(data)
    encoding = data.encoding
    if encoding == ::Encoding::UTF_8 || encoding == ::Encoding::US_ASCII
      write_unicode_string(data)
    elsif encoding == ::Encoding::ASCII_8BIT
      super
    else
      write_enc_string(data)
    end
  end

  private

  # Emits: UNICODE_STRING tag, 4-byte byte length, raw bytes.
  def write_unicode_string(data)
    write_1(UNICODE_STRING)
    write_4(data.bytesize)
    write_string(data)
  end

  # Emits: ENC_STRING tag, 4-byte byte length, raw bytes, then the encoding
  # name as a BIN term (tag, 4-byte length, name bytes).
  def write_enc_string(data)
    write_1(ENC_STRING)
    write_4(data.bytesize)
    write_string(data)

    encoding_name = data.encoding.name
    write_1(BIN)
    write_4(encoding_name.bytesize)
    write_string(encoding_name)
  end

  # Header byte for documents produced by this encoder.
  def version_header
    VERSION_2
  end
end
class << self
attr_accessor :version
end
self.version = :v1
attr_accessor :out
def initialize(out)
@ -11,12 +52,18 @@ module BERT
def self.encode(data)
io = StringIO.new
io.set_encoding('binary') if io.respond_to?(:set_encoding)
self.new(io).write_any(data)
if version == :v2
Encode::V2.new(io).write_any(data)
else
new(io).write_any(data)
end
io.string
end
def write_any obj
write_1 MAGIC
write_1 version_header
write_any_raw obj
end
@ -132,6 +179,10 @@ module BERT
private
# Header byte for documents produced by this encoder: the MAGIC constant.
def version_header
  MAGIC
end
def fail(obj)
raise "Cannot encode to erlang external format: #{obj.inspect}"
end

Просмотреть файл

@ -12,9 +12,11 @@ module BERT
STRING = 107
LIST = 108
BIN = 109
ENC_STRING = 112
UNICODE_STRING = 113
FUN = 117
NEW_FUN = 112
MAGIC = 131
VERSION_2 = 132
MAX_INT = (1 << 27) -1
MIN_INT = -(1 << 27)
end

Просмотреть файл

@ -5,20 +5,55 @@ class BertTest < Test::Unit::TestCase
setup do
time = Time.at(1254976067)
@ruby = t[:user, {:name => 'TPW'}, [/cat/i, 9.9], time, nil, true, false, :true, :false]
@bert = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false"
@ebin = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
@bert_old = "\203h\td\000\004userh\003d\000\004bertd\000\004dictl\000\000\000\001h\002d\000\004namem\000\000\000\003TPWjl\000\000\000\002h\004d\000\004bertd\000\005regexm\000\000\000\003catl\000\000\000\001d\000\bcaselessjc9.900000000000000e+00\000\000\000\000\000\000\000\000\000\000jh\005d\000\004bertd\000\004timeb\000\000\004\346b\000\016\344\303a\000h\002d\000\004bertd\000\003nilh\002d\000\004bertd\000\004trueh\002d\000\004bertd\000\005falsed\000\004trued\000\005false".b
@ebin_old = "<<131,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,109,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,109,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end
context "v2 encoder" do
setup do
@old_version = BERT::Encode.version
BERT::Encode.version = :v2
@bert = "\x84h\td\x00\x04userh\x03d\x00\x04bertd\x00\x04dictl\x00\x00\x00\x01h\x02d\x00\x04nameq\x00\x00\x00\x03TPWjl\x00\x00\x00\x02h\x04d\x00\x04bertd\x00\x05regexq\x00\x00\x00\x03catl\x00\x00\x00\x01d\x00\bcaselessjc9.900000000000000e+00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00jh\x05d\x00\x04bertd\x00\x04timeb\x00\x00\x04\xE6b\x00\x0E\xE4\xC3a\x00h\x02d\x00\x04bertd\x00\x03nilh\x02d\x00\x04bertd\x00\x04trueh\x02d\x00\x04bertd\x00\x05falsed\x00\x04trued\x00\x05false".b
@ebin = "<<132,104,9,100,0,4,117,115,101,114,104,3,100,0,4,98,101,114,116,100,0,4,100,105,99,116,108,0,0,0,1,104,2,100,0,4,110,97,109,101,113,0,0,0,3,84,80,87,106,108,0,0,0,2,104,4,100,0,4,98,101,114,116,100,0,5,114,101,103,101,120,113,0,0,0,3,99,97,116,108,0,0,0,1,100,0,8,99,97,115,101,108,101,115,115,106,99,57,46,57,48,48,48,48,48,48,48,48,48,48,48,48,48,48,101,43,48,48,0,0,0,0,0,0,0,0,0,0,106,104,5,100,0,4,98,101,114,116,100,0,4,116,105,109,101,98,0,0,4,230,98,0,14,228,195,97,0,104,2,100,0,4,98,101,114,116,100,0,3,110,105,108,104,2,100,0,4,98,101,114,116,100,0,4,116,114,117,101,104,2,100,0,4,98,101,114,116,100,0,5,102,97,108,115,101,100,0,4,116,114,117,101,100,0,5,102,97,108,115,101>>"
end
teardown do
BERT::Encode.version = @old_version
end
should "decode new format" do
assert_equal @ruby, BERT.decode(@bert)
end
should "roundtrip string and maintain encoding" do
str = "日本語".encode 'EUC-JP'
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end
should "roundtrip binary string" do
str = "日本語".b
round = BERT.decode(BERT.encode(str))
assert_equal str, round
assert_equal str.encoding, round.encoding
end
should "encode" do
assert_equal @bert, BERT.encode(@ruby)
end
should "decode" do
assert_equal @ruby, BERT.decode(@bert)
should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
end
end
should "decode the old format" do
assert_equal @ruby, BERT.decode(@bert_old)
end
should "ebin" do
assert_equal @ebin, BERT.ebin(@bert)
assert_equal @ebin_old, BERT.ebin(@bert_old)
end
should "do roundtrips" do

Просмотреть файл

@ -82,6 +82,7 @@ class EncoderTest < Test::Unit::TestCase
end
should 'handle utf8 strings' do
str = "été".encode 'UTF-8'
bert = [131, 109, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
assert_equal bert, BERT::Encoder.encode("été")
end
@ -99,6 +100,36 @@ class EncoderTest < Test::Unit::TestCase
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
end
context "v2" do
setup do
@old_version = BERT::Encode.version
BERT::Encode.version = :v2
end
teardown do
BERT::Encode.version = @old_version
end
should 'handle utf8 strings' do
str = "été".encode 'UTF-8'
bert = [132, 113, 0, 0, 0, 5, 195, 169, 116, 195, 169].pack('C*')
assert_equal bert, BERT::Encoder.encode("été")
end
should 'handle utf8 symbols' do
bert = [132, 100, 0, 5, 195, 169, 116, 195, 169].pack('C*')
assert_equal bert, BERT::Encoder.encode(:'été')
end
should "handle bignums" do
bert = [132,110,8,0,0,0,232,137,4,35,199,138].pack('c*')
assert_equal bert, BERT::Encoder.encode(10_000_000_000_000_000_000)
bert = [132,110,8,1,0,0,232,137,4,35,199,138].pack('c*')
assert_equal bert, BERT::Encoder.encode(-10_000_000_000_000_000_000)
end
end
should "leave other stuff alone" do
before = [1, 2.0, [:foo, 'bar']]
assert_equal before, BERT::Encoder.convert(before)