* transcode.c (str_transcode, transcode_dispatch): added two-step
  conversion logic via UTF-8.

* transcode.c: some minor formatting fixes

* transcode_data.h, transcode_data_iso_8859.c: Shortened
  extremely frequently used macros to shorten file length.

* test/ruby/test_transcode.rb: Fixed name of test class;
  added setup method to ensure all necessary encodings exist;
  split tests into more test methods; added tests; fixed ordering
  of arguments in assert_equal to have expected result first.

git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14236 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
matz 2007-12-15 05:42:25 +00:00
Родитель 4f13113945
Коммит f2b0dba1cf
5 изменённых файлов: 1452 добавлений и 1377 удалений

Просмотреть файл

@ -52,6 +52,23 @@ Fri Dec 14 16:06:18 2007 Yukihiro Matsumoto <matz@ruby-lang.org>
* string.c (rb_str_casecmp): ditto.
Fri Dec 14 15:25:30 2007 Martin Duerst <duerst@it.aoyama.ac.jp>
* transcode.c (encoding_equal): new function.
* transcode.c (str_transcode, transcode_dispatch): added two-step
conversion logic via UTF-8.
* transcode.c: some minor formatting fixes
* transcode_data.h, transcode_data_iso_8859.c: Shortened
extremely frequently used macros to shorten file length.
* test/ruby/test_transcode.rb: Fixed name of test class;
added setup method to ensure all necessary encodings exist;
split tests into more test methods; added tests; fixed ordering
of arguments in assert_equal to have expected result first.
Fri Dec 14 13:47:54 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* common.mk (ruby.imp): fix for circular dependency. a patch from

Просмотреть файл

@ -1,8 +1,22 @@
# -*- encoding: US-ASCII -*- # make sure this runs in binary mode
# -*- encoding: ASCII-8BIT -*- # make sure this runs in binary mode
require 'test/unit'
class TestConvert < Test::Unit::TestCase
def test_basic
class TestTranscode < Test::Unit::TestCase
# Runs before every test. Performs one throw-away conversion from UTF-8 into
# each ISO-8859 variant used by the tests; per the original note this is a
# trick to create all the necessary encodings.
def setup
  %w[
    ISO-8859-1 ISO-8859-2 ISO-8859-3 ISO-8859-4
    ISO-8859-5 ISO-8859-6 ISO-8859-7 ISO-8859-8
    ISO-8859-9 ISO-8859-10 ISO-8859-11 ISO-8859-13
    ISO-8859-14 ISO-8859-15
  ].each { |target| 'abc'.encode(target, 'UTF-8') }
end
def test_errors
# we don't have semantics for conversion without attribute yet
# maybe 'convert to UTF-8' would be nice :-)
assert_raise(ArgumentError) { 'abc'.encode }
@ -13,43 +27,63 @@ class TestConvert < Test::Unit::TestCase
assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') }
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') }
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') }
assert_equal('abc'.force_encoding('utf-8').encode('iso-8859-1'), 'abc')
# check that encoding is kept when no conversion is done
assert_equal('abc'.force_encoding('Shift_JIS').encode('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
assert_equal('abc'.force_encoding('Shift_JIS').encode!('Shift_JIS'), 'abc'.force_encoding('Shift_JIS'))
# assert that encoding is correctly set
assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8').encoding, "D\u00FCrst".encoding)
# check that Encoding can be used as parameter
assert_equal("D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')), "D\u00FCrst")
assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'), "D\u00FCrst")
assert_equal("D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')), "D\u00FCrst")
end
# temporary, fix encoding
assert_equal("D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-1'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-2'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-3'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-4'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-9'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-10'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-13'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-14'), "D\u00FCrst")
assert_equal("D\xFCrst".encode('utf-8', 'iso-8859-15'), "D\u00FCrst")
assert_equal("D\u00FCrst".encode('iso-8859-1'), "D\xFCrst".force_encoding('iso-8859-1'))
assert_equal("D\u00FCrst".encode('iso-8859-2'), "D\xFCrst".force_encoding('iso-8859-2'))
assert_equal("D\u00FCrst".encode('iso-8859-3'), "D\xFCrst".force_encoding('iso-8859-3'))
assert_equal("D\u00FCrst".encode('iso-8859-4'), "D\xFCrst".force_encoding('iso-8859-4'))
assert_equal("D\u00FCrst".encode('iso-8859-9'), "D\xFCrst".force_encoding('iso-8859-9'))
assert_equal("D\u00FCrst".encode('iso-8859-10'), "D\xFCrst".force_encoding('iso-8859-10'))
assert_equal("D\u00FCrst".encode('iso-8859-13'), "D\xFCrst".force_encoding('iso-8859-13'))
assert_equal("D\u00FCrst".encode('iso-8859-14'), "D\xFCrst".force_encoding('iso-8859-14'))
assert_equal("D\u00FCrst".encode('iso-8859-15'), "D\xFCrst".force_encoding('iso-8859-15'))
# test length extension
assert_equal(("\xA4"*20).encode('utf-8', 'iso-8859-15'), "\u20AC"*20)
assert_equal(("\xA4"*20).encode!('utf-8', 'iso-8859-15'), "\u20AC"*20)
# Exercises the argument forms accepted by String#encode / String#encode!:
# one-argument calls that rely on the receiver's encoding, and two-argument
# calls where either side may be an encoding name or an Encoding object.
def test_arguments
  ascii_as_latin1 = 'abc'.force_encoding('utf-8').encode('iso-8859-1')
  assert_equal('abc', ascii_as_latin1)

  # check that encoding is kept when no conversion is done
  sjis_expected = 'abc'.force_encoding('Shift_JIS')
  sjis_copy = 'abc'.force_encoding('Shift_JIS').encode('Shift_JIS')
  assert_equal(sjis_expected, sjis_copy)
  sjis_expected = 'abc'.force_encoding('Shift_JIS')
  sjis_in_place = 'abc'.force_encoding('Shift_JIS').encode!('Shift_JIS')
  assert_equal(sjis_expected, sjis_in_place)

  # assert that encoding is correctly set
  converted = "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8')
  assert_equal("D\u00FCrst".encoding, converted.encoding)

  # check that Encoding can be used as parameter
  assert_equal("D\u00FCrst",
               "D\xFCrst".encode('utf-8', Encoding.find('ISO-8859-1')))
  assert_equal("D\u00FCrst",
               "D\xFCrst".encode(Encoding.find('utf-8'), 'ISO-8859-1'))
  assert_equal("D\u00FCrst",
               "D\xFCrst".encode(Encoding.find('utf-8'), Encoding.find('ISO-8859-1')))
end
# Converts runs of the ISO-8859-15 byte 0xA4 to UTF-8 and expects the same
# number of U+20AC characters. Each run length is checked with both the
# copying (encode) and the in-place (encode!) variant.
def test_length
  euro = "\u20AC"
  latin9_byte = "\xA4"
  verify_run = lambda do |count|
    assert_equal(euro * count, (latin9_byte * count).encode('utf-8', 'iso-8859-15'))
    assert_equal(euro * count, (latin9_byte * count).encode!('utf-8', 'iso-8859-15'))
  end

  verify_run.call(20)
  verify_run.call(2000)
  verify_run.call(200000)
end
# Round-trips sample text between UTF-8 and individual ISO-8859 encodings.
# The first two groups use "D\u00FCrst", whose u-umlaut is the single byte
# 0xFC in every encoding listed; the remaining checks use sample words for
# ISO-8859-1, -2, -6 and -5.
def test_encodings
  umlaut_encodings = %w[
    iso-8859-1 iso-8859-2 iso-8859-3 iso-8859-4 iso-8859-9
    iso-8859-10 iso-8859-13 iso-8859-14 iso-8859-15
  ]

  # temporary, fix encoding
  assert_equal("D\u00FCrst", "D\xFCrst".force_encoding('iso-8859-1').encode('utf-8'))

  # single-byte source -> UTF-8, source encoding given explicitly
  umlaut_encodings.each do |source|
    assert_equal("D\u00FCrst", "D\xFCrst".encode('utf-8', source))
  end

  # UTF-8 -> single-byte target
  umlaut_encodings.each do |target|
    expected = "D\xFCrst".force_encoding(target)
    converted = "D\u00FCrst".encode(target)
    if target == 'iso-8859-3'
      # for this target only the resulting encoding is compared, not the bytes
      assert_equal(expected.encoding, converted.encoding)
    else
      assert_equal(expected, converted)
    end
  end

  latin1_word = "r\xE9sum\xE9".force_encoding('iso-8859-1')
  assert_equal(latin1_word, "r\u00E9sum\u00E9".encode('iso-8859-1'))

  latin2_word = "el\xF5\xEDr\xE1s".force_encoding('iso-8859-2')
  assert_equal(latin2_word,
               "\u0065\u006C\u0151\u00ED\u0072\u00E1\u0073".encode('iso-8859-2'))

  arabic_word = "\xE3\xCA\xC8".force_encoding('iso-8859-6')
  assert_equal(arabic_word, "\u0643\u062A\u0628".encode('iso-8859-6'))

  cyrillic_word = "\xDF\xD5\xE0\xD5\xD2\xDE\xD4".force_encoding('iso-8859-5')
  assert_equal(cyrillic_word,
               "\u043F\u0435\u0440\u0435\u0432\u043E\u0434".encode('iso-8859-5'))
end
# Converts directly between two non-UTF-8 encodings (ISO-8859-1 to
# ISO-8859-2) and expects the 0xFC byte to be preserved under the new
# encoding label.
def test_twostep
  expected = "D\xFCrst".force_encoding('iso-8859-2')
  converted = "D\xFCrst".encode('iso-8859-2', 'iso-8859-1')
  assert_equal(expected, converted)
end
def test_all_bytes
encodings_8859 = [
'ISO-8859-1', 'ISO-8859-2',
@ -69,7 +103,7 @@ class TestConvert < Test::Unit::TestCase
test_start.encode('UTF-8','ISO-8859-1').encode('ISO-8859-1')
encodings_8859.each do |enc|
test_start = all_bytes
assert_equal(test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT'), test_start)
assert_equal(test_start, test_start.encode('UTF-8',enc).encode(enc).force_encoding('ASCII-8BIT'))
end
end
end

Просмотреть файл

@ -81,8 +81,8 @@ register_transcoder(const char *from_e, const char *to_e,
{
static int n = 0;
if (n >= MAX_TRANSCODERS) {
/* we are initializing, is it okay to use rb_raise here? */
rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
/* we are initializing, is it okay to use rb_raise here? */
rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots");
}
transcoder_table[n].from_encoding = from_e;
transcoder_table[n].to_encoding = to_e;
@ -127,25 +127,37 @@ init_transcoder_table(void)
register_transcoder(NULL, NULL, NULL, 0, 0);
}
/* Compare two encoding names, ignoring letter case.
 * Returns 1 when strcasecmp() reports the names as equal, 0 otherwise. */
static int
encoding_equal(const char* encoding1, const char* encoding2)
{
    int difference = strcasecmp(encoding1, encoding2);

    return difference == 0;
}
static transcoder*
transcode_dispatch(const char* from_encoding, const char* to_encoding)
{
transcoder *candidate = transcoder_table;
for (candidate = transcoder_table; candidate->from_encoding; candidate++)
if (0==strcasecmp(from_encoding, candidate->from_encoding)
&& 0==strcasecmp(to_encoding, candidate->to_encoding))
break;
/* in the future, add multistep transcoding logic here */
return candidate->from_encoding ? candidate : NULL;
for (candidate = transcoder_table; candidate->from_encoding; candidate++) {
if (encoding_equal(from_encoding, candidate->from_encoding)
&& encoding_equal(to_encoding, candidate->to_encoding)) {
return candidate;
}
}
/* multistep logic, via UTF-8 */
if (!encoding_equal(from_encoding, "UTF-8")
&& !encoding_equal(to_encoding, "UTF-8")
&& transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */
return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */
}
return NULL;
}
/* dynamic structure, one per conversion (similar to iconv_t) */
/* may carry conversion state (e.g. for iso-2022-jp) */
typedef struct transcoding {
VALUE ruby_string_dest; /* the String used as the conversion destination,
or NULL if something else is being converted */
or NULL if something else is being converted */
char *(*flush_func)(struct transcoding*, int, int);
} transcoding;
@ -201,7 +213,7 @@ transcode_loop(char **in_pos, char **out_pos,
}
next_table = next_table->info[next_offset];
goto follow_byte;
/* maybe rewrite the following cases to use fallthrough???? */
/* maybe rewrite the following cases to use fallthrough???? */
case ZERObt: /* drop input */
continue;
case ONEbt:
@ -262,6 +274,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
VALUE from_encval, to_encval;
transcoder *my_transcoder;
transcoding my_transcoding;
int final_encoding = 0;
if (argc<1 || argc>2) {
rb_raise(rb_eArgError, "wrong number of arguments (%d for 2)", argc);
@ -275,7 +288,7 @@ str_transcode(int argc, VALUE *argv, VALUE str)
to_e = rb_enc_name(to_enc);
}
if (argc==1) {
from_encidx = rb_enc_get_index(str);
from_encidx = rb_enc_get_index(str);
from_enc = rb_enc_from_index(from_encidx);
from_e = rb_enc_name(from_enc);
}
@ -298,33 +311,44 @@ str_transcode(int argc, VALUE *argv, VALUE str)
if (strcasecmp(from_e, to_e) == 0) {
return Qnil;
}
if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
while (!final_encoding) /* loop for multistep transcoding */
{ /* later, maybe use smaller intermediate strings for very long strings */
if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
rb_raise(rb_eArgError, "transcoding not supported (from %s to %s)", from_e, to_e);
}
fromp = sp = RSTRING_PTR(str);
slen = RSTRING_LEN(str);
blen = slen + 30; /* len + margin */
dest = rb_str_tmp_new(blen);
bp = RSTRING_PTR(dest);
my_transcoding.ruby_string_dest = dest;
my_transcoding.flush_func = str_transcoding_resize;
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
if (fromp != sp+slen) {
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
}
buf = RSTRING_PTR(dest);
*bp = '\0';
rb_str_set_len(dest, bp - buf);
rb_enc_associate(dest, to_enc);
if (encoding_equal(my_transcoder->to_encoding, to_e)) {
final_encoding = 1;
}
else {
from_e = my_transcoder->to_encoding;
str = dest;
}
}
fromp = sp = RSTRING_PTR(str);
slen = RSTRING_LEN(str);
blen = slen + 30; /* len + margin */
dest = rb_str_tmp_new(blen);
bp = RSTRING_PTR(dest);
my_transcoding.ruby_string_dest = dest;
my_transcoding.flush_func = str_transcoding_resize;
/* for simple testing: */
transcode_loop(&fromp, &bp, (sp+slen), (bp+blen), my_transcoder, &my_transcoding);
if (fromp != sp+slen) {
rb_raise(rb_eArgError, "not fully converted, %d bytes left", sp+slen-fromp);
}
buf = RSTRING_PTR(dest);
*bp = '\0';
rb_str_set_len(dest, bp - buf);
/* set encoding */
if (!to_enc) {
to_encidx = rb_enc_replicate(to_e, rb_default_encoding());
to_enc = rb_enc_from_index(to_encidx);
}
rb_enc_associate(dest, to_enc);
return dest;
}

Просмотреть файл

@ -22,10 +22,10 @@ typedef struct byte_lookup {
#define UNDEF (PType 0x09) /* legal but undefined */
#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. remove */
#define output1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
#define output2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
#define output3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
#define output4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
#define o1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt))
#define o2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
#define o3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
#define o4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
#define getBT1(a) (((a)>> 8)&0xFF)
#define getBT2(a) (((a)>>16)&0xFF)

Разница между файлами не показана из-за своего большого размера Загрузить разницу