Fri Dec 28 01:55:04 2007 Martin Duerst <duerst@it.aoyama.ac.jp>

	* transcode.c (transcode_dispatch): reverted some of the changes
	  in r14746.

	* transcode.c, enc/trans/single_byte.c: Added conversions to/from
	  US-ASCII and ASCII-8BIT (using data tables).

	* enc/trans/single_byte.c: Some spacing/ordering changes due to
	  automatic data file generation.

	* transcode_data.h, transcode.c: Preliminary code for using
	  micro-conversion functions.

	* test/ruby/test_transcode.rb: Added some tests for US-ASCII and
	  ASCII-8BIT conversions.



git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14766 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
duerst 2007-12-28 09:26:55 +00:00
Родитель 48af602e38
Коммит 793e9423cd
5 изменённых файлов: 152 добавлений и 51 удалений

Просмотреть файл

@ -1,3 +1,20 @@
Fri Dec 28 01:55:04 2007 Martin Duerst <duerst@it.aoyama.ac.jp>
* transcode.c (transcode_dispatch): reverted some of the changes
in r14746.
* transcode.c, enc/trans/single_byte.c: Added conversions to/from
US-ASCII and ASCII-8BIT (using data tables).
* enc/trans/single_byte.c: Some spacing/ordering changes due to
automatic data file generation.
* transcode_data.h, transcode.c: Preliminary code for using
micro-conversion functions.
* test/ruby/test_transcode.rb: Added some tests for US-ASCII and
ASCII-8BIT conversions.
Fri Dec 28 17:33:44 2007 Tanaka Akira <akr@fsij.org> Fri Dec 28 17:33:44 2007 Tanaka Akira <akr@fsij.org>
* time.c (make_time_t): verify mktime and timegm result. * time.c (make_time_t): verify mktime and timegm result.

Просмотреть файл

@ -1,5 +1,62 @@
#include "transcode_data.h" #include "transcode_data.h"
/*
 * Per-byte offset table for the US-ASCII / ASCII-8BIT transcoders.
 * Each input byte value indexes this array; the result is the index of
 * the entry to use in from_US_ASCII_infos.  The lower half (0x00-0x7F)
 * selects entry 0 and the upper half (0x80-0xFF) selects entry 1.
 */
static const unsigned char
from_US_ASCII_offsets[256] = {
    /* 0x00-0x7F: entry 0 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x00 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x10 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x20 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x30 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x40 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x50 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x60 */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,   /* 0x70 */
    /* 0x80-0xFF: entry 1 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x80 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0x90 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xA0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xB0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xC0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xD0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xE0 */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,   /* 0xF0 */
};
static const struct byte_lookup* const
from_US_ASCII_infos[2] = {
NOMAP, UNDEF,
};
/*
 * Lookup table for the US-ASCII / ASCII-8BIT conversions.  It pairs the
 * per-byte offset array with the info array those offsets index into.
 * The same table is referenced by all four transcoder descriptors
 * defined below it (see the generator's "used from" notes).
 */
static const BYTE_LOOKUP
from_US_ASCII = {
/* used from from_US_ASCII */
/* used from to_US_ASCII */
/* used from to_ASCII_8BIT */
/* used from from_ASCII_8BIT */
from_US_ASCII_offsets,
from_US_ASCII_infos
};
/*
 * Transcoder descriptor for US-ASCII -> UTF-8, registered in
 * Init_single_byte().  Uses the from_US_ASCII table, so bytes 0x00-0x7F
 * map directly and bytes 0x80-0xFF are reported as undefined.
 *
 * NOTE(review): the meaning of the two integer members (1, 0) and of the
 * two NULL slots is given by the rb_transcoder definition in
 * transcode_data.h, which is not fully visible here -- the NULLs are
 * presumably the pre-/postprocessor hooks; confirm.  Members not listed
 * in the initializer are zero-initialized.
 */
static rb_transcoder
rb_from_US_ASCII = {
"US-ASCII", "UTF-8", &from_US_ASCII, 1, 0,
NULL, NULL,
};
/*
 * Transcoder descriptor for UTF-8 -> US-ASCII, registered in
 * Init_single_byte().  It deliberately reuses the from_US_ASCII table:
 * bytes 0x00-0x7F map directly, and any byte with the high bit set
 * (i.e. any part of a multi-byte UTF-8 sequence) is reported as
 * undefined.
 *
 * NOTE(review): the integer members (1, 1) and the two NULL slots are
 * interpreted per the rb_transcoder definition in transcode_data.h,
 * which is not fully visible here; confirm their meaning there.
 */
static rb_transcoder
rb_to_US_ASCII = {
"UTF-8", "US-ASCII", &from_US_ASCII, 1, 1,
NULL, NULL,
};
/*
 * Transcoder descriptor for ASCII-8BIT -> UTF-8, registered in
 * Init_single_byte().  Shares the from_US_ASCII table, so only bytes
 * 0x00-0x7F are converted (unchanged); bytes 0x80-0xFF are reported as
 * undefined rather than passed through.
 *
 * NOTE(review): the integer members (1, 0) and the two NULL slots are
 * interpreted per the rb_transcoder definition in transcode_data.h,
 * which is not fully visible here; confirm their meaning there.
 */
static rb_transcoder
rb_from_ASCII_8BIT = {
"ASCII-8BIT", "UTF-8", &from_US_ASCII, 1, 0,
NULL, NULL,
};
/*
 * Transcoder descriptor for UTF-8 -> ASCII-8BIT, registered in
 * Init_single_byte().  Shares the from_US_ASCII table, so only bytes
 * 0x00-0x7F are converted (unchanged); any byte with the high bit set
 * is reported as undefined.
 *
 * NOTE(review): the integer members (1, 1) and the two NULL slots are
 * interpreted per the rb_transcoder definition in transcode_data.h,
 * which is not fully visible here; confirm their meaning there.
 */
static rb_transcoder
rb_to_ASCII_8BIT = {
"UTF-8", "ASCII-8BIT", &from_US_ASCII, 1, 1,
NULL, NULL,
};
static const unsigned char static const unsigned char
from_ISO_8859_1_offsets[256] = { from_ISO_8859_1_offsets[256] = {
/* used from from_ISO_8859_1 */ /* used from from_ISO_8859_1 */
@ -69,6 +126,7 @@ from_ISO_8859_1 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_1_infos from_ISO_8859_1_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_1 = { rb_from_ISO_8859_1 = {
"ISO-8859-1", "UTF-8", &from_ISO_8859_1, 2, 0, "ISO-8859-1", "UTF-8", &from_ISO_8859_1, 2, 0,
@ -167,6 +225,7 @@ to_ISO_8859_1 = {
to_ISO_8859_1_offsets, to_ISO_8859_1_offsets,
to_ISO_8859_1_infos to_ISO_8859_1_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_1 = { rb_to_ISO_8859_1 = {
"UTF-8", "ISO-8859-1", &to_ISO_8859_1, 1, 1, "UTF-8", "ISO-8859-1", &to_ISO_8859_1, 1, 1,
@ -214,6 +273,7 @@ from_ISO_8859_2 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_2_infos from_ISO_8859_2_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_2 = { rb_from_ISO_8859_2 = {
"ISO-8859-2", "UTF-8", &from_ISO_8859_2, 2, 0, "ISO-8859-2", "UTF-8", &from_ISO_8859_2, 2, 0,
@ -370,6 +430,7 @@ to_ISO_8859_2 = {
to_ISO_8859_2_offsets, to_ISO_8859_2_offsets,
to_ISO_8859_2_infos to_ISO_8859_2_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_2 = { rb_to_ISO_8859_2 = {
"UTF-8", "ISO-8859-2", &to_ISO_8859_2, 1, 1, "UTF-8", "ISO-8859-2", &to_ISO_8859_2, 1, 1,
@ -434,6 +495,7 @@ from_ISO_8859_3 = {
from_ISO_8859_3_offsets, from_ISO_8859_3_offsets,
from_ISO_8859_3_infos from_ISO_8859_3_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_3 = { rb_from_ISO_8859_3 = {
"ISO-8859-3", "UTF-8", &from_ISO_8859_3, 2, 0, "ISO-8859-3", "UTF-8", &from_ISO_8859_3, 2, 0,
@ -565,6 +627,7 @@ to_ISO_8859_3 = {
to_ISO_8859_2_offsets, to_ISO_8859_2_offsets,
to_ISO_8859_3_infos to_ISO_8859_3_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_3 = { rb_to_ISO_8859_3 = {
"UTF-8", "ISO-8859-3", &to_ISO_8859_3, 1, 1, "UTF-8", "ISO-8859-3", &to_ISO_8859_3, 1, 1,
@ -612,6 +675,7 @@ from_ISO_8859_4 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_4_infos from_ISO_8859_4_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_4 = { rb_from_ISO_8859_4 = {
"ISO-8859-4", "UTF-8", &from_ISO_8859_4, 2, 0, "ISO-8859-4", "UTF-8", &from_ISO_8859_4, 2, 0,
@ -747,6 +811,7 @@ to_ISO_8859_4 = {
to_ISO_8859_2_offsets, to_ISO_8859_2_offsets,
to_ISO_8859_4_infos to_ISO_8859_4_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_4 = { rb_to_ISO_8859_4 = {
"UTF-8", "ISO-8859-4", &to_ISO_8859_4, 1, 1, "UTF-8", "ISO-8859-4", &to_ISO_8859_4, 1, 1,
@ -826,6 +891,7 @@ from_ISO_8859_5 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_5_infos from_ISO_8859_5_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_5 = { rb_from_ISO_8859_5 = {
"ISO-8859-5", "UTF-8", &from_ISO_8859_5, 3, 0, "ISO-8859-5", "UTF-8", &from_ISO_8859_5, 3, 0,
@ -977,6 +1043,7 @@ to_ISO_8859_5 = {
to_ISO_8859_5_offsets, to_ISO_8859_5_offsets,
to_ISO_8859_5_infos to_ISO_8859_5_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_5 = { rb_to_ISO_8859_5 = {
"UTF-8", "ISO-8859-5", &to_ISO_8859_5, 1, 1, "UTF-8", "ISO-8859-5", &to_ISO_8859_5, 1, 1,
@ -1032,6 +1099,7 @@ from_ISO_8859_6 = {
from_ISO_8859_6_offsets, from_ISO_8859_6_offsets,
from_ISO_8859_6_infos from_ISO_8859_6_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_6 = { rb_from_ISO_8859_6 = {
"ISO-8859-6", "UTF-8", &from_ISO_8859_6, 2, 0, "ISO-8859-6", "UTF-8", &from_ISO_8859_6, 2, 0,
@ -1138,6 +1206,7 @@ to_ISO_8859_6 = {
to_ISO_8859_6_offsets, to_ISO_8859_6_offsets,
to_ISO_8859_6_infos to_ISO_8859_6_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_6 = { rb_to_ISO_8859_6 = {
"UTF-8", "ISO-8859-6", &to_ISO_8859_6, 1, 1, "UTF-8", "ISO-8859-6", &to_ISO_8859_6, 1, 1,
@ -1235,6 +1304,7 @@ from_ISO_8859_7 = {
from_ISO_8859_7_offsets, from_ISO_8859_7_offsets,
from_ISO_8859_7_infos from_ISO_8859_7_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_7 = { rb_from_ISO_8859_7 = {
"ISO-8859-7", "UTF-8", &from_ISO_8859_7, 3, 0, "ISO-8859-7", "UTF-8", &from_ISO_8859_7, 3, 0,
@ -1421,6 +1491,7 @@ to_ISO_8859_7 = {
to_ISO_8859_7_offsets, to_ISO_8859_7_offsets,
to_ISO_8859_7_infos to_ISO_8859_7_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_7 = { rb_to_ISO_8859_7 = {
"UTF-8", "ISO-8859-7", &to_ISO_8859_7, 1, 1, "UTF-8", "ISO-8859-7", &to_ISO_8859_7, 1, 1,
@ -1501,6 +1572,7 @@ from_ISO_8859_8 = {
from_ISO_8859_8_offsets, from_ISO_8859_8_offsets,
from_ISO_8859_8_infos from_ISO_8859_8_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_8 = { rb_from_ISO_8859_8 = {
"ISO-8859-8", "UTF-8", &from_ISO_8859_8, 3, 0, "ISO-8859-8", "UTF-8", &from_ISO_8859_8, 3, 0,
@ -1646,6 +1718,7 @@ to_ISO_8859_8 = {
to_ISO_8859_8_offsets, to_ISO_8859_8_offsets,
to_ISO_8859_8_infos to_ISO_8859_8_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_8 = { rb_to_ISO_8859_8 = {
"UTF-8", "ISO-8859-8", &to_ISO_8859_8, 1, 1, "UTF-8", "ISO-8859-8", &to_ISO_8859_8, 1, 1,
@ -1693,6 +1766,7 @@ from_ISO_8859_9 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_9_infos from_ISO_8859_9_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_9 = { rb_from_ISO_8859_9 = {
"ISO-8859-9", "UTF-8", &from_ISO_8859_9, 2, 0, "ISO-8859-9", "UTF-8", &from_ISO_8859_9, 2, 0,
@ -1795,6 +1869,7 @@ to_ISO_8859_9 = {
to_ISO_8859_9_offsets, to_ISO_8859_9_offsets,
to_ISO_8859_9_infos to_ISO_8859_9_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_9 = { rb_to_ISO_8859_9 = {
"UTF-8", "ISO-8859-9", &to_ISO_8859_9, 1, 1, "UTF-8", "ISO-8859-9", &to_ISO_8859_9, 1, 1,
@ -1874,6 +1949,7 @@ from_ISO_8859_10 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_10_infos from_ISO_8859_10_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_10 = { rb_from_ISO_8859_10 = {
"ISO-8859-10", "UTF-8", &from_ISO_8859_10, 3, 0, "ISO-8859-10", "UTF-8", &from_ISO_8859_10, 3, 0,
@ -2031,6 +2107,7 @@ to_ISO_8859_10 = {
to_ISO_8859_10_offsets, to_ISO_8859_10_offsets,
to_ISO_8859_10_infos to_ISO_8859_10_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_10 = { rb_to_ISO_8859_10 = {
"UTF-8", "ISO-8859-10", &to_ISO_8859_10, 1, 1, "UTF-8", "ISO-8859-10", &to_ISO_8859_10, 1, 1,
@ -2125,6 +2202,7 @@ from_ISO_8859_11 = {
from_ISO_8859_11_offsets, from_ISO_8859_11_offsets,
from_ISO_8859_11_infos from_ISO_8859_11_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_11 = { rb_from_ISO_8859_11 = {
"ISO-8859-11", "UTF-8", &from_ISO_8859_11, 3, 0, "ISO-8859-11", "UTF-8", &from_ISO_8859_11, 3, 0,
@ -2258,6 +2336,7 @@ to_ISO_8859_11 = {
to_ISO_8859_11_offsets, to_ISO_8859_11_offsets,
to_ISO_8859_11_infos to_ISO_8859_11_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_11 = { rb_to_ISO_8859_11 = {
"UTF-8", "ISO-8859-11", &to_ISO_8859_11, 1, 1, "UTF-8", "ISO-8859-11", &to_ISO_8859_11, 1, 1,
@ -2337,6 +2416,7 @@ from_ISO_8859_13 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_13_infos from_ISO_8859_13_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_13 = { rb_from_ISO_8859_13 = {
"ISO-8859-13", "UTF-8", &from_ISO_8859_13, 3, 0, "ISO-8859-13", "UTF-8", &from_ISO_8859_13, 3, 0,
@ -2481,6 +2561,7 @@ to_ISO_8859_13 = {
to_ISO_8859_10_offsets, to_ISO_8859_10_offsets,
to_ISO_8859_13_infos to_ISO_8859_13_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_13 = { rb_to_ISO_8859_13 = {
"UTF-8", "ISO-8859-13", &to_ISO_8859_13, 1, 1, "UTF-8", "ISO-8859-13", &to_ISO_8859_13, 1, 1,
@ -2560,6 +2641,7 @@ from_ISO_8859_14 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_14_infos from_ISO_8859_14_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_14 = { rb_from_ISO_8859_14 = {
"ISO-8859-14", "UTF-8", &from_ISO_8859_14, 3, 0, "ISO-8859-14", "UTF-8", &from_ISO_8859_14, 3, 0,
@ -2781,6 +2863,7 @@ to_ISO_8859_14 = {
to_ISO_8859_14_offsets, to_ISO_8859_14_offsets,
to_ISO_8859_14_infos to_ISO_8859_14_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_14 = { rb_to_ISO_8859_14 = {
"UTF-8", "ISO-8859-14", &to_ISO_8859_14, 1, 1, "UTF-8", "ISO-8859-14", &to_ISO_8859_14, 1, 1,
@ -2860,6 +2943,7 @@ from_ISO_8859_15 = {
from_ISO_8859_1_offsets, from_ISO_8859_1_offsets,
from_ISO_8859_15_infos from_ISO_8859_15_infos
}; };
static rb_transcoder static rb_transcoder
rb_from_ISO_8859_15 = { rb_from_ISO_8859_15 = {
"ISO-8859-15", "UTF-8", &from_ISO_8859_15, 3, 0, "ISO-8859-15", "UTF-8", &from_ISO_8859_15, 3, 0,
@ -2979,6 +3063,7 @@ to_ISO_8859_15 = {
to_ISO_8859_15_offsets, to_ISO_8859_15_offsets,
to_ISO_8859_15_infos to_ISO_8859_15_infos
}; };
static rb_transcoder static rb_transcoder
rb_to_ISO_8859_15 = { rb_to_ISO_8859_15 = {
"UTF-8", "ISO-8859-15", &to_ISO_8859_15, 1, 1, "UTF-8", "ISO-8859-15", &to_ISO_8859_15, 1, 1,
@ -2988,33 +3073,37 @@ rb_to_ISO_8859_15 = {
void void
Init_single_byte(void) Init_single_byte(void)
{ {
rb_register_transcoder(&rb_from_US_ASCII);
rb_register_transcoder(&rb_to_US_ASCII);
rb_register_transcoder(&rb_from_ASCII_8BIT);
rb_register_transcoder(&rb_to_ASCII_8BIT);
rb_register_transcoder(&rb_from_ISO_8859_1); rb_register_transcoder(&rb_from_ISO_8859_1);
rb_register_transcoder(&rb_from_ISO_8859_2);
rb_register_transcoder(&rb_from_ISO_8859_3);
rb_register_transcoder(&rb_from_ISO_8859_4);
rb_register_transcoder(&rb_from_ISO_8859_5);
rb_register_transcoder(&rb_from_ISO_8859_6);
rb_register_transcoder(&rb_from_ISO_8859_7);
rb_register_transcoder(&rb_from_ISO_8859_8);
rb_register_transcoder(&rb_from_ISO_8859_9);
rb_register_transcoder(&rb_from_ISO_8859_10);
rb_register_transcoder(&rb_from_ISO_8859_11);
rb_register_transcoder(&rb_from_ISO_8859_13);
rb_register_transcoder(&rb_from_ISO_8859_14);
rb_register_transcoder(&rb_from_ISO_8859_15);
rb_register_transcoder(&rb_to_ISO_8859_1); rb_register_transcoder(&rb_to_ISO_8859_1);
rb_register_transcoder(&rb_from_ISO_8859_2);
rb_register_transcoder(&rb_to_ISO_8859_2); rb_register_transcoder(&rb_to_ISO_8859_2);
rb_register_transcoder(&rb_from_ISO_8859_3);
rb_register_transcoder(&rb_to_ISO_8859_3); rb_register_transcoder(&rb_to_ISO_8859_3);
rb_register_transcoder(&rb_from_ISO_8859_4);
rb_register_transcoder(&rb_to_ISO_8859_4); rb_register_transcoder(&rb_to_ISO_8859_4);
rb_register_transcoder(&rb_from_ISO_8859_5);
rb_register_transcoder(&rb_to_ISO_8859_5); rb_register_transcoder(&rb_to_ISO_8859_5);
rb_register_transcoder(&rb_from_ISO_8859_6);
rb_register_transcoder(&rb_to_ISO_8859_6); rb_register_transcoder(&rb_to_ISO_8859_6);
rb_register_transcoder(&rb_from_ISO_8859_7);
rb_register_transcoder(&rb_to_ISO_8859_7); rb_register_transcoder(&rb_to_ISO_8859_7);
rb_register_transcoder(&rb_from_ISO_8859_8);
rb_register_transcoder(&rb_to_ISO_8859_8); rb_register_transcoder(&rb_to_ISO_8859_8);
rb_register_transcoder(&rb_from_ISO_8859_9);
rb_register_transcoder(&rb_to_ISO_8859_9); rb_register_transcoder(&rb_to_ISO_8859_9);
rb_register_transcoder(&rb_from_ISO_8859_10);
rb_register_transcoder(&rb_to_ISO_8859_10); rb_register_transcoder(&rb_to_ISO_8859_10);
rb_register_transcoder(&rb_from_ISO_8859_11);
rb_register_transcoder(&rb_to_ISO_8859_11); rb_register_transcoder(&rb_to_ISO_8859_11);
rb_register_transcoder(&rb_from_ISO_8859_13);
rb_register_transcoder(&rb_to_ISO_8859_13); rb_register_transcoder(&rb_to_ISO_8859_13);
rb_register_transcoder(&rb_from_ISO_8859_14);
rb_register_transcoder(&rb_to_ISO_8859_14); rb_register_transcoder(&rb_to_ISO_8859_14);
rb_register_transcoder(&rb_from_ISO_8859_15);
rb_register_transcoder(&rb_to_ISO_8859_15); rb_register_transcoder(&rb_to_ISO_8859_15);
} }
/* Footprint (bytes): gross: 26788, saved: 3728, net: 23060 */ /* Footprint (bytes): gross: 27876, saved: 4544, net: 23332 */

Просмотреть файл

@ -26,6 +26,8 @@ class TestTranscode < Test::Unit::TestCase
assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') } assert_raise(ArgumentError) { 'abc'.encode!('foo', 'bar') }
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') } assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode('foo') }
assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') } assert_raise(ArgumentError) { 'abc'.force_encoding('utf-8').encode!('foo') }
assert_raise(RuntimeError) { "\x80".encode('utf-8','ASCII-8BIT') }
assert_raise(RuntimeError) { "\x80".encode('utf-8','US-ASCII') }
assert_raise(RuntimeError) { "\xA5".encode('utf-8','iso-8859-3') } assert_raise(RuntimeError) { "\xA5".encode('utf-8','iso-8859-3') }
end end
@ -87,6 +89,7 @@ class TestTranscode < Test::Unit::TestCase
def test_ascii_range def test_ascii_range
encodings = [ encodings = [
'US-ASCII', 'ASCII-8BIT',
'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3', 'ISO-8859-1', 'ISO-8859-2', 'ISO-8859-3',
'ISO-8859-4', 'ISO-8859-5', 'ISO-8859-6', 'ISO-8859-4', 'ISO-8859-5', 'ISO-8859-6',
'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9', 'ISO-8859-7', 'ISO-8859-8', 'ISO-8859-9',

Просмотреть файл

@ -89,6 +89,8 @@ rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib)
static void static void
init_transcoder_table(void) init_transcoder_table(void)
{ {
rb_declare_transcoder("US-ASCII", "UTF-8", "single_byte");
rb_declare_transcoder("ASCII-8BIT", "UTF-8", "single_byte");
rb_declare_transcoder("ISO-8859-1", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-1", "UTF-8", "single_byte");
rb_declare_transcoder("ISO-8859-2", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-2", "UTF-8", "single_byte");
rb_declare_transcoder("ISO-8859-3", "UTF-8", "single_byte"); rb_declare_transcoder("ISO-8859-3", "UTF-8", "single_byte");
@ -173,6 +175,7 @@ transcode_loop(char **in_pos, char **out_pos,
follow_byte: follow_byte:
next_offset = next_table->base[next_byte]; next_offset = next_table->base[next_byte];
next_info = (VALUE)next_table->info[next_offset]; next_info = (VALUE)next_table->info[next_offset];
follow_info:
switch (next_info & 0x1F) { switch (next_info & 0x1F) {
case NOMAP: case NOMAP:
*out_p++ = next_byte; *out_p++ = next_byte;
@ -191,7 +194,7 @@ transcode_loop(char **in_pos, char **out_pos,
else else
goto invalid; goto invalid;
} }
next_table = next_table->info[next_offset]; next_table = (const BYTE_LOOKUP *)next_info;
goto follow_byte; goto follow_byte;
/* maybe rewrite the following cases to use fallthrough???? */ /* maybe rewrite the following cases to use fallthrough???? */
case ZERObt: /* drop input */ case ZERObt: /* drop input */
@ -210,6 +213,9 @@ transcode_loop(char **in_pos, char **out_pos,
*out_p++ = getBT2(next_info); *out_p++ = getBT2(next_info);
*out_p++ = getBT3(next_info); *out_p++ = getBT3(next_info);
continue; continue;
case FUNii:
next_info = (VALUE)(*my_transcoder->func_ii)(next_info);
goto follow_info;
case INVALID: case INVALID:
goto invalid; goto invalid;
case UNDEF: case UNDEF:
@ -287,7 +293,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
return -1; return -1;
} }
if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) { if (from_enc && to_enc && rb_enc_asciicompat(from_enc) && rb_enc_asciicompat(to_enc)) {
if (to_encidx == 0 || ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) { if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT) {
return to_encidx; return to_encidx;
} }
} }
@ -295,25 +301,6 @@ str_transcode(int argc, VALUE *argv, VALUE *self)
return -1; return -1;
} }
if (from_encidx == 0) {
const char *p = RSTRING_PTR(str);
const char *e = p + RSTRING_LEN(str);
while (p < e) {
int ret = rb_enc_precise_mbclen(p, e, to_enc);
int len = MBCLEN_CHARFOUND(ret);
if (!len) {
rb_raise(rb_eArgError, "not fully converted, %d bytes left", e-p);
}
p += len;
}
if (to_encidx < 0) {
to_encidx = rb_define_dummy_encoding(to_e);
}
return to_encidx;
}
while (!final_encoding) { /* loop for multistep transcoding */ while (!final_encoding) { /* loop for multistep transcoding */
/* later, maybe use smaller intermediate strings for very long strings */ /* later, maybe use smaller intermediate strings for very long strings */
if (!(my_transcoder = transcode_dispatch(from_e, to_e))) { if (!(my_transcoder = transcode_dispatch(from_e, to_e))) {
@ -412,6 +399,7 @@ rb_str_transcode_bang(int argc, VALUE *argv, VALUE str)
/* /*
* call-seq: * call-seq:
* str.encode(encoding) => str * str.encode(encoding) => str
* str.encode(to_encoding, from_encoding) => str
* *
* With one argument, returns a copy of <i>str</i> transcoded * With one argument, returns a copy of <i>str</i> transcoded
* to encoding +encoding+. * to encoding +encoding+.

Просмотреть файл

@ -27,24 +27,27 @@ typedef struct byte_lookup {
#define PType (const BYTE_LOOKUP *) #define PType (const BYTE_LOOKUP *)
#endif #endif
#define NOMAP (PType 0x01) /* single byte direct map */ #define NOMAP (PType 0x01) /* single byte direct map */
#define ONEbt (0x02) /* one byte payload */ #define ONEbt (0x02) /* one byte payload */
#define TWObt (0x03) /* two bytes payload */ #define TWObt (0x03) /* two bytes payload */
#define THREEbt (0x05) /* three bytes payload */ #define THREEbt (0x05) /* three bytes payload */
#define FOURbt (0x06) /* four bytes payload, UTF-8 only, macros start at getBT0 */ #define FOURbt (0x06) /* four bytes payload, UTF-8 only, macros start at getBT0 */
#define INVALID (PType 0x07) /* invalid byte sequence */ #define INVALID (PType 0x07) /* invalid byte sequence */
#define UNDEF (PType 0x09) /* legal but undefined */ #define UNDEF (PType 0x09) /* legal but undefined */
#define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. remove */ #define ZERObt (PType 0x0A) /* zero bytes of payload, i.e. remove */
#define FUNii (PType 0x0B) /* function from info to info */
#define o1(b1) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|ONEbt)) #define o1(b1) (PType((((unsigned char)(b1))<<8)|ONEbt))
#define o2(b1,b2) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt)) #define o2(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|TWObt))
#define o3(b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt)) #define o3(b1,b2,b3) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|THREEbt))
#define o4(b0,b1,b2,b3) ((const BYTE_LOOKUP *)((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt)) #define o4(b0,b1,b2,b3) (PType((((unsigned char)(b1))<< 8)|(((unsigned char)(b2))<<16)|(((unsigned char)(b3))<<24)|((((unsigned char)(b0))&0x07)<<5)|FOURbt))
#define getBT1(a) (((a)>> 8)&0xFF) #define getBT1(a) (((a)>> 8)&0xFF)
#define getBT2(a) (((a)>>16)&0xFF) #define getBT2(a) (((a)>>16)&0xFF)
#define getBT3(a) (((a)>>24)&0xFF) #define getBT3(a) (((a)>>24)&0xFF)
#define getBT0(a) ((((a)>> 5)&0x07)|0xF0) /* for UTF-8 only!!! */ #define getBT0(a) ((((a)>> 5)&0x07)|0xF0) /* for UTF-8 only!!! */
#define o2FUNii(b1,b2) (PType((((unsigned char)(b1))<<8)|(((unsigned char)(b2))<<16)|FUNii))
/* do we need these??? maybe not, can be done with simple tables */ /* do we need these??? maybe not, can be done with simple tables */
#define ONETRAIL /* legal but undefined if one more trailing UTF-8 */ #define ONETRAIL /* legal but undefined if one more trailing UTF-8 */
@ -70,6 +73,7 @@ typedef struct rb_transcoder {
struct rb_transcoder *, struct rb_transcoding *); struct rb_transcoder *, struct rb_transcoding *);
void (*postprocessor)(char**, char**, char*, char*, void (*postprocessor)(char**, char**, char*, char*,
struct rb_transcoder *, struct rb_transcoding *); struct rb_transcoder *, struct rb_transcoding *);
VALUE (*func_ii)(VALUE); /* function from info to info */
} rb_transcoder; } rb_transcoder;
void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib); void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib);