diff --git a/ChangeLog b/ChangeLog index ceaee71160..b8b15e1b06 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,20 @@ +Tue Dec 25 14:57:00 2007 Nobuyoshi Nakada + + * common.mk (COMMONOBJS): transcode_data_*.c moved under enc/trans. + + * transcode_data.h (rb_transcoding, rb_transcoder): prefixed. + + * transcode.c (rb_register_transcoder, rb_declare_transcoder): split + declaration and registration. [ruby-dev:32704] + + * transcode.c (transcode_dispatch): autoload pre-declared transcoder. + + * transcode.c (str_transcode): use rb_define_dummy_encoding(). + + * transcode.c (Init_transcode): initialize transcoder tables. + + * enc/trans/single_byte.c, enc/trans/japanese.c: moved from top. + Tue Dec 25 14:20:13 2007 Yukihiro Matsumoto * lib/mkmf.rb (map_dir): should generate path including $top_srcdir. diff --git a/common.mk b/common.mk index 634fdcaf54..fd5c366b5c 100644 --- a/common.mk +++ b/common.mk @@ -60,8 +60,6 @@ COMMONOBJS = array.$(OBJEXT) \ struct.$(OBJEXT) \ time.$(OBJEXT) \ transcode.$(OBJEXT) \ - transcode_data_one_byte.$(OBJEXT) \ - transcode_data_japanese.$(OBJEXT) \ util.$(OBJEXT) \ variable.$(OBJEXT) \ version.$(OBJEXT) \ @@ -548,8 +546,6 @@ thread.$(OBJEXT): {$(VPATH)}thread.c {$(VPATH)}eval_intern.h \ {$(VPATH)}signal.h {$(VPATH)}st.h {$(VPATH)}dln.h transcode.$(OBJEXT): {$(VPATH)}transcode.c {$(VPATH)}transcode_data.h {$(VPATH)}ruby.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h {$(VPATH)}encoding.h -transcode_data_one_byte.$(OBJEXT): {$(VPATH)}transcode_data_one_byte.c {$(VPATH)}transcode_data.h -transcode_data_japanese.$(OBJEXT): {$(VPATH)}transcode_data_japanese.c {$(VPATH)}transcode_data.h cont.$(OBJEXT): {$(VPATH)}cont.c {$(VPATH)}eval_intern.h \ {$(VPATH)}ruby.h {$(VPATH)}vm_core.h {$(VPATH)}id.h {$(VPATH)}config.h \ {$(VPATH)}defines.h {$(VPATH)}intern.h {$(VPATH)}missing.h \ diff --git a/transcode_data_japanese.c b/enc/trans/japanese.c similarity index 99% rename from transcode_data_japanese.c rename to enc/trans/japanese.c index 860b824221..3e03f2c969 100644 --- a/transcode_data_japanese.c +++ b/enc/trans/japanese.c @@ -1,4 +1,3 @@ -#define TRANSCODE_DATA #include "transcode_data.h" static const unsigned char @@ -4418,11 +4417,16 @@ from_SHIFT_JIS_infos[108] = { &from_SHIFT_JIS_FA, &from_SHIFT_JIS_FB, &from_SHIFT_JIS_FC, UNDEF, }; -const BYTE_LOOKUP -rb_from_SHIFT_JIS = { +static const BYTE_LOOKUP +from_SHIFT_JIS = { from_SHIFT_JIS_offsets, from_SHIFT_JIS_infos }; +static rb_transcoder +rb_from_SHIFT_JIS = { + "UTF-8", "SHIFT_JIS", &from_SHIFT_JIS, 3, 0, + NULL, NULL, +}; static const unsigned char to_SHIFT_JIS_C2_offsets[64] = { @@ -13343,11 +13347,16 @@ to_SHIFT_JIS_infos[17] = { &to_SHIFT_JIS_E7, &to_SHIFT_JIS_E8, &to_SHIFT_JIS_E9, &to_SHIFT_JIS_EF, UNDEF, }; -const BYTE_LOOKUP -rb_to_SHIFT_JIS = { +static const BYTE_LOOKUP +to_SHIFT_JIS = { to_SHIFT_JIS_offsets, to_SHIFT_JIS_infos }; +static rb_transcoder +rb_to_SHIFT_JIS = { + "SHIFT_JIS", "UTF-8", &to_SHIFT_JIS, 2, 1, + NULL, NULL, +}; static const unsigned char from_EUC_JP_8E_offsets[256] = { @@ -18112,11 +18121,16 @@ from_EUC_JP_infos[85] = { &from_EUC_JP_F9, &from_EUC_JP_FA, &from_EUC_JP_FB, &from_EUC_JP_FC, UNDEF, }; -const BYTE_LOOKUP -rb_from_EUC_JP = { +static const BYTE_LOOKUP +from_EUC_JP = { from_EUC_JP_offsets, from_EUC_JP_infos }; +static rb_transcoder +rb_from_EUC_JP = { + "UTF-8", "EUC-JP", &from_EUC_JP, 3, 0, + NULL, NULL, +}; static const struct byte_lookup* const to_EUC_JP_C2_infos[14] = { @@ -23612,11 +23626,16 @@ to_EUC_JP_infos[17] = { &to_EUC_JP_E7, &to_EUC_JP_E8, &to_EUC_JP_E9, &to_EUC_JP_EF, UNDEF, }; -const BYTE_LOOKUP -rb_to_EUC_JP = { +static const BYTE_LOOKUP +to_EUC_JP = { to_SHIFT_JIS_offsets, to_EUC_JP_infos }; +static rb_transcoder +rb_to_EUC_JP = { + "EUC_JP", "UTF-8", &to_EUC_JP, 2, 1, + NULL, NULL, +}; #define ISO_2022_ENCODING(escseq, byte) ((escseq<<8)|byte) enum ISO_2022_ESCSEQ { @@ -23699,11 +23718,11 @@ get_iso_2022_mode(char **in_pos) return new_mode; } -void +static void from_iso_2022_jp_transcoder_preprocessor(char **in_pos, char **out_pos, - char *in_stop, char *out_stop, - transcoder *my_transcoder, - transcoding *my_transcoding) + char *in_stop, char *out_stop, + rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding) { char *in_p = *in_pos, *out_p = *out_pos; int cur_mode = ISO_2022_GZ_ASCII; @@ -23779,11 +23798,11 @@ select_iso_2022_mode(char **out_pos, int new_mode) return new_mode; } -void +static void to_iso_2022_jp_transcoder_postprocessor(char **in_pos, char **out_pos, - char *in_stop, char *out_stop, - transcoder *my_transcoder, - transcoding *my_transcoding) + char *in_stop, char *out_stop, + rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding) { char *in_p = *in_pos, *out_p = *out_pos; int cur_mode = ISO_2022_GZ_ASCII, new_mode = 0; @@ -23824,3 +23843,26 @@ to_iso_2022_jp_transcoder_postprocessor(char **in_pos, char **out_pos, *in_pos = in_p; *out_pos = out_p; } + +static rb_transcoder +rb_from_ISO_2022_JP = { + "ISO-2022-JP", "UTF-8", &from_EUC_JP, 8, 0, + &from_iso_2022_jp_transcoder_preprocessor, NULL, +}; + +static rb_transcoder +rb_to_ISO_2022_JP = { + "UTF-8", "ISO-2022-JP", &to_EUC_JP, 8, 1, + NULL, &to_iso_2022_jp_transcoder_postprocessor, +}; + +void +Init_japanese(void) +{ + rb_register_transcoder(&rb_from_SHIFT_JIS); + rb_register_transcoder(&rb_from_EUC_JP); + rb_register_transcoder(&rb_to_SHIFT_JIS); + rb_register_transcoder(&rb_to_EUC_JP); + rb_register_transcoder(&rb_from_ISO_2022_JP); + rb_register_transcoder(&rb_to_ISO_2022_JP); +} diff --git a/transcode_data_one_byte.c b/enc/trans/single_byte.c similarity index 95% rename from transcode_data_one_byte.c rename to enc/trans/single_byte.c index f050bfeb4f..ca4006d542 100644 --- a/transcode_data_one_byte.c +++ b/enc/trans/single_byte.c @@ -1,4 +1,3 @@ -#define TRANSCODE_DATA #include "transcode_data.h" static const unsigned char @@ -65,11 +64,16 @@ from_ISO_8859_1_infos[129] = { o2(0xC3,0xBB), o2(0xC3,0xBC), o2(0xC3,0xBD), o2(0xC3,0xBE), o2(0xC3,0xBF), }; -const BYTE_LOOKUP -rb_from_ISO_8859_1 = { +static const BYTE_LOOKUP +from_ISO_8859_1 = { from_ISO_8859_1_offsets, from_ISO_8859_1_infos }; +static rb_transcoder +rb_from_ISO_8859_1 = { + "ISO-8859-1", "UTF-8", &from_ISO_8859_1, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_1_C2_offsets[64] = { @@ -158,11 +162,16 @@ to_ISO_8859_1_infos[4] = { NOMAP, &to_ISO_8859_1_C2, &to_ISO_8859_1_C3, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_1 = { +static const BYTE_LOOKUP +to_ISO_8859_1 = { to_ISO_8859_1_offsets, to_ISO_8859_1_infos }; +static rb_transcoder +rb_to_ISO_8859_1 = { + "UTF-8", "ISO-8859-1", &to_ISO_8859_1, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_2_infos[129] = { @@ -200,11 +209,16 @@ from_ISO_8859_2_infos[129] = { o2(0xC5,0xB1), o2(0xC3,0xBC), o2(0xC3,0xBD), o2(0xC5,0xA3), o2(0xCB,0x99), }; -const BYTE_LOOKUP -rb_from_ISO_8859_2 = { +static const BYTE_LOOKUP +from_ISO_8859_2 = { from_ISO_8859_1_offsets, from_ISO_8859_2_infos }; +static rb_transcoder +rb_from_ISO_8859_2 = { + "ISO-8859-2", "UTF-8", &from_ISO_8859_2, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_2_C2_offsets[64] = { @@ -351,11 +365,16 @@ to_ISO_8859_2_infos[7] = { &to_ISO_8859_2_C5, &to_ISO_8859_2_CB, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_2 = { +static const BYTE_LOOKUP +to_ISO_8859_2 = { to_ISO_8859_2_offsets, to_ISO_8859_2_infos }; +static rb_transcoder +rb_to_ISO_8859_2 = { + "UTF-8", "ISO-8859-2", &to_ISO_8859_2, 1, 1, + NULL, NULL, +}; static const unsigned char from_ISO_8859_3_offsets[256] = { @@ -410,11 +429,16 @@ from_ISO_8859_3_infos[123] = { o2(0xC3,0xBA), o2(0xC3,0xBB), o2(0xC3,0xBC), o2(0xC5,0xAD), o2(0xC5,0x9D), o2(0xCB,0x99), UNDEF, }; -const BYTE_LOOKUP -rb_from_ISO_8859_3 = { +static const BYTE_LOOKUP +from_ISO_8859_3 = { from_ISO_8859_3_offsets, from_ISO_8859_3_infos }; +static rb_transcoder +rb_from_ISO_8859_3 = { + "ISO-8859-3", "UTF-8", &from_ISO_8859_3, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_3_C2_offsets[64] = { @@ -536,11 +560,16 @@ to_ISO_8859_3_infos[7] = { &to_ISO_8859_3_C5, &to_ISO_8859_3_CB, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_3 = { +static const BYTE_LOOKUP +to_ISO_8859_3 = { to_ISO_8859_2_offsets, to_ISO_8859_3_infos }; +static rb_transcoder +rb_to_ISO_8859_3 = { + "UTF-8", "ISO-8859-3", &to_ISO_8859_3, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_4_infos[129] = { @@ -578,11 +607,16 @@ from_ISO_8859_4_infos[129] = { o2(0xC3,0xBB), o2(0xC3,0xBC), o2(0xC5,0xA9), o2(0xC5,0xAB), o2(0xCB,0x99), }; -const BYTE_LOOKUP -rb_from_ISO_8859_4 = { +static const BYTE_LOOKUP +from_ISO_8859_4 = { from_ISO_8859_1_offsets, from_ISO_8859_4_infos }; +static rb_transcoder +rb_from_ISO_8859_4 = { + "ISO-8859-4", "UTF-8", &from_ISO_8859_4, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_4_C2_offsets[64] = { @@ -708,11 +742,16 @@ to_ISO_8859_4_infos[7] = { &to_ISO_8859_4_C5, &to_ISO_8859_4_CB, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_4 = { +static const BYTE_LOOKUP +to_ISO_8859_4 = { to_ISO_8859_2_offsets, to_ISO_8859_4_infos }; +static rb_transcoder +rb_to_ISO_8859_4 = { + "UTF-8", "ISO-8859-4", &to_ISO_8859_4, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_5_infos[129] = { @@ -782,11 +821,16 @@ from_ISO_8859_5_infos[129] = { o2(0xC2,0xA7), o2(0xD1,0x9E), o2(0xD1,0x9F), }; -const BYTE_LOOKUP -rb_from_ISO_8859_5 = { +static const BYTE_LOOKUP +from_ISO_8859_5 = { from_ISO_8859_1_offsets, from_ISO_8859_5_infos }; +static rb_transcoder +rb_from_ISO_8859_5 = { + "ISO-8859-5", "UTF-8", &from_ISO_8859_5, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_5_C2_offsets[64] = { @@ -928,11 +972,16 @@ to_ISO_8859_5_infos[6] = { &to_ISO_8859_5_D0, &to_ISO_8859_5_D1, &to_ISO_8859_5_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_5 = { +static const BYTE_LOOKUP +to_ISO_8859_5 = { to_ISO_8859_5_offsets, to_ISO_8859_5_infos }; +static rb_transcoder +rb_to_ISO_8859_5 = { + "UTF-8", "ISO-8859-5", &to_ISO_8859_5, 1, 1, + NULL, NULL, +}; static const unsigned char from_ISO_8859_6_offsets[256] = { @@ -978,11 +1027,16 @@ from_ISO_8859_6_infos[85] = { o2(0xD9,0x8F), o2(0xD9,0x90), o2(0xD9,0x91), o2(0xD9,0x92), UNDEF, }; -const BYTE_LOOKUP -rb_from_ISO_8859_6 = { +static const BYTE_LOOKUP +from_ISO_8859_6 = { from_ISO_8859_6_offsets, from_ISO_8859_6_infos }; +static rb_transcoder +rb_from_ISO_8859_6 = { + "ISO-8859-6", "UTF-8", &from_ISO_8859_6, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_6_C2_offsets[64] = { @@ -1079,11 +1133,16 @@ to_ISO_8859_6_infos[5] = { &to_ISO_8859_6_D8, &to_ISO_8859_6_D9, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_6 = { +static const BYTE_LOOKUP +to_ISO_8859_6 = { to_ISO_8859_6_offsets, to_ISO_8859_6_infos }; +static rb_transcoder +rb_to_ISO_8859_6 = { + "UTF-8", "ISO-8859-6", &to_ISO_8859_6, 1, 1, + NULL, NULL, +}; static const unsigned char from_ISO_8859_7_offsets[256] = { @@ -1171,11 +1230,16 @@ from_ISO_8859_7_infos[127] = { o2(0xCF,0x8D), o2(0xCF,0x8E), UNDEF, }; -const BYTE_LOOKUP -rb_from_ISO_8859_7 = { +static const BYTE_LOOKUP +from_ISO_8859_7 = { from_ISO_8859_7_offsets, from_ISO_8859_7_infos }; +static rb_transcoder +rb_from_ISO_8859_7 = { + "ISO-8859-7", "UTF-8", &from_ISO_8859_7, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_7_C2_offsets[64] = { @@ -1352,11 +1416,16 @@ to_ISO_8859_7_infos[7] = { &to_ISO_8859_7_CF, &to_ISO_8859_7_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_7 = { +static const BYTE_LOOKUP +to_ISO_8859_7 = { to_ISO_8859_7_offsets, to_ISO_8859_7_infos }; +static rb_transcoder +rb_to_ISO_8859_7 = { + "UTF-8", "ISO-8859-7", &to_ISO_8859_7, 1, 1, + NULL, NULL, +}; static const unsigned char from_ISO_8859_8_offsets[256] = { @@ -1427,11 +1496,16 @@ from_ISO_8859_8_infos[94] = { o2(0xD7,0xAA), o3(0xE2,0x80,0x8E), o3(0xE2,0x80,0x8F), UNDEF, }; -const BYTE_LOOKUP -rb_from_ISO_8859_8 = { +static const BYTE_LOOKUP +from_ISO_8859_8 = { from_ISO_8859_8_offsets, from_ISO_8859_8_infos }; +static rb_transcoder +rb_from_ISO_8859_8 = { + "ISO-8859-8", "UTF-8", &from_ISO_8859_8, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_8_C2_offsets[64] = { @@ -1567,11 +1641,16 @@ to_ISO_8859_8_infos[6] = { &to_ISO_8859_8_C3, &to_ISO_8859_8_D7, &to_ISO_8859_8_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_8 = { +static const BYTE_LOOKUP +to_ISO_8859_8 = { to_ISO_8859_8_offsets, to_ISO_8859_8_infos }; +static rb_transcoder +rb_to_ISO_8859_8 = { + "UTF-8", "ISO-8859-8", &to_ISO_8859_8, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_9_infos[129] = { @@ -1609,11 +1688,16 @@ from_ISO_8859_9_infos[129] = { o2(0xC3,0xBB), o2(0xC3,0xBC), o2(0xC4,0xB1), o2(0xC5,0x9F), o2(0xC3,0xBF), }; -const BYTE_LOOKUP -rb_from_ISO_8859_9 = { +static const BYTE_LOOKUP +from_ISO_8859_9 = { from_ISO_8859_1_offsets, from_ISO_8859_9_infos }; +static rb_transcoder +rb_from_ISO_8859_9 = { + "ISO-8859-9", "UTF-8", &from_ISO_8859_9, 2, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_9_C3_offsets[64] = { @@ -1706,11 +1790,16 @@ to_ISO_8859_9_infos[6] = { &to_ISO_8859_9_C3, &to_ISO_8859_9_C4, &to_ISO_8859_9_C5, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_9 = { +static const BYTE_LOOKUP +to_ISO_8859_9 = { to_ISO_8859_9_offsets, to_ISO_8859_9_infos }; +static rb_transcoder +rb_to_ISO_8859_9 = { + "UTF-8", "ISO-8859-9", &to_ISO_8859_9, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_10_infos[129] = { @@ -1780,11 +1869,16 @@ from_ISO_8859_10_infos[129] = { o2(0xC3,0xBD), o2(0xC3,0xBE), o2(0xC4,0xB8), }; -const BYTE_LOOKUP -rb_from_ISO_8859_10 = { +static const BYTE_LOOKUP +from_ISO_8859_10 = { from_ISO_8859_1_offsets, from_ISO_8859_10_infos }; +static rb_transcoder +rb_from_ISO_8859_10 = { + "ISO-8859-10", "UTF-8", &from_ISO_8859_10, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_10_C2_offsets[64] = { @@ -1932,11 +2026,16 @@ to_ISO_8859_10_infos[7] = { &to_ISO_8859_10_C5, &to_ISO_8859_10_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_10 = { +static const BYTE_LOOKUP +to_ISO_8859_10 = { to_ISO_8859_10_offsets, to_ISO_8859_10_infos }; +static rb_transcoder +rb_to_ISO_8859_10 = { + "UTF-8", "ISO-8859-10", &to_ISO_8859_10, 1, 1, + NULL, NULL, +}; static const unsigned char from_ISO_8859_11_offsets[256] = { @@ -2021,11 +2120,16 @@ from_ISO_8859_11_infos[122] = { o3(0xE0,0xB9,0x99), o3(0xE0,0xB9,0x9A), o3(0xE0,0xB9,0x9B), UNDEF, }; -const BYTE_LOOKUP -rb_from_ISO_8859_11 = { +static const BYTE_LOOKUP +from_ISO_8859_11 = { from_ISO_8859_11_offsets, from_ISO_8859_11_infos }; +static rb_transcoder +rb_from_ISO_8859_11 = { + "ISO-8859-11", "UTF-8", &from_ISO_8859_11, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_11_C2_offsets[64] = { @@ -2149,11 +2253,16 @@ to_ISO_8859_11_infos[4] = { NOMAP, &to_ISO_8859_11_C2, &to_ISO_8859_11_E0, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_11 = { +static const BYTE_LOOKUP +to_ISO_8859_11 = { to_ISO_8859_11_offsets, to_ISO_8859_11_infos }; +static rb_transcoder +rb_to_ISO_8859_11 = { + "UTF-8", "ISO-8859-11", &to_ISO_8859_11, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_13_infos[129] = { @@ -2223,11 +2332,16 @@ from_ISO_8859_13_infos[129] = { o2(0xC5,0xBC), o2(0xC5,0xBE), o3(0xE2,0x80,0x99), }; -const BYTE_LOOKUP -rb_from_ISO_8859_13 = { +static const BYTE_LOOKUP +from_ISO_8859_13 = { from_ISO_8859_1_offsets, from_ISO_8859_13_infos }; +static rb_transcoder +rb_from_ISO_8859_13 = { + "ISO-8859-13", "UTF-8", &from_ISO_8859_13, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_13_C2_offsets[64] = { @@ -2362,11 +2476,16 @@ to_ISO_8859_13_infos[7] = { &to_ISO_8859_13_C5, &to_ISO_8859_13_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_13 = { +static const BYTE_LOOKUP +to_ISO_8859_13 = { to_ISO_8859_10_offsets, to_ISO_8859_13_infos }; +static rb_transcoder +rb_to_ISO_8859_13 = { + "UTF-8", "ISO-8859-13", &to_ISO_8859_13, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_14_infos[129] = { @@ -2436,11 +2555,16 @@ from_ISO_8859_14_infos[129] = { o2(0xC3,0xBD), o2(0xC5,0xB7), o2(0xC3,0xBF), }; -const BYTE_LOOKUP -rb_from_ISO_8859_14 = { +static const BYTE_LOOKUP +from_ISO_8859_14 = { from_ISO_8859_1_offsets, from_ISO_8859_14_infos }; +static rb_transcoder +rb_from_ISO_8859_14 = { + "ISO-8859-14", "UTF-8", &from_ISO_8859_14, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_14_C2_offsets[64] = { @@ -2652,11 +2776,16 @@ to_ISO_8859_14_infos[7] = { &to_ISO_8859_14_C5, &to_ISO_8859_14_E1, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_14 = { +static const BYTE_LOOKUP +to_ISO_8859_14 = { to_ISO_8859_14_offsets, to_ISO_8859_14_infos }; +static rb_transcoder +rb_to_ISO_8859_14 = { + "UTF-8", "ISO-8859-14", &to_ISO_8859_14, 1, 1, + NULL, NULL, +}; static const struct byte_lookup* const from_ISO_8859_15_infos[129] = { @@ -2726,11 +2855,16 @@ from_ISO_8859_15_infos[129] = { o2(0xC3,0xBD), o2(0xC3,0xBE), o2(0xC3,0xBF), }; -const BYTE_LOOKUP -rb_from_ISO_8859_15 = { +static const BYTE_LOOKUP +from_ISO_8859_15 = { from_ISO_8859_1_offsets, from_ISO_8859_15_infos }; +static rb_transcoder +rb_from_ISO_8859_15 = { + "ISO-8859-15", "UTF-8", &from_ISO_8859_15, 3, 0, + NULL, NULL, +}; static const unsigned char to_ISO_8859_15_C2_offsets[64] = { @@ -2840,10 +2974,47 @@ to_ISO_8859_15_infos[6] = { &to_ISO_8859_1_C3, &to_ISO_8859_15_C5, &to_ISO_8859_15_E2, UNDEF, }; -const BYTE_LOOKUP -rb_to_ISO_8859_15 = { +static const BYTE_LOOKUP +to_ISO_8859_15 = { to_ISO_8859_15_offsets, to_ISO_8859_15_infos }; +static rb_transcoder +rb_to_ISO_8859_15 = { + "UTF-8", "ISO-8859-15", &to_ISO_8859_15, 1, 1, + NULL, NULL, +}; +void +Init_one_byte(void) +{ + rb_register_transcoder(&rb_from_ISO_8859_1); + rb_register_transcoder(&rb_from_ISO_8859_2); + rb_register_transcoder(&rb_from_ISO_8859_3); + rb_register_transcoder(&rb_from_ISO_8859_4); + rb_register_transcoder(&rb_from_ISO_8859_5); + rb_register_transcoder(&rb_from_ISO_8859_6); + rb_register_transcoder(&rb_from_ISO_8859_7); + rb_register_transcoder(&rb_from_ISO_8859_8); + rb_register_transcoder(&rb_from_ISO_8859_9); + rb_register_transcoder(&rb_from_ISO_8859_10); + rb_register_transcoder(&rb_from_ISO_8859_11); + rb_register_transcoder(&rb_from_ISO_8859_13); + rb_register_transcoder(&rb_from_ISO_8859_14); + rb_register_transcoder(&rb_from_ISO_8859_15); + rb_register_transcoder(&rb_to_ISO_8859_1); + rb_register_transcoder(&rb_to_ISO_8859_2); + rb_register_transcoder(&rb_to_ISO_8859_3); + rb_register_transcoder(&rb_to_ISO_8859_4); + rb_register_transcoder(&rb_to_ISO_8859_5); + rb_register_transcoder(&rb_to_ISO_8859_6); + rb_register_transcoder(&rb_to_ISO_8859_7); + rb_register_transcoder(&rb_to_ISO_8859_8); + rb_register_transcoder(&rb_to_ISO_8859_9); + rb_register_transcoder(&rb_to_ISO_8859_10); + rb_register_transcoder(&rb_to_ISO_8859_11); + rb_register_transcoder(&rb_to_ISO_8859_13); + rb_register_transcoder(&rb_to_ISO_8859_14); + rb_register_transcoder(&rb_to_ISO_8859_15); +} /* Footprint (bytes): gross: 26788, saved: 3728, net: 23060 */ diff --git a/transcode.c b/transcode.c index c915cc299d..e8c6545163 100644 --- a/transcode.c +++ b/transcode.c @@ -12,9 +12,9 @@ #include "ruby/ruby.h" #include "ruby/encoding.h" - +#define PType (int) #include "transcode_data.h" - +#include VALUE rb_str_tmp_new(long); VALUE rb_str_shared_replace(VALUE, VALUE); @@ -23,168 +23,122 @@ VALUE rb_str_shared_replace(VALUE, VALUE); * Dispatch data and logic */ -/* extern declarations, should use some include file here */ -extern const BYTE_LOOKUP rb_from_ISO_8859_1; -extern const BYTE_LOOKUP rb_from_ISO_8859_2; -extern const BYTE_LOOKUP rb_from_ISO_8859_3; -extern const BYTE_LOOKUP rb_from_ISO_8859_4; -extern const BYTE_LOOKUP rb_from_ISO_8859_5; -extern const BYTE_LOOKUP rb_from_ISO_8859_6; -extern const BYTE_LOOKUP rb_from_ISO_8859_7; -extern const BYTE_LOOKUP rb_from_ISO_8859_8; -extern const BYTE_LOOKUP rb_from_ISO_8859_9; -extern const BYTE_LOOKUP rb_from_ISO_8859_10; -extern const BYTE_LOOKUP rb_from_ISO_8859_11; -extern const BYTE_LOOKUP rb_from_ISO_8859_13; -extern const BYTE_LOOKUP rb_from_ISO_8859_14; -extern const BYTE_LOOKUP rb_from_ISO_8859_15; +static st_table *transcoder_table, *transcoder_lib_table; -extern const BYTE_LOOKUP rb_to_ISO_8859_1; -extern const BYTE_LOOKUP rb_to_ISO_8859_2; -extern const BYTE_LOOKUP rb_to_ISO_8859_3; -extern const BYTE_LOOKUP rb_to_ISO_8859_4; -extern const BYTE_LOOKUP rb_to_ISO_8859_5; -extern const BYTE_LOOKUP rb_to_ISO_8859_6; -extern const BYTE_LOOKUP rb_to_ISO_8859_7; -extern const BYTE_LOOKUP rb_to_ISO_8859_8; -extern const BYTE_LOOKUP rb_to_ISO_8859_9; -extern const BYTE_LOOKUP rb_to_ISO_8859_10; -extern const BYTE_LOOKUP rb_to_ISO_8859_11; -extern const BYTE_LOOKUP rb_to_ISO_8859_13; -extern const BYTE_LOOKUP rb_to_ISO_8859_14; -extern const BYTE_LOOKUP rb_to_ISO_8859_15; +#define TRANSCODER_INTERNAL_SEPARATOR '\t' -extern const BYTE_LOOKUP rb_from_SHIFT_JIS; -extern const BYTE_LOOKUP rb_from_EUC_JP; - -extern const BYTE_LOOKUP rb_to_SHIFT_JIS; -extern const BYTE_LOOKUP rb_to_EUC_JP; - -extern void from_iso_2022_jp_transcoder_preprocessor(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); -extern void to_iso_2022_jp_transcoder_postprocessor(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); - -/* declarations probably need to go into separate header file, e.g. transcode.h */ - -/* todo: dynamic structure, one per conversion (stream) */ - -/* in the future, add some mechanism for dynamically adding stuff here */ -#define MAX_TRANSCODERS 35 /* todo: fix: this number has to be adjusted by hand */ -static transcoder transcoder_table[MAX_TRANSCODERS]; -/* variable to work across register_transcoder and register_functional_transcoder */ -static int next_transcoder_position = 0; - -/* not sure why it's not possible to do relocatable initializations */ -/* maybe the code here can be removed (changed to simple initialization) */ -/* if we move this to another file???? */ -static void -register_transcoder(const char *from_e, const char *to_e, - const BYTE_LOOKUP *tree_start, int max_output, int from_utf8) +static char * +transcoder_key(const char *from_e, const char *to_e) { - if (next_transcoder_position >= MAX_TRANSCODERS) { - /* we are initializing, is it okay to use rb_raise here? */ - rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); - } - transcoder_table[next_transcoder_position].from_encoding = from_e; - transcoder_table[next_transcoder_position].to_encoding = to_e; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].max_output = max_output; - transcoder_table[next_transcoder_position].from_utf8 = from_utf8; + int to_len = strlen(to_e); + int from_len = strlen(from_e); + char *const key = xmalloc(to_len + from_len + 2); - next_transcoder_position++; + memcpy(key, to_e, to_len); + memcpy(key + to_len + 1, from_e, from_len + 1); + key[to_len] = TRANSCODER_INTERNAL_SEPARATOR; + return key; +} + +void +rb_register_transcoder(const rb_transcoder *tr) +{ + st_data_t k, val = 0; + const char *const from_e = tr->from_encoding; + const char *const to_e = tr->to_encoding; + char *const key = transcoder_key(from_e, to_e); + + if (st_lookup(transcoder_table, (st_data_t)key, &val)) { + xfree(key); + rb_raise(rb_eArgError, "transcoder from %s to %s has been already registered", + from_e, to_e); + } + k = (st_data_t)key; + if (st_delete(transcoder_lib_table, &k, &val)) { + xfree((char *)k); + } + st_insert(transcoder_table, (st_data_t)key, (st_data_t)tr); } static void -register_functional_transcoder(const char *from_e, const char *to_e, - const BYTE_LOOKUP *tree_start, int max_output, int from_utf8, - void (*preprocessor)(char**, char**, char*, char*, transcoder*, transcoding*), - void (*postprocessor)(char**, char**, char*, char*, transcoder*, transcoding*)) +declare_transcoder(const char *to, const char *from, const char *lib) { - if (next_transcoder_position >= MAX_TRANSCODERS) { - /* we are initializing, is it okay to use rb_raise here? */ - rb_raise(rb_eRuntimeError /*change exception*/, "not enough transcoder slots"); - } - transcoder_table[next_transcoder_position].from_encoding = from_e; - transcoder_table[next_transcoder_position].to_encoding = to_e; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].max_output = max_output; - transcoder_table[next_transcoder_position].from_utf8 = from_utf8; - transcoder_table[next_transcoder_position].conv_tree_start = tree_start; - transcoder_table[next_transcoder_position].preprocessor = preprocessor; - transcoder_table[next_transcoder_position].postprocessor = postprocessor; + const char *const key = transcoder_key(to, from); + st_data_t k = (st_data_t)key, val; - next_transcoder_position++; + if (st_delete(transcoder_lib_table, &k, &val)) { + xfree((char *)k); + } + st_insert(transcoder_lib_table, (st_data_t)key, (st_data_t)lib); +} + +#define MAX_TRANSCODER_LIBNAME_LEN 64 +static const char transcoder_lib_prefix[] = "enc/trans/"; + +void +rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib) +{ + if (!lib || strlen(lib) > MAX_TRANSCODER_LIBNAME_LEN) { + rb_raise(rb_eArgError, "invalid library name - %s", + lib ? lib : "(null)"); + } + declare_transcoder(enc1, enc2, lib); + declare_transcoder(enc2, enc1, lib); } static void init_transcoder_table(void) { - register_transcoder("ISO-8859-1", "UTF-8", &rb_from_ISO_8859_1, 2, 0); - register_transcoder("ISO-8859-2", "UTF-8", &rb_from_ISO_8859_2, 2, 0); - register_transcoder("ISO-8859-3", "UTF-8", &rb_from_ISO_8859_3, 2, 0); - register_transcoder("ISO-8859-4", "UTF-8", &rb_from_ISO_8859_4, 2, 0); - register_transcoder("ISO-8859-5", "UTF-8", &rb_from_ISO_8859_5, 3, 0); - register_transcoder("ISO-8859-6", "UTF-8", &rb_from_ISO_8859_6, 2, 0); - register_transcoder("ISO-8859-7", "UTF-8", &rb_from_ISO_8859_7, 3, 0); - register_transcoder("ISO-8859-8", "UTF-8", &rb_from_ISO_8859_8, 3, 0); - register_transcoder("ISO-8859-9", "UTF-8", &rb_from_ISO_8859_9, 2, 0); - register_transcoder("ISO-8859-10", "UTF-8", &rb_from_ISO_8859_10, 3, 0); - register_transcoder("ISO-8859-11", "UTF-8", &rb_from_ISO_8859_11, 3, 0); - register_transcoder("ISO-8859-13", "UTF-8", &rb_from_ISO_8859_13, 3, 0); - register_transcoder("ISO-8859-14", "UTF-8", &rb_from_ISO_8859_14, 3, 0); - register_transcoder("ISO-8859-15", "UTF-8", &rb_from_ISO_8859_15, 3, 0); - register_transcoder("UTF-8", "ISO-8859-1", &rb_to_ISO_8859_1, 1, 1); - register_transcoder("UTF-8", "ISO-8859-2", &rb_to_ISO_8859_2, 1, 1); - register_transcoder("UTF-8", "ISO-8859-3", &rb_to_ISO_8859_3, 1, 1); - register_transcoder("UTF-8", "ISO-8859-4", &rb_to_ISO_8859_4, 1, 1); - register_transcoder("UTF-8", "ISO-8859-5", &rb_to_ISO_8859_5, 1, 1); - register_transcoder("UTF-8", "ISO-8859-6", &rb_to_ISO_8859_6, 1, 1); - register_transcoder("UTF-8", "ISO-8859-7", &rb_to_ISO_8859_7, 1, 1); - register_transcoder("UTF-8", "ISO-8859-8", &rb_to_ISO_8859_8, 1, 1); - register_transcoder("UTF-8", "ISO-8859-9", &rb_to_ISO_8859_9, 1, 1); - register_transcoder("UTF-8", "ISO-8859-10", &rb_to_ISO_8859_10, 1, 1); - register_transcoder("UTF-8", "ISO-8859-11", &rb_to_ISO_8859_11, 1, 1); - register_transcoder("UTF-8", "ISO-8859-13", &rb_to_ISO_8859_13, 1, 1); - register_transcoder("UTF-8", "ISO-8859-14", &rb_to_ISO_8859_14, 1, 1); - register_transcoder("UTF-8", "ISO-8859-15", &rb_to_ISO_8859_15, 1, 1); - - register_transcoder("SHIFT_JIS", "UTF-8", &rb_from_SHIFT_JIS, 3, 0); - register_transcoder("EUC-JP", "UTF-8", &rb_from_EUC_JP, 3, 0); - register_transcoder("UTF-8", "SHIFT_JIS", &rb_to_SHIFT_JIS, 2, 1); - register_transcoder("UTF-8", "EUC-JP", &rb_to_EUC_JP, 2, 1); - register_functional_transcoder("ISO-2022-JP", "UTF-8", &rb_from_EUC_JP, - 8, 0, &from_iso_2022_jp_transcoder_preprocessor, NULL); - register_functional_transcoder("UTF-8", "ISO-2022-JP", &rb_to_EUC_JP, - 8, 1, NULL, &to_iso_2022_jp_transcoder_postprocessor); - - register_transcoder(NULL, NULL, NULL, 0, 0); + rb_declare_transcoder("ISO-8859-1", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-2", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-3", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-4", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-5", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-6", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-7", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-8", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-9", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-10", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-11", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-13", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-14", "UTF-8", "single_byte"); + rb_declare_transcoder("ISO-8859-15", "UTF-8", "single_byte"); + rb_declare_transcoder("SHIFT_JIS", "UTF-8", "japanese"); + rb_declare_transcoder("EUC-JP", "UTF-8", "japanese"); + rb_declare_transcoder("ISO-2022-JP", "UTF-8", "japanese"); } -static int -encoding_equal(const char* encoding1, const char* encoding2) -{ - return 0==strcasecmp(encoding1, encoding2); -} +#define encoding_equal(enc1, enc2) (strcasecmp(enc1, enc2) == 0) -static transcoder* +static rb_transcoder * transcode_dispatch(const char* from_encoding, const char* to_encoding) { - transcoder *candidate = transcoder_table; - - for (candidate = transcoder_table; candidate->from_encoding; candidate++) { - if (encoding_equal(from_encoding, candidate->from_encoding) - && encoding_equal(to_encoding, candidate->to_encoding)) { - return candidate; + char *const key = transcoder_key(from_encoding, to_encoding); + st_data_t k, val = 0; + + k = (st_data_t)key; + if (!st_lookup(transcoder_table, k, &val) && + st_delete(transcoder_lib_table, &k, &val)) { + const char *const lib = (const char *)val; + int len = strlen(lib); + char path[sizeof(transcoder_lib_prefix) + MAX_TRANSCODER_LIBNAME_LEN]; + + xfree((char *)k); + if (len > MAX_TRANSCODER_LIBNAME_LEN) return NULL; + memcpy(path, transcoder_lib_prefix, sizeof(transcoder_lib_prefix) - 1); + memcpy(path + sizeof(transcoder_lib_prefix) - 1, lib, len + 1); + if (!rb_require(path)) return NULL; + if (!st_lookup(transcoder_table, (st_data_t)key, &val)) { + /* multistep logic, via UTF-8 */ + if (!encoding_equal(from_encoding, "UTF-8") && + !encoding_equal(to_encoding, "UTF-8") && + transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ + return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ + } + return NULL; } } - /* multistep logic, via UTF-8 */ - if (!encoding_equal(from_encoding, "UTF-8") - && !encoding_equal(to_encoding, "UTF-8") - && transcode_dispatch("UTF-8", to_encoding)) { /* check that we have a second step */ - return transcode_dispatch(from_encoding, "UTF-8"); /* return first step */ - } - return NULL; + return (rb_transcoder *)val; } @@ -194,8 +148,8 @@ transcode_dispatch(const char* from_encoding, const char* to_encoding) static void transcode_loop(char **in_pos, char **out_pos, char *in_stop, char *out_stop, - transcoder *my_transcoder, - transcoding *my_transcoding) + const rb_transcoder *my_transcoder, + rb_transcoding *my_transcoding) { char *in_p = *in_pos, *out_p = *out_pos; const BYTE_LOOKUP *conv_tree_start = my_transcoder->conv_tree_start; @@ -280,7 +234,7 @@ transcode_loop(char **in_pos, char **out_pos, */ static char * -str_transcoding_resize(transcoding *my_transcoding, int len, int new_len) +str_transcoding_resize(rb_transcoding *my_transcoding, int len, int new_len) { VALUE dest_string = my_transcoding->ruby_string_dest; rb_str_resize(dest_string, new_len); @@ -298,8 +252,8 @@ str_transcode(int argc, VALUE *argv, VALUE *self) const char *from_e, *to_e; int from_encidx, to_encidx; VALUE from_encval, to_encval; - transcoder *my_transcoder; - transcoding my_transcoding; + rb_transcoder *my_transcoder; + rb_transcoding my_transcoding; int final_encoding = 0; if (argc<1 || argc>2) { @@ -307,6 +261,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) } if ((to_encidx = rb_to_encoding_index(to_encval = argv[0])) < 0) { to_enc = 0; + to_encidx = 0; to_e = StringValueCStr(to_encval); } else { @@ -405,7 +360,7 @@ str_transcode(int argc, VALUE *argv, VALUE *self) } /* set encoding */ if (!to_enc) { - to_encidx = rb_enc_replicate(to_e, rb_ascii8bit_encoding()); + to_encidx = rb_define_dummy_encoding(to_e); } *self = dest; @@ -467,7 +422,10 @@ rb_str_transcode(int argc, VALUE *argv, VALUE str) void Init_transcode(void) { + transcoder_table = st_init_strcasetable(); + transcoder_lib_table = st_init_strcasetable(); init_transcoder_table(); + rb_define_method(rb_cString, "encode", rb_str_transcode, -1); rb_define_method(rb_cString, "encode!", rb_str_transcode_bang, -1); } diff --git a/transcode_data.h b/transcode_data.h index 3131877a41..add954ef74 100644 --- a/transcode_data.h +++ b/transcode_data.h @@ -22,12 +22,9 @@ typedef struct byte_lookup { const struct byte_lookup *const *info; } BYTE_LOOKUP; -#ifdef TRANSCODE_DATA +#ifndef PType /* data file needs to treat this as a pointer, to remove warnings */ #define PType (const BYTE_LOOKUP *) -#else -/* in code, this is treated as just an integer */ -#define PType (int) #endif #define NOMAP (PType 0x01) /* single byte direct map */ @@ -56,23 +53,26 @@ typedef struct byte_lookup { /* dynamic structure, one per conversion (similar to iconv_t) */ /* may carry conversion state (e.g. for iso-2022-jp) */ -typedef struct transcoding { +typedef struct rb_transcoding { VALUE ruby_string_dest; /* the String used as the conversion destination, or NULL if something else is being converted */ - char *(*flush_func)(struct transcoding*, int, int); -} transcoding; + char *(*flush_func)(struct rb_transcoding*, int, int); +} rb_transcoding; /* static structure, one per supported encoding pair */ -typedef struct transcoder_st{ +typedef struct rb_transcoder { const char *from_encoding; const char *to_encoding; const BYTE_LOOKUP *conv_tree_start; int max_output; int from_utf8; void (*preprocessor)(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); + struct rb_transcoder *, struct rb_transcoding *); void (*postprocessor)(char**, char**, char*, char*, - struct transcoder_st *transcoder, struct transcoding*); -} transcoder; + struct rb_transcoder *, struct rb_transcoding *); +} rb_transcoder; + +void rb_declare_transcoder(const char *enc1, const char *enc2, const char *lib); +void rb_register_transcoder(const rb_transcoder *); #endif /* RUBY_TRANSCODE_DATA_H */