diff --git a/ChangeLog b/ChangeLog
index 2a879a542d..760f9eef18 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,10 +1,27 @@
+Sun Jan 13 00:01:07 2008  NARUSE, Yui
+
+	* enc/make_encdb.rb: added. search enc/*.c and make encoding database.
+
+	* regenc.h (ENC_REPLICATE, ENC_ALIAS): added for defining replica
+	  encoding and encoding alias.
+
+	* encoding.c (rb_enc_init): move alias definitions to enc/*.c.
+	  (rb_enc_find_index): search original of replica and alias when no
+	  encoding library is found.
+	  (rb_enc_name_list, rb_enc_aliases_enc_i, rb_enc_aliases_str_i,
+	  rb_enc_aliases, Encoding.name_list, Encoding.aliases): added.
+	  (Init_Encoding): init encdb.
+
+	* enc/ascii.c, enc/us_ascii.c, enc/euc_jp.c, enc/sjis.c:
+	  add replica encoding and encoding alias definition.
+
+	* common.mk (distclean-local): add rule to remove encdb.h.
+
 Sat Jan 12 18:27:41 2008  Nobuyoshi Nakada
 
 	* eval.c (rb_define_alloc_func, rb_undef_alloc_func): should
 	  define/undef on a signleton class.  [ruby-core:09959]
 
-9959]
-
 Sat Jan 12 12:44:36 2008  NARUSE, Yui
 
 	* ext/nkf/nkf.c: rdoc update.
diff --git a/common.mk b/common.mk
index 1b2c3881f8..1b80b685d5 100644
--- a/common.mk
+++ b/common.mk
@@ -312,7 +312,7 @@ clean-enc:
 
 distclean: distclean-ext distclean-local distclean-enc
 distclean-local:: clean-local
-	@$(RM) $(MKFILES) config.h rbconfig.rb yasmdata.rb
+	@$(RM) $(MKFILES) config.h rbconfig.rb yasmdata.rb encdb.h
 	@$(RM) config.cache config.log config.status config.status.lineno $(PRELUDES)
 	@$(RM) *~ *.bak *.stackdump core *.core gmon.out y.tab.c y.output $(PREP)
 distclean-ext:
@@ -431,7 +431,7 @@ dmyext.$(OBJEXT): {$(VPATH)}dmyext.c
 encoding.$(OBJEXT): {$(VPATH)}encoding.c {$(VPATH)}ruby.h \
   {$(VPATH)}config.h {$(VPATH)}defines.h {$(VPATH)}missing.h \
   {$(VPATH)}intern.h {$(VPATH)}st.h {$(VPATH)}encoding.h \
-  {$(VPATH)}oniguruma.h {$(VPATH)}regenc.h
+  {$(VPATH)}oniguruma.h {$(VPATH)}regenc.h {$(VPATH)}encdb.h
 enum.$(OBJEXT): {$(VPATH)}enum.c {$(VPATH)}ruby.h {$(VPATH)}config.h \
   {$(VPATH)}defines.h {$(VPATH)}missing.h {$(VPATH)}intern.h \
   {$(VPATH)}st.h {$(VPATH)}node.h {$(VPATH)}util.h
@@ -708,6 +708,9 @@ incs: $(INSNS) {$(VPATH)}node_name.inc {$(VPATH)}revision.h
 node_name.inc: {$(VPATH)}node.h
 	$(BASERUBY) -n $(srcdir)/tool/node_name.rb $? > $@
 
+encdb.h: $(srcdir)/enc/make_encdb.rb
+	$(BASERUBY) -I$(srcdir) $(srcdir)/enc/make_encdb.rb
+
 miniprelude.c: $(srcdir)/tool/compile_prelude.rb $(srcdir)/prelude.rb
 	$(BASERUBY) -I$(srcdir) $(srcdir)/tool/compile_prelude.rb $(srcdir)/prelude.rb $@
 
diff --git a/enc/ascii.c b/enc/ascii.c
index f9a619d700..e79d93074f 100644
--- a/enc/ascii.c
+++ b/enc/ascii.c
@@ -47,3 +47,4 @@ OnigEncodingDefine(ascii, ASCII) = {
   onigenc_single_byte_left_adjust_char_head,
   onigenc_always_true_is_allowed_reverse_match
 };
+ENC_ALIAS("BINARY", "ASCII-8BIT");
diff --git a/enc/euc_jp.c b/enc/euc_jp.c
index 8c0df7db4e..7040185558 100644
--- a/enc/euc_jp.c
+++ b/enc/euc_jp.c
@@ -361,3 +361,4 @@ OnigEncodingDefine(euc_jp, EUC_JP) = {
   is_allowed_reverse_match,
   0
 };
+ENC_ALIAS("eucJP", "EUC-JP"); /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
diff --git a/enc/make_encdb.rb b/enc/make_encdb.rb
new file mode 100755
index 0000000000..2cd7699944
--- /dev/null
+++ b/enc/make_encdb.rb
@@ -0,0 +1,65 @@
+#! ./miniruby
+
+#
+# OnigEncodingDefine(foo, Foo) = {
+#   ..
+#   "Shift_JIS", /* Canonical Name */
+#   ..
+# };
+# ENC_ALIAS("SJIS", "Shift_JIS")
+# ENC_REPLICATE("Windows-31J", "Shift_JIS")
+# ENC_ALIAS("CP932", "Windows-31J")
+#
+
+require 'mkmf'
+
+encodings = []
+replicas = {}
+aliases = {}
+Dir.open($srcdir) {|d| d.grep(/.+\.c\z/)}.each do |fn|
+  open(File.join($srcdir,fn)) do |f|
+    orig = nil
+    name = nil
+    f.each_line do |line|
+      break if /^OnigEncodingDefine/o =~ line
+    end
+    f.each_line do |line|
+      break if /"(.*?)"/ =~ line
+    end
+    encodings << $1 if $1
+    f.each_line do |line|
+      if /^ENC_REPLICATE\(\s*"([^"]+)"\s*,\s*"([^"]+)"/o =~ line
+        replicas[$1] = $2
+      elsif /^ENC_ALIAS\(\s*"([^"]+)"\s*,\s*"([^"]+)"/o =~ line
+        aliases[$1] = $2
+      end
+    end
+  end
+end
+open('encdb.h', 'wb') do |f|
+  f.puts 'static const char *enc_name_list[] = {'
+  encodings.each {|name| f.puts' "%s",' % name}
+  replicas.each_key {|name| f.puts' "%s",' % name}
+  f.puts(<<"_TEXT_")
+    NULL
+};
+static const int enc_name_list_size = #{encodings.length + replicas.length};
+static const int enc_aliases_size = #{aliases.length};
+static st_table *enc_table_replica_name;
+static st_table *enc_table_alias_name;
+
+static void enc_init_db(void)
+{
+    if (!enc_table_replica_name) {
+        enc_table_replica_name = st_init_strcasetable();
+    }
+    if (!enc_table_alias_name) {
+        enc_table_alias_name = st_init_strcasetable();
+    }
+_TEXT_
+  replicas.each_pair {|name, orig|
+    f.puts' st_insert(enc_table_replica_name, (st_data_t)"%s", (st_data_t)"%s");' % [name, orig]}
+  aliases.each_pair {|name, orig|
+    f.puts' st_insert(enc_table_alias_name, (st_data_t)"%s", (st_data_t)"%s");' % [name, orig]}
+  f.puts '}'
+end
diff --git a/enc/sjis.c b/enc/sjis.c
index 776940291d..dcf05bf86f 100644
--- a/enc/sjis.c
+++ b/enc/sjis.c
@@ -370,3 +370,7 @@ OnigEncodingDefine(sjis, SJIS) = {
   is_allowed_reverse_match,
   0
 };
+ENC_ALIAS("SJIS", "Shift_JIS");
+ENC_REPLICATE("Windows-31J", "Shift_JIS");
+ENC_ALIAS("CP932", "Windows-31J");
+ENC_ALIAS("csWindows31J", "Windows-31J"); /* IANA.  IE6 doesn't accept Windows-31J but csWindows31J. */
diff --git a/enc/us_ascii.c b/enc/us_ascii.c
index df50e2d0d4..b3ac093425 100644
--- a/enc/us_ascii.c
+++ b/enc/us_ascii.c
@@ -26,3 +26,6 @@ OnigEncodingDefine(us_ascii, US_ASCII) = {
   onigenc_single_byte_left_adjust_char_head,
   onigenc_always_true_is_allowed_reverse_match
 };
+ENC_ALIAS("ASCII", "US-ASCII");
+ENC_ALIAS("ANSI_X3.4-1986", "US-ASCII");
+
diff --git a/encoding.c b/encoding.c
index 4f719dbcd3..b9e69617ea 100644
--- a/encoding.c
+++ b/encoding.c
@@ -12,6 +12,7 @@
 #include "ruby/ruby.h"
 #include "ruby/encoding.h"
 #include "regenc.h"
+#include "encdb.h"
 #include <ctype.h>
 #ifdef HAVE_LANGINFO_H
 #include <langinfo.h>
@@ -291,8 +292,6 @@ rb_enc_alias(const char *alias, const char *orig)
 
 enum {
     ENCINDEX_ASCII,
-    ENCINDEX_EUC_JP,
-    ENCINDEX_SJIS,
     ENCINDEX_UTF8,
     ENCINDEX_BUILTIN_MAX
 };
@@ -303,14 +302,8 @@ rb_enc_init(void)
     enc_table_count = enc_table_expand(ENCINDEX_BUILTIN_MAX);
 #define ENC_REGISTER(enc) enc_register_at(ENCINDEX_##enc, rb_enc_name(ONIG_ENCODING_##enc), ONIG_ENCODING_##enc)
     ENC_REGISTER(ASCII);
-    ENC_REGISTER(EUC_JP);
-    ENC_REGISTER(SJIS);
     ENC_REGISTER(UTF8);
 #undef ENC_REGISTER
-    enc_alias("ASCII", rb_enc_name(ONIG_ENCODING_ASCII));
-    enc_alias("BINARY", rb_enc_name(ONIG_ENCODING_ASCII));
-    enc_alias("eucJP", rb_enc_name(ONIG_ENCODING_EUC_JP)); /* UI-OSF Application Platform Profile for Japanese Environment Version 1.1 */
-    enc_alias("SJIS", rb_enc_name(ONIG_ENCODING_SJIS));
 }
 
 rb_encoding *
@@ -373,6 +366,23 @@ rb_enc_find_index(const char *name)
 	OBJ_FREEZE(enclib);
 	if (RTEST(rb_protect(require_enc, enclib, 0)))
 	    i = rb_enc_registered(name);
+	else {
+	    /* No encoding library could be loaded: fall back to the replica
+	     * and alias names recorded in encdb.h. */
+	    st_data_t key = (st_data_t)name, orig;
+	    if (st_lookup(enc_table_replica_name, key, &orig)) {
+		i = rb_enc_find_index((char *)orig);
+		if (i < 0) {
+		    rb_raise(rb_eRuntimeError, "unknown original encoding name - %s for %s", (char *)orig, name);
+		}
+		i = rb_enc_replicate(name, rb_enc_from_index(i));
+		st_delete(enc_table_replica_name, &key, &orig);
+	    } else if (st_lookup(enc_table_alias_name, key, &orig)) {
+		i = rb_enc_alias(name, (char *)orig);
+		/* the name was found in the alias table, so delete it from that same table */
+		st_delete(enc_table_alias_name, &key, &orig);
+	    }
+	}
 	rb_set_errinfo(Qnil);
     }
     return i;
@@ -978,6 +988,57 @@ set_encoding_alias(st_data_t name, st_data_t orig, st_data_t arg)
     return ST_CONTINUE;
 }
 
+/*
+ * Encoding.name_list: returns an array with one string for each of the
+ * enc_name_list_size names in enc_name_list (generated into encdb.h).
+ */
+static VALUE
+rb_enc_name_list(VALUE klass)
+{
+    VALUE ary = rb_ary_new2(enc_name_list_size);
+    int i;
+    for (i = 0; i < enc_name_list_size; i++) {
+	rb_ary_push(ary, rb_str_new2(enc_name_list[i]));
+    }
+    return ary;
+}
+
+/*
+ * st_foreach callback for enc_table_alias, whose values are encoding
+ * indexes: stores alias => name of that encoding in the hash given as arg.
+ */
+static int
+rb_enc_aliases_enc_i(st_data_t name, st_data_t orig, st_data_t arg)
+{
+    rb_hash_aset((VALUE)arg, rb_str_new2((char *)name), rb_str_new2(rb_enc_name(rb_enc_from_index((int)orig))));
+    return ST_CONTINUE;
+}
+
+/*
+ * st_foreach callback for enc_table_alias_name, whose values are C strings:
+ * stores alias => original name in the hash given as arg.
+ */
+static int
+rb_enc_aliases_str_i(st_data_t name, st_data_t orig, st_data_t arg)
+{
+    rb_hash_aset((VALUE)arg, rb_str_new2((char *)name), rb_str_new2((char *)orig));
+    return ST_CONTINUE;
+}
+
+/*
+ * Encoding.aliases: returns a hash of alias name => original encoding name,
+ * built from the registered aliases (enc_table_alias) plus the aliases from
+ * encdb.h that are still pending (enc_table_alias_name).
+ */
+static VALUE
+rb_enc_aliases(VALUE klass)
+{
+    VALUE aliases = rb_hash_new();
+    st_foreach(enc_table_alias, rb_enc_aliases_enc_i, (st_data_t)aliases);
+    st_foreach(enc_table_alias_name, rb_enc_aliases_str_i, (st_data_t)aliases);
+    return aliases;
+}
+
 void
 Init_Encoding(void)
 {
@@ -993,6 +1054,8 @@ Init_Encoding(void)
     rb_define_method(rb_cEncoding, "base_encoding", enc_base_encoding, 0);
     rb_define_method(rb_cEncoding, "dummy?", enc_dummy_p, 0);
     rb_define_singleton_method(rb_cEncoding, "list", enc_list, 0);
+    rb_define_singleton_method(rb_cEncoding, "name_list", rb_enc_name_list, 0);
+    rb_define_singleton_method(rb_cEncoding, "aliases", rb_enc_aliases, 0);
     rb_define_singleton_method(rb_cEncoding, "find", enc_find, 1);
     rb_define_singleton_method(rb_cEncoding, "compatible?", enc_compatible_p, 2);
 
@@ -1005,15 +1068,7 @@ Init_Encoding(void)
 
     /* dummy for unsupported, statefull encoding */
     rb_define_dummy_encoding("ISO-2022-JP");
-    rb_enc_replicate("Windows-31J", rb_enc_from_index(ENCINDEX_SJIS));
-    rb_enc_alias("CP932", "Windows-31J");
-    rb_enc_alias("csWindows31J", "Windows-31J"); /* IANA.  IE6 don't accept Windows-31J but csWindows31J. */
-
-    for (i = 0; i < enc_table_size; ++i) {
-	rb_encoding *enc = enc_table[i].enc;
-	if (enc) set_encoding_const(rb_enc_name(enc), enc);
-    }
-    st_foreach(enc_table_alias, set_encoding_alias, 0);
+    enc_init_db();
 }
 
 /* locale insensitive functions */
diff --git a/regenc.h b/regenc.h
index e393af7537..979fb2c335 100644
--- a/regenc.h
+++ b/regenc.h
@@ -199,4 +199,8 @@ extern int ONIG_ENC_REGISTER(const char *, OnigEncodingType*);
 #define OnigEncodingDefine(f,n) OnigEncodingDeclare(n)
 #endif
 
+/* macros for defining replica encodings and encoding aliases; they expand to nothing and are scanned by enc/make_encdb.rb */
+#define ENC_REPLICATE(name, orig)
+#define ENC_ALIAS(name, orig)
+
 #endif /* REGENC_H */