diff --git a/ChangeLog b/ChangeLog index 2bd1d06064..322eb95c9a 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,13 @@ +Fri Nov 23 15:27:43 2007 Tanaka Akira + + * re.c (REG_CASESTATE): unused macro removed. + (rb_reg_prepare_re): check encoding difference. + (rb_reg_initialize): check 8bit byte. + + * parse.y (parser_tokadd_escape): fix has8bit. + + [ruby-dev:32113] + Fri Nov 23 13:34:08 2007 Tanaka Akira * struct.c (rb_struct_define_without_accessor): new function. diff --git a/parse.y b/parse.y index adae072923..75d39042f4 100644 --- a/parse.y +++ b/parse.y @@ -5206,7 +5206,7 @@ parser_tokadd_escape(struct parser_params *parser, int term, hex = tok_hex(&numlen); if (numlen == 0) goto eof; tokcopy(numlen + 2); - if (hex >= 0x80) *has8bit = ENC_CODERANGE_UNKNOWN; + if (hex >= 0x80) *has8bit = 1; } return 0; diff --git a/re.c b/re.c index 4e915f6f78..8c17e98592 100644 --- a/re.c +++ b/re.c @@ -132,7 +132,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n) } #define REG_LITERAL FL_USER5 -#define REG_CASESTATE FL_USER0 #define KCODE_FIXED FL_USER4 @@ -711,15 +710,18 @@ static void rb_reg_prepare_re(VALUE re, VALUE str) { int need_recompile = 0; - int state; rb_encoding *enc; rb_reg_check(re); - state = FL_TEST(re, REG_CASESTATE); /* ignorecase status */ - if (ENCODING_GET(re) == 0 && !FL_TEST(re, KCODE_FIXED) && - (enc = rb_enc_get(str)) != 0 && - RREGEXP(re)->ptr->enc != enc) { + if (ENCODING_GET(re) != 0 || FL_TEST(re, KCODE_FIXED)) { + if (ENCODING_GET(re) != rb_enc_get_index(str) && + rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) { + rb_raise(rb_eArgError, "character encodings differ"); + } + } + else if ((enc = rb_enc_get(str)) != 0 && + RREGEXP(re)->ptr->enc != enc) { need_recompile = 1; } @@ -755,7 +757,6 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse) OnigEncoding enc; UChar *p, *string; - rb_reg_check(re); rb_reg_prepare_re(re, str); if (reverse) { @@ -795,7 +796,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse) return -1; } - rb_reg_check(re); rb_reg_prepare_re(re, str); if (reverse) { @@ -1231,6 +1231,8 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, int options, onig_errmsg_buffer err) { struct RRegexp *re = RREGEXP(obj); + int raw8bit; + long i; if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4) rb_raise(rb_eSecurityError, "Insecure: can't modify regexp"); @@ -1242,8 +1244,16 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc, re->ptr = 0; re->str = 0; + raw8bit = 0; + for (i = 0; i < len; i++) { + if (s[i] & 0x80) { + raw8bit = 1; + break; + } + } + rb_enc_associate((VALUE)re, enc); - if (options & ARG_ENCODING_FIXED) { + if (options & ARG_ENCODING_FIXED || raw8bit) { re->basic.flags |= KCODE_FIXED; } re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err); diff --git a/test/ruby/test_m17n.rb b/test/ruby/test_m17n.rb new file mode 100644 index 0000000000..c50c6b8384 --- /dev/null +++ b/test/ruby/test_m17n.rb @@ -0,0 +1,201 @@ +require 'test/unit' + +class TestM17N < Test::Unit::TestCase + def assert_encoding(encname, actual, message=nil) + assert_equal(Encoding.find(encname), actual, message) + end + + def a(str) str.force_encoding("ASCII-8BIT") end + def e(str) str.force_encoding("EUC-JP") end + def s(str) str.force_encoding("Shift_JIS") end + def u(str) str.force_encoding("UTF-8") end + + def test_string_ascii_literal + assert_encoding("ASCII-8BIT", eval(a(%{""})).encoding) + assert_encoding("ASCII-8BIT", eval(a(%{"a"})).encoding) + end + + def test_string_euc_literal + assert_encoding("ASCII-8BIT", eval(e(%{""})).encoding) + assert_encoding("ASCII-8BIT", eval(e(%{"a"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\xa1\xa1"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\xa1\\xa1"})).encoding) + assert_encoding("ASCII-8BIT", eval(e(%{"\\x20"})).encoding) + assert_encoding("ASCII-8BIT", eval(e(%{"\\n"})).encoding) + assert_encoding("EUC-JP", eval(e(%{"\\x80"})).encoding) + end + + def test_regexp_too_short_multibyte_character + assert_raise(SyntaxError) { eval('/\xfe/e') } + assert_raise(SyntaxError) { eval('/\x8e/e') } + assert_raise(SyntaxError) { eval('/\x8f/e') } + assert_raise(SyntaxError) { eval('/\x8f\xa1/e') } + assert_raise(SyntaxError) { eval('/\xef/s') } + assert_raise(SyntaxError) { eval('/\xc0/u') } + assert_raise(SyntaxError) { eval('/\xe0\x80/u') } + assert_raise(SyntaxError) { eval('/\xf0\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xf8\x80\x80\x80/u') } + assert_raise(SyntaxError) { eval('/\xfc\x80\x80\x80\x80/u') } + + # raw 8bit + #assert_raise(SyntaxError) { eval("/\xfe/e") } + #assert_raise(SyntaxError) { eval("/\xc0/u") } + + # invalid suffix + #assert_raise(SyntaxError) { eval('/\xc0\xff/u') } + #assert_raise(SyntaxError) { eval('/\xc0\x20/u') } + end + + def test_regexp_generic + r = /a/ + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + + # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8. + assert_equal(nil, r =~ a("\xc0\xa1")) + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_equal(nil, r =~ s("\xc0\xa1")) + assert_equal(nil, r =~ u("\xc0\xa1")) + + r = eval(a(%{/\xc0\xa1/})) + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + # xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT? + # r = /\xc0\xa1/ + # assert_encoding("ASCII-8BIT", r.encoding) + # assert_equal(nil, r =~ a("a")) + # assert_equal(nil, r =~ e("a")) + # assert_equal(nil, r =~ s("a")) + # assert_equal(nil, r =~ u("a")) + # assert_equal(0, r =~ a("\xc0\xa1")) + # assert_equal(0, r =~ e("\xc0\xa1")) # xxx + # assert_equal(0, r =~ s("\xc0\xa1")) # xxx + # assert_equal(0, r =~ u("\xc0\xa1")) # xxx + end + + def test_regexp_ascii + r = /a/n + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_equal(nil, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = /\xc0\xa1/n + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT")) + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT")) + assert_encoding("ASCII-8BIT", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_equal(0, r =~ a("\xc0\xa1")) + # assert_raise(ArgumentError) { r =~ e("\xc0\xa1") } + # assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + # assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + end + + def test_regexp_euc + r = /a/e + assert_encoding("EUC-JP", r.encoding) + assert_equal(0, r =~ a("a")) + assert_equal(0, r =~ e("a")) + assert_equal(0, r =~ s("a")) + assert_equal(0, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(nil, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = /\xc0\xa1/e + assert_encoding("EUC-JP", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(0, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = eval(%{/\xc0\xa1/}.force_encoding("EUC-JP")) + assert_encoding("EUC-JP", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(0, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + + r = eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP")) + assert_encoding("EUC-JP", r.encoding) + assert_equal(nil, r =~ a("a")) + assert_equal(nil, r =~ e("a")) + assert_equal(nil, r =~ s("a")) + assert_equal(nil, r =~ u("a")) + assert_raise(ArgumentError) { r =~ a("\xc0\xa1") } + assert_equal(0, r =~ e("\xc0\xa1")) + assert_raise(ArgumentError) { r =~ s("\xc0\xa1") } + assert_raise(ArgumentError) { r =~ u("\xc0\xa1") } + end + + def test_begin_end_offset + str = e("\244\242\244\244\244\246\244\250\244\252a") + assert(/(a)/ =~ str) + assert_equal("a", $&) + assert_equal(5, $~.begin(0)) + assert_equal(6, $~.end(0)) + assert_equal([5,6], $~.offset(0)) + assert_equal(5, $~.begin(1)) + assert_equal(6, $~.end(1)) + assert_equal([5,6], $~.offset(1)) + end + + def test_begin_end_offset_sjis + str = s("\x81@@") + assert(/@/ =~ str) + assert_equal(s("\x81@"), $`) + assert_equal("@", $&) + assert_equal("", $') + assert_equal([1,2], $~.offset(0)) + end + +end