* re.c (REG_CASESTATE): unused macro removed.

(rb_reg_prepare_re): check encoding difference.
  (rb_reg_initialize): check 8bit byte.

* parse.y (parser_tokadd_escape): fix has8bit.

  [ruby-dev:32113]


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@14002 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
akr 2007-11-23 06:30:26 +00:00
Родитель f1afb3959c
Коммит 2109a52503
4 изменённых файлов: 231 добавлений и 10 удалений

Просмотреть файл

@ -1,3 +1,13 @@
Fri Nov 23 15:27:43 2007 Tanaka Akira <akr@fsij.org>
* re.c (REG_CASESTATE): unused macro removed.
(rb_reg_prepare_re): check encoding difference.
(rb_reg_initialize): check 8bit byte.
* parse.y (parser_tokadd_escape): fix has8bit.
[ruby-dev:32113]
Fri Nov 23 13:34:08 2007 Tanaka Akira <akr@fsij.org>
* struct.c (rb_struct_define_without_accessor): new function.

Просмотреть файл

@ -5206,7 +5206,7 @@ parser_tokadd_escape(struct parser_params *parser, int term,
hex = tok_hex(&numlen);
if (numlen == 0) goto eof;
tokcopy(numlen + 2);
if (hex >= 0x80) *has8bit = ENC_CODERANGE_UNKNOWN;
if (hex >= 0x80) *has8bit = 1;
}
return 0;

26
re.c
Просмотреть файл

@ -132,7 +132,6 @@ rb_memsearch(const void *x0, long m, const void *y0, long n)
}
#define REG_LITERAL FL_USER5
#define REG_CASESTATE FL_USER0
#define KCODE_FIXED FL_USER4
@ -711,14 +710,17 @@ static void
rb_reg_prepare_re(VALUE re, VALUE str)
{
int need_recompile = 0;
int state;
rb_encoding *enc;
rb_reg_check(re);
state = FL_TEST(re, REG_CASESTATE);
/* ignorecase status */
if (ENCODING_GET(re) == 0 && !FL_TEST(re, KCODE_FIXED) &&
(enc = rb_enc_get(str)) != 0 &&
if (ENCODING_GET(re) != 0 || FL_TEST(re, KCODE_FIXED)) {
if (ENCODING_GET(re) != rb_enc_get_index(str) &&
rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
rb_raise(rb_eArgError, "character encodings differ");
}
}
else if ((enc = rb_enc_get(str)) != 0 &&
RREGEXP(re)->ptr->enc != enc) {
need_recompile = 1;
}
@ -755,7 +757,6 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, int pos, int reverse)
OnigEncoding enc;
UChar *p, *string;
rb_reg_check(re);
rb_reg_prepare_re(re, str);
if (reverse) {
@ -795,7 +796,6 @@ rb_reg_search(VALUE re, VALUE str, int pos, int reverse)
return -1;
}
rb_reg_check(re);
rb_reg_prepare_re(re, str);
if (reverse) {
@ -1231,6 +1231,8 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
int options, onig_errmsg_buffer err)
{
struct RRegexp *re = RREGEXP(obj);
int raw8bit;
long i;
if (!OBJ_TAINTED(obj) && rb_safe_level() >= 4)
rb_raise(rb_eSecurityError, "Insecure: can't modify regexp");
@ -1242,8 +1244,16 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
raw8bit = 0;
for (i = 0; i < len; i++) {
if (s[i] & 0x80) {
raw8bit = 1;
break;
}
}
rb_enc_associate((VALUE)re, enc);
if (options & ARG_ENCODING_FIXED) {
if (options & ARG_ENCODING_FIXED || raw8bit) {
re->basic.flags |= KCODE_FIXED;
}
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);

201
test/ruby/test_m17n.rb Normal file
Просмотреть файл

@ -0,0 +1,201 @@
require 'test/unit'
class TestM17N < Test::Unit::TestCase
# Asserts that +actual+ equals the Encoding object that Encoding.find
# returns for +encname+. +message+ is passed through to assert_equal.
def assert_encoding(encname, actual, message=nil)
  expected = Encoding.find(encname)
  assert_equal(expected, actual, message)
end
# Relabels +str+ as ASCII-8BIT with String#force_encoding and returns
# whatever force_encoding returns (the bytes are not converted).
def a(str)
  str.force_encoding("ASCII-8BIT")
end
# Relabels +str+ as EUC-JP with String#force_encoding and returns
# whatever force_encoding returns (the bytes are not converted).
def e(str)
  str.force_encoding("EUC-JP")
end
# Relabels +str+ as Shift_JIS with String#force_encoding and returns
# whatever force_encoding returns (the bytes are not converted).
def s(str)
  str.force_encoding("Shift_JIS")
end
# Relabels +str+ as UTF-8 with String#force_encoding and returns
# whatever force_encoding returns (the bytes are not converted).
def u(str)
  str.force_encoding("UTF-8")
end
# String literals evaluated from ASCII-8BIT-tagged source are expected to
# come back tagged ASCII-8BIT, both for the empty literal and for "a".
def test_string_ascii_literal
  [%{""}, %{"a"}].each do |source|
    assert_encoding("ASCII-8BIT", eval(a(source)).encoding)
  end
end
# Evaluates string literals from EUC-JP-tagged source and checks the
# encoding of each result. Per the table below, literals whose content is
# below 0x80 (written directly or as an escape) are expected to be
# ASCII-8BIT, while those containing a byte >= 0x80 are expected to be EUC-JP.
def test_string_euc_literal
  expectations = [
    ["ASCII-8BIT", %{""}],
    ["ASCII-8BIT", %{"a"}],
    ["EUC-JP",     %{"\xa1\xa1"}],    # raw high bytes inside the evaluated source
    ["EUC-JP",     %{"\\xa1\\xa1"}],  # \x escapes left for eval to interpret
    ["ASCII-8BIT", %{"\\x20"}],
    ["ASCII-8BIT", %{"\\n"}],
    ["EUC-JP",     %{"\\x80"}],
  ]
  expectations.each do |encname, source|
    assert_encoding(encname, eval(e(source)).encoding)
  end
end
# Each regexp source below is expected to be rejected with SyntaxError when
# evaluated. Judging by the test name, the \x escapes spell a truncated
# multibyte character for the encoding selected by the trailing flag
# (/e, /s or /u).
def test_regexp_too_short_multibyte_character
  rejected_sources = [
    '/\xfe/e',
    '/\x8e/e',
    '/\x8f/e',
    '/\x8f\xa1/e',
    '/\xef/s',
    '/\xc0/u',
    '/\xe0\x80/u',
    '/\xf0\x80\x80/u',
    '/\xf8\x80\x80\x80/u',
    '/\xfc\x80\x80\x80\x80/u',
  ]
  rejected_sources.each do |source|
    assert_raise(SyntaxError) { eval(source) }
  end

  # Cases that are currently disabled:
  # raw 8bit
  #assert_raise(SyntaxError) { eval("/\xfe/e") }
  #assert_raise(SyntaxError) { eval("/\xc0/u") }
  # invalid suffix
  #assert_raise(SyntaxError) { eval('/\xc0\xff/u') }
  #assert_raise(SyntaxError) { eval('/\xc0\x20/u') }
end
# Checks regexps written without an encoding flag against strings tagged
# with each of the four encodings (via the a/e/s/u helpers).
def test_regexp_generic
  every_tag = [:a, :e, :s, :u]

  # A plain ASCII pattern: reported as ASCII-8BIT and usable with every tag.
  ascii_re = /a/
  assert_encoding("ASCII-8BIT", ascii_re.encoding)
  every_tag.each do |tag|
    assert_equal(0, ascii_re =~ send(tag, "a"))
  end
  # "\xc0\xa1" is a valid sequence for ASCII-8BIT, EUC-JP, Shift_JIS and UTF-8.
  every_tag.each do |tag|
    assert_equal(nil, ascii_re =~ send(tag, "\xc0\xa1"))
  end

  # A pattern holding raw high bytes, evaluated from ASCII-8BIT source:
  # matching a non-ASCII string with a different tag is expected to raise.
  raw_re = eval(a(%{/\xc0\xa1/}))
  assert_encoding("ASCII-8BIT", raw_re.encoding)
  every_tag.each do |tag|
    assert_equal(nil, raw_re =~ send(tag, "a"))
  end
  assert_equal(0, raw_re =~ a("\xc0\xa1"))
  [:e, :s, :u].each do |tag|
    assert_raise(ArgumentError) { raw_re =~ send(tag, "\xc0\xa1") }
  end

  # xxx: /\xc0\xa1/ should be restricted only for ASCII-8BIT?
  # r = /\xc0\xa1/
  # assert_encoding("ASCII-8BIT", r.encoding)
  # assert_equal(nil, r =~ a("a"))
  # assert_equal(nil, r =~ e("a"))
  # assert_equal(nil, r =~ s("a"))
  # assert_equal(nil, r =~ u("a"))
  # assert_equal(0, r =~ a("\xc0\xa1"))
  # assert_equal(0, r =~ e("\xc0\xa1")) # xxx
  # assert_equal(0, r =~ s("\xc0\xa1")) # xxx
  # assert_equal(0, r =~ u("\xc0\xa1")) # xxx
end
# Checks regexps that report ASCII-8BIT (the /n flag, or source tagged
# ASCII-8BIT) against strings tagged with each of the four encodings.
def test_regexp_ascii
  every_tag = [:a, :e, :s, :u]
  foreign_tags = [:e, :s, :u]

  # Assertions shared by the two /n regexps that contain \xc0\xa1: they
  # match only the ASCII-8BIT-tagged bytes and raise for the other tags.
  check_high_byte_n = lambda do |re|
    assert_encoding("ASCII-8BIT", re.encoding)
    every_tag.each do |tag|
      assert_equal(nil, re =~ send(tag, "a"))
    end
    assert_equal(0, re =~ a("\xc0\xa1"))
    foreign_tags.each do |tag|
      assert_raise(ArgumentError) { re =~ send(tag, "\xc0\xa1") }
    end
  end

  # ASCII-only pattern with /n.
  ascii_n = /a/n
  assert_encoding("ASCII-8BIT", ascii_n.encoding)
  every_tag.each do |tag|
    assert_equal(0, ascii_n =~ send(tag, "a"))
  end
  assert_equal(nil, ascii_n =~ a("\xc0\xa1"))
  foreign_tags.each do |tag|
    assert_raise(ArgumentError) { ascii_n =~ send(tag, "\xc0\xa1") }
  end

  # High bytes written as escapes in a literal, then as raw bytes in
  # evaluated source; each regexp is checked right after it is built.
  check_high_byte_n.call(/\xc0\xa1/n)
  check_high_byte_n.call(eval(%{/\xc0\xa1/n}.force_encoding("ASCII-8BIT")))

  # Escapes in evaluated ASCII-8BIT source without the /n flag.
  escaped_re = eval(%q{/\xc0\xa1/}.force_encoding("ASCII-8BIT"))
  assert_encoding("ASCII-8BIT", escaped_re.encoding)
  every_tag.each do |tag|
    assert_equal(nil, escaped_re =~ send(tag, "a"))
  end
  assert_equal(0, escaped_re =~ a("\xc0\xa1"))
  # assert_raise(ArgumentError) { r =~ e("\xc0\xa1") }
  # assert_raise(ArgumentError) { r =~ s("\xc0\xa1") }
  # assert_raise(ArgumentError) { r =~ u("\xc0\xa1") }
end
# Checks regexps that report EUC-JP (the /e flag, or source tagged EUC-JP)
# against strings tagged with each of the four encodings.
def test_regexp_euc
  every_tag = [:a, :e, :s, :u]

  # Assertions shared by the three regexps that contain \xc0\xa1: they
  # match only the EUC-JP-tagged bytes and raise for the other tags.
  check_high_byte_euc = lambda do |re|
    assert_encoding("EUC-JP", re.encoding)
    every_tag.each do |tag|
      assert_equal(nil, re =~ send(tag, "a"))
    end
    assert_raise(ArgumentError) { re =~ a("\xc0\xa1") }
    assert_equal(0, re =~ e("\xc0\xa1"))
    [:s, :u].each do |tag|
      assert_raise(ArgumentError) { re =~ send(tag, "\xc0\xa1") }
    end
  end

  # ASCII-only pattern with /e.
  ascii_e = /a/e
  assert_encoding("EUC-JP", ascii_e.encoding)
  every_tag.each do |tag|
    assert_equal(0, ascii_e =~ send(tag, "a"))
  end
  assert_raise(ArgumentError) { ascii_e =~ a("\xc0\xa1") }
  assert_equal(nil, ascii_e =~ e("\xc0\xa1"))
  [:s, :u].each do |tag|
    assert_raise(ArgumentError) { ascii_e =~ send(tag, "\xc0\xa1") }
  end

  # The same high-byte pattern written three ways; each regexp is checked
  # right after it is built.
  check_high_byte_euc.call(/\xc0\xa1/e)
  check_high_byte_euc.call(eval(%{/\xc0\xa1/}.force_encoding("EUC-JP")))
  check_high_byte_euc.call(eval(%q{/\xc0\xa1/}.force_encoding("EUC-JP")))
end
# Matches /(a)/ against an EUC-JP-tagged string made of ten high bytes
# followed by "a" and checks the reported match positions. The expected
# begin/end of 5 and 6 imply that the ten leading bytes are counted as
# five units (presumably two-byte EUC-JP characters).
def test_begin_end_offset
  subject = e("\244\242\244\244\244\246\244\250\244\252a")
  assert(/(a)/ =~ subject)
  assert_equal("a", $&)
  match = $~
  # Group 0 (whole match) and group 1 (the capture) cover the same span.
  [0, 1].each do |group|
    assert_equal(5, match.begin(group))
    assert_equal(6, match.end(group))
    assert_equal([5,6], match.offset(group))
  end
end
# Matches /@/ against the Shift_JIS-tagged bytes "\x81@@". The assertions
# expect the match to be the final "@", with "\x81@" as the text before
# it, nothing after it, and an offset of [1,2].
def test_begin_end_offset_sjis
  subject = s("\x81@@")
  assert(/@/ =~ subject)
  match = $~
  assert_equal(s("\x81@"), match.pre_match)
  assert_equal("@", match[0])
  assert_equal("", match.post_match)
  assert_equal([1,2], match.offset(0))
end
end