* parse.y (parser_regx_options, reg_compile_gen): relaxed encoding
  matching rule.

* re.c (rb_reg_initialize): always set encoding of Regexp.

* re.c (rb_reg_initialize_str): fix encoding for non 7bit-clean
  strings.

* re.c (rb_reg_initialize_m): use ascii encoding for 'n' option.


git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@13743 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
nobu 2007-10-19 07:41:03 +00:00
Родитель a4e493a985
Коммит c7697aba34
3 изменённых файлов: 71 добавлений и 37 удалений

Просмотреть файл

@ -1,3 +1,15 @@
Fri Oct 19 16:41:00 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* parse.y (parser_regx_options, reg_compile_gen): relaxed encoding
matching rule.
* re.c (rb_reg_initialize): always set encoding of Regexp.
* re.c (rb_reg_initialize_str): fix encoding for non 7bit-clean
strings.
* re.c (rb_reg_initialize_m): use ascii encoding for 'n' option.
Fri Oct 19 11:09:56 2007 Nobuyoshi Nakada <nobu@ruby-lang.org>
* ruby.c (process_options): set primary encoding from the parser

40
parse.y
Просмотреть файл

@ -261,7 +261,7 @@ struct parser_params {
};
#define STR_NEW(p,n) rb_enc_str_new((p),(n),parser->enc)
#define STR_NEW0() rb_enc_str_new(0,0,rb_enc_from_index(0))
#define STR_NEW0() rb_str_new(0,0)
#define STR_NEW2(p) rb_enc_str_new((p),strlen(p),parser->enc)
#define STR_NEW3(p,n,m) parser_str_new((p),(n),STR_ENC(!ENC_SINGLE(m)),(m))
#define STR_ENC(m) ((m)?parser->enc:rb_enc_from_index(0))
@ -443,6 +443,10 @@ static int lvar_defined_gen(struct parser_params*, ID);
#define lvar_defined(id) lvar_defined_gen(parser, id)
#define RE_OPTION_ONCE (1<<16)
#define RE_OPTION_ENCODING_SHIFT 8
#define RE_OPTION_ENCODING(e) (((e)&0xff)<<RE_OPTION_ENCODING_SHIFT)
#define RE_OPTION_ENCODING_IDX(o) (((o)>>RE_OPTION_ENCODING_SHIFT)&0xff)
#define RE_OPTION_MASK 0xff
#define NODE_STRTERM NODE_ZARRAY /* nothing to gc */
#define NODE_HEREDOC NODE_ARRAY /* 1, 3 to gc */
@ -3639,14 +3643,14 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END
int options = $3;
NODE *node = $2;
if (!node) {
node = NEW_LIT(reg_compile(0, options & ~RE_OPTION_ONCE));
node = NEW_LIT(reg_compile(STR_NEW0(), options));
}
else switch (nd_type(node)) {
case NODE_STR:
{
VALUE src = node->nd_lit;
nd_set_type(node, NODE_LIT);
node->nd_lit = reg_compile(src, options&~RE_OPTION_ONCE);
node->nd_lit = reg_compile(src, options);
}
break;
default:
@ -3658,7 +3662,7 @@ regexp : tREGEXP_BEG xstring_contents tREGEXP_END
else {
nd_set_type(node, NODE_DREGX);
}
node->nd_cflag = options & ~RE_OPTION_ONCE;
node->nd_cflag = options & RE_OPTION_MASK;
break;
}
$$ = node;
@ -5110,11 +5114,12 @@ parser_tokadd_escape(struct parser_params *parser, int term, int *mb)
return 0;
}
extern int rb_char_to_option_kcode(int c, int *option, int *kcode);
static int
parser_regx_options(struct parser_params *parser)
{
extern int rb_char_to_option_kcode(int c, int *option, int *kcode);
int kcode = 0;
int options = 0;
int c, opt, kc;
@ -5125,11 +5130,7 @@ parser_regx_options(struct parser_params *parser)
}
else if (rb_char_to_option_kcode(c, &opt, &kc)) {
options |= opt;
if (kc != 0 && rb_enc_from_index(kc) != parser->enc) {
compile_error(PARSER_ARG
"regexp encoding option '%c' mismatch to %s",
c, rb_enc_name(parser->enc));
}
if (kc >= 0) kcode = c;
}
else {
tokadd(c);
@ -5141,7 +5142,7 @@ parser_regx_options(struct parser_params *parser)
compile_error(PARSER_ARG "unknown regexp option%s - %s",
toklen() > 1 ? "s" : "", tok());
}
return options;
return options | RE_OPTION_ENCODING(kcode);
}
#define STR_FUNC_ESCAPE 0x01
@ -8212,8 +8213,21 @@ VALUE rb_reg_compile(VALUE str, int options);
static VALUE
reg_compile_gen(struct parser_params* parser, VALUE str, int options)
{
VALUE re = rb_reg_compile(str, (options) & ~RE_OPTION_ONCE);
VALUE re;
int c = RE_OPTION_ENCODING_IDX(options);
if (c) {
int opt, idx;
rb_char_to_option_kcode(c, &opt, &idx);
if (idx != ENCODING_GET(str) && ENCODING_GET(str) &&
rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
compile_error(PARSER_ARG
"regexp encoding option '%c' differs from source encoding '%s'",
c, rb_enc_name(rb_enc_get(str)));
}
ENCODING_SET(str, idx);
}
re = rb_reg_compile(str, options & RE_OPTION_MASK);
if (NIL_P(re)) {
RB_GC_GUARD(re) = rb_obj_as_string(rb_errinfo());
compile_error(PARSER_ARG "%s", RSTRING_PTR(re));

56
re.c
Просмотреть файл

@ -136,8 +136,11 @@ rb_memsearch(const void *x0, long m, const void *y0, long n)
#define KCODE_FIXED FL_USER4
#define ARG_REG_OPTION_MASK 0x0f
#define ARG_KCODE_NONE 0x10
#define ARG_REG_OPTION_MASK \
(ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND)
#define ARG_ENCODING_FIXED 16
#define ARG_KCODE_NONE 0
#define ARG_KCODE_EUC 1
#define ARG_KCODE_SJIS 2
#define ARG_KCODE_UTF8 3
@ -157,9 +160,6 @@ char_to_option(int c)
case 'm':
val = ONIG_OPTION_MULTILINE;
break;
case 'n':
val = ARG_KCODE_NONE;
break;
default:
val = 0;
break;
@ -184,19 +184,24 @@ rb_char_to_option_kcode(int c, int *option, int *kcode)
*option = 0;
switch (c) {
case 'n':
*kcode = ARG_KCODE_NONE;
break;
case 'e':
*kcode = ARG_KCODE_EUC;
return 1;
break;
case 's':
*kcode = ARG_KCODE_SJIS;
return 1;
break;
case 'u':
*kcode = ARG_KCODE_UTF8;
return 1;
break;
default:
*kcode = 0;
*kcode = -1;
return (*option = char_to_option(c));
}
*option = ARG_ENCODING_FIXED;
return 1;
}
static void
@ -1227,14 +1232,10 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
re->ptr = 0;
re->str = 0;
if (options & ARG_KCODE_NONE) {
rb_enc_associate_index((VALUE)re, 0);
enc = rb_enc_from_index(0);
rb_enc_associate((VALUE)re, enc);
if (options & ARG_ENCODING_FIXED) {
re->basic.flags |= KCODE_FIXED;
}
else {
rb_enc_associate((VALUE)re, enc);
}
re->ptr = make_regexp(s, len, enc, options & ARG_REG_OPTION_MASK, err);
if (!re->ptr) return -1;
re->str = ALLOC_N(char, len+1);
@ -1247,6 +1248,9 @@ rb_reg_initialize(VALUE obj, const char *s, int len, rb_encoding *enc,
/*
 * Initialize the regexp object `obj` from the String `str`.
 *
 * Delegates to rb_reg_initialize() with str's byte pointer, length and
 * encoding.  Before delegating, ARG_ENCODING_FIXED is added to `options`
 * when the string's code range is anything other than
 * ENC_CODERANGE_SINGLE (presumably: the string is not 7bit-clean, per the
 * commit message -- confirm against the encoding headers).
 *
 * `err` is passed through untouched to rb_reg_initialize().
 * Returns whatever rb_reg_initialize() returns; the caller visible in
 * this diff treats a non-zero result as failure.
 */
static int
rb_reg_initialize_str(VALUE obj, VALUE str, int options, onig_errmsg_buffer err)
{
/* Anything but a "single" code range pins the regexp's encoding. */
if (rb_enc_str_coderange(str) != ENC_CODERANGE_SINGLE) {
options |= ARG_ENCODING_FIXED;
}
/* The source bytes and the encoding both come from the same String. */
return rb_reg_initialize(obj, RSTRING_PTR(str), RSTRING_LEN(str), rb_enc_get(str),
options, err);
}
@ -1573,21 +1577,21 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
onig_errmsg_buffer err;
int flags = 0;
VALUE str;
rb_encoding *enc;
const char *ptr;
long len;
if (argc == 0 || argc > 3) {
rb_raise(rb_eArgError, "wrong number of arguments");
}
if (TYPE(argv[0]) == T_REGEXP) {
VALUE re = argv[0];
const char *ptr;
long len;
rb_encoding *enc;
if (argc > 1) {
rb_warn("flags ignored");
}
rb_reg_check(re);
flags = RREGEXP(argv[0])->ptr->options & ARG_REG_OPTION_MASK;
flags = rb_reg_options(re);
ptr = RREGEXP(re)->str;
len = RREGEXP(re)->len;
enc = rb_enc_get(re);
@ -1601,18 +1605,22 @@ rb_reg_initialize_m(int argc, VALUE *argv, VALUE self)
if (FIXNUM_P(argv[1])) flags = FIX2INT(argv[1]);
else if (RTEST(argv[1])) flags = ONIG_OPTION_IGNORECASE;
}
enc = 0;
if (argc == 3 && !NIL_P(argv[2])) {
char *kcode = StringValuePtr(argv[2]);
if (kcode[0] == 'n' || kcode[1] == 'N') {
flags |= ARG_KCODE_NONE;
enc = rb_enc_from_index(0);
flags |= ARG_ENCODING_FIXED;
}
else {
rb_warning("encoding option is obsolete - %s", kcode);
}
}
str = argv[0];
StringValueCStr(str);
if (rb_reg_initialize_str(self, str, flags, err)) {
ptr = StringValueCStr(str);
if (enc
? rb_reg_initialize(self, ptr, RSTRING_LEN(str), enc, flags, err)
: rb_reg_initialize_str(self, str, flags, err)) {
rb_reg_raise_str(str, flags, err);
}
}
@ -1731,8 +1739,8 @@ rb_reg_options(VALUE re)
int options;
rb_reg_check(re);
options = RREGEXP(re)->ptr->options &
(ONIG_OPTION_IGNORECASE|ONIG_OPTION_MULTILINE|ONIG_OPTION_EXTEND);
options = RREGEXP(re)->ptr->options & ARG_REG_OPTION_MASK;
if (RBASIC(re)->flags & KCODE_FIXED) options |= ARG_ENCODING_FIXED;
return options;
}