зеркало из https://github.com/github/ruby.git
\d, \s and \w are now non-Unicode classes. [ruby-dev:39026]
* include/ruby/oniguruma.h (ONIGENC_CTYPE_SPECIAL_MASK): added. (ONIGENC_CTYPE_D): ditto. (ONIGENC_CTYPE_S): ditto. (ONIGENC_CTYPE_W): ditto. * regparse.c: \d, \s and \w are now non Unicode class. [ruby-dev:39026] (fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w. (fetch_token): ditto. (add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW]. (parse_exp): ditto. * test/ruby/test_regexp.rb (TestRegexp#test_char_class): add tests for above. git-svn-id: svn+ssh://ci.ruby-lang.org/ruby/trunk@24544 b2dd03c8-39d4-4d8f-98ff-823fe69b080e
This commit is contained in:
Родитель
fefb793f65
Коммит
249697855e
18
ChangeLog
18
ChangeLog
|
@ -1,3 +1,21 @@
|
|||
Sun Aug 16 00:30:33 2009 NARUSE, Yui <naruse@ruby-lang.org>
|
||||
|
||||
* include/ruby/oniguruma.h
|
||||
(ONIGENC_CTYPE_SPECIAL_MASK): added.
|
||||
(ONIGENC_CTYPE_D): ditto.
|
||||
(ONIGENC_CTYPE_S): ditto.
|
||||
(ONIGENC_CTYPE_W): ditto.
|
||||
|
||||
* regparse.c: \d, \s and \w are now non Unicode class.
|
||||
[ruby-dev:39026]
|
||||
(fetch_token_in_cc): use ONIGENC_CTYPE_[DSW] for \d/\s/\w.
|
||||
(fetch_token): ditto.
|
||||
(add_ctype_to_cc): add routines for ONIGENC_CTYPE_[DSW].
|
||||
(parse_exp): ditto.
|
||||
|
||||
* test/ruby/test_regexp.rb (TestRegexp#test_char_class):
|
||||
add tests for above.
|
||||
|
||||
Sat Aug 15 10:39:53 2009 Nobuyoshi Nakada <nobu@ruby-lang.org>
|
||||
|
||||
* parse.y (fname, string_dvar, sym, dsym, f_arglist): removed
|
||||
|
|
|
@ -200,6 +200,14 @@ ONIG_EXTERN OnigEncodingType OnigEncodingASCII;
|
|||
#define ONIGENC_CTYPE_ALNUM 13 /* alpha || digit */
|
||||
#define ONIGENC_CTYPE_ASCII 14
|
||||
#define ONIGENC_MAX_STD_CTYPE ONIGENC_CTYPE_ASCII
|
||||
/*
 * Marker bit for the ASCII-only character types behind \d, \s and \w.
 * It is OR-ed onto a standard ONIGENC_CTYPE_* value; the standard values
 * visible here go up to ONIGENC_MAX_STD_CTYPE (14), so bit 7 does not
 * collide with them.
 */
#define ONIGENC_CTYPE_SPECIAL_MASK 128

/*
 * The replacement lists are fully parenthesized so the macros stay a single
 * operand in any expression.  Without the parentheses an expression such as
 * `ONIGENC_CTYPE_D ^ ONIGENC_CTYPE_SPECIAL_MASK` would bind as
 * `MASK | (DIGIT ^ MASK)` because `^` and `&` outrank `|`.
 */

/* \s : [\t\n\v\f\r\s] */
#define ONIGENC_CTYPE_S (ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_SPACE)

/* \d : [0-9] */
#define ONIGENC_CTYPE_D (ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_DIGIT)

/* \w : [0-9A-Za-z_] */
#define ONIGENC_CTYPE_W (ONIGENC_CTYPE_SPECIAL_MASK | ONIGENC_CTYPE_WORD)

/*
 * Non-zero when `ctype` carries the special marker bit.  The result is the
 * mask value itself (128), not 1, so use it only in a boolean context.
 */
#define ONIGENC_CTYPE_SPECIAL_P(ctype) ((ctype) & ONIGENC_CTYPE_SPECIAL_MASK)
|
||||
|
||||
|
||||
#define onig_enc_len(enc,p,e) ONIGENC_MBC_ENC_LEN(enc, p, e)
|
||||
|
|
59
regparse.c
59
regparse.c
|
@ -2974,32 +2974,32 @@ fetch_token_in_cc(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
|
|||
switch (c) {
|
||||
case 'w':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_W;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
case 'W':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_W;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
case 'd':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_D;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
case 'D':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_D;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
case 's':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_S;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
case 'S':
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_S;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
case 'h':
|
||||
|
@ -3261,14 +3261,14 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
|
|||
case 'w':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_W;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
|
||||
case 'W':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_W_WORD)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_WORD;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_W;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
|
||||
|
@ -3301,28 +3301,28 @@ fetch_token(OnigToken* tok, UChar** src, UChar* end, ScanEnv* env)
|
|||
case 's':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_S;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
|
||||
case 'S':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_S_WHITE_SPACE)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_SPACE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_S;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
|
||||
case 'd':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_D;
|
||||
tok->u.prop.not = 0;
|
||||
break;
|
||||
|
||||
case 'D':
|
||||
if (! IS_SYNTAX_OP(syn, ONIG_SYN_OP_ESC_D_DIGIT)) break;
|
||||
tok->type = TK_CHAR_TYPE;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_DIGIT;
|
||||
tok->u.prop.ctype = ONIGENC_CTYPE_D;
|
||||
tok->u.prop.not = 1;
|
||||
break;
|
||||
|
||||
|
@ -3864,6 +3864,28 @@ add_ctype_to_cc(CClassNode* cc, int ctype, int not, ScanEnv* env)
|
|||
OnigCodePoint sb_out;
|
||||
OnigEncoding enc = env->enc;
|
||||
|
||||
switch (ctype) {
|
||||
case ONIGENC_CTYPE_D:
|
||||
case ONIGENC_CTYPE_S:
|
||||
case ONIGENC_CTYPE_W:
|
||||
ctype ^= ONIGENC_CTYPE_SPECIAL_MASK;
|
||||
if (not != 0) {
|
||||
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
|
||||
if (! ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
|
||||
BITSET_SET_BIT_CHKDUP(cc->bs, c);
|
||||
}
|
||||
ADD_ALL_MULTI_BYTE_RANGE(enc, cc->mbuf);
|
||||
}
|
||||
else {
|
||||
for (c = 0; c < SINGLE_BYTE_SIZE; c++) {
|
||||
if (ONIGENC_IS_ASCII_CODE_CTYPE((OnigCodePoint )c, ctype))
|
||||
BITSET_SET_BIT_CHKDUP(cc->bs, c);
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
break;
|
||||
}
|
||||
|
||||
r = ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges);
|
||||
if (r == 0) {
|
||||
return add_ctype_to_cc_by_range(cc, ctype, not, env, sb_out, ranges);
|
||||
|
@ -5212,6 +5234,19 @@ parse_exp(Node** np, OnigToken* tok, int term,
|
|||
case TK_CHAR_TYPE:
|
||||
{
|
||||
switch (tok->u.prop.ctype) {
|
||||
case ONIGENC_CTYPE_D:
|
||||
case ONIGENC_CTYPE_S:
|
||||
case ONIGENC_CTYPE_W:
|
||||
{
|
||||
CClassNode* cc;
|
||||
*np = node_new_cclass();
|
||||
CHECK_NULL_RETURN_MEMERR(*np);
|
||||
cc = NCCLASS(*np);
|
||||
add_ctype_to_cc(cc, tok->u.prop.ctype, 0, env);
|
||||
if (tok->u.prop.not != 0) NCCLASS_SET_NOT(cc);
|
||||
}
|
||||
break;
|
||||
|
||||
case ONIGENC_CTYPE_WORD:
|
||||
*np = node_new_ctype(tok->u.prop.ctype, tok->u.prop.not);
|
||||
CHECK_NULL_RETURN_MEMERR(*np);
|
||||
|
|
|
@ -665,6 +665,13 @@ class TestRegexp < Test::Unit::TestCase
|
|||
check(/\A[[^b-c]&&[^e]&&a-f]\z/, %w(a d f), %w(b c e g 0))
|
||||
check(/\A[\n\r\t]\z/, ["\n", "\r", "\t"])
|
||||
failcheck('[9-1]')
|
||||
|
||||
assert_match(/\A\d+\z/, "0123456789")
|
||||
assert_no_match(/\d/, "\uff10\uff11\uff12\uff13\uff14\uff15\uff16\uff17\uff18\uff19")
|
||||
assert_match(/\A\w+\z/, "09azAZ_")
|
||||
assert_no_match(/\w/, "\uff10\uff19\uff41\uff5a\uff21\uff3a")
|
||||
assert_match(/\A\s+\z/, "\r\n\v\f\r\s")
|
||||
assert_no_match(/\s/, "\u0085")
|
||||
end
|
||||
|
||||
def test_posix_bracket
|
||||
|
|
Загрузка…
Ссылка в новой задаче