зеркало из https://github.com/github/ruby.git
Optimize regexp matching for look-around and atomic groups (#7931)
This commit is contained in:
Родитель
13c9cbe09e
Коммит
34cb174800
14
regcomp.c
14
regcomp.c
|
@ -1273,6 +1273,10 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg)
|
|||
break;
|
||||
|
||||
case ENCLOSE_STOP_BACKTRACK:
|
||||
/* Disable POP_STOP_BT optimization for simple repeat under the match cache */
|
||||
/* optimization because the match cache optimization pushes an extra item to */
|
||||
/* the stack and it breaks the assumption for this optimization. */
|
||||
#ifndef USE_MATCH_CACHE
|
||||
if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
|
||||
QtfrNode* qn = NQTFR(node->target);
|
||||
tlen = compile_length_tree(qn->target, reg);
|
||||
|
@ -1282,8 +1286,11 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg)
|
|||
+ SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
|
||||
}
|
||||
else {
|
||||
#endif
|
||||
len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
|
||||
#ifndef USE_MATCH_CACHE
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
|
||||
case ENCLOSE_CONDITION:
|
||||
|
@ -1395,6 +1402,10 @@ compile_enclose_node(EncloseNode* node, regex_t* reg)
|
|||
break;
|
||||
|
||||
case ENCLOSE_STOP_BACKTRACK:
|
||||
/* Disable POP_STOP_BT optimization for simple repeat under the match cache */
|
||||
/* optimization because the match cache optimization pushes an extra item to */
|
||||
/* the stack and it breaks the assumption for this optimization. */
|
||||
#ifndef USE_MATCH_CACHE
|
||||
if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
|
||||
QtfrNode* qn = NQTFR(node->target);
|
||||
r = compile_tree_n_times(qn->target, qn->lower, reg);
|
||||
|
@ -1413,12 +1424,15 @@ compile_enclose_node(EncloseNode* node, regex_t* reg)
|
|||
-((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
|
||||
}
|
||||
else {
|
||||
#endif
|
||||
r = add_opcode(reg, OP_PUSH_STOP_BT);
|
||||
if (r) return r;
|
||||
r = compile_tree(node->target, reg);
|
||||
if (r) return r;
|
||||
r = add_opcode(reg, OP_POP_STOP_BT);
|
||||
#ifndef USE_MATCH_CACHE
|
||||
}
|
||||
#endif
|
||||
break;
|
||||
|
||||
case ENCLOSE_CONDITION:
|
||||
|
|
377
regexec.c
377
regexec.c
|
@ -242,9 +242,6 @@ The `Regexp#match` optimization by using a cache.
|
|||
"cache opcode"
|
||||
A cachable opcode (e.g. `OP_PUSH`, `OP_REPEAT`, etc).
|
||||
It is corresponding to some cache points.
|
||||
`OP_NULL_CHECK_START` and `OP_NULL_CHECK_END_MEMST` are exceptions.
|
||||
They are counted as cache opcodes, but they have no corresponding cache points.
|
||||
They are used for resetting a match cache buffer when a null loop is detected.
|
||||
|
||||
"cache point"
|
||||
A cachable point on matching.
|
||||
|
@ -258,7 +255,7 @@ We encode a match cache point to an integer value by the following equation:
|
|||
"match cache point" = "position on input string" * "total number of cache points" + "cache point"
|
||||
|
||||
"match cache buffer"
|
||||
A bit-array for recording (caching) match cache points once arrived.
|
||||
A bit-array for memoizing (recording) match cache points once backtracked.
|
||||
*/
|
||||
|
||||
/* count the total number of cache opcodes for allocating a match cache buffer. */
|
||||
|
@ -272,7 +269,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
|
|||
OnigEncoding enc = reg->enc;
|
||||
MemNumType current_repeat_mem = -1;
|
||||
long num_cache_opcodes = 0;
|
||||
long num_cache_opcodes_at_null_check_start = -1;
|
||||
int lookaround_nesting = 0;
|
||||
|
||||
while (p < pend) {
|
||||
switch (*p++) {
|
||||
|
@ -326,7 +323,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
|
|||
break;
|
||||
case OP_ANYCHAR_STAR:
|
||||
case OP_ANYCHAR_ML_STAR:
|
||||
num_cache_opcodes++;break;
|
||||
num_cache_opcodes++; break;
|
||||
case OP_ANYCHAR_STAR_PEEK_NEXT:
|
||||
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
|
||||
p++; num_cache_opcodes++; break;
|
||||
|
@ -370,7 +367,12 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
|
|||
case OP_MEMORY_END_PUSH_REC:
|
||||
case OP_MEMORY_END:
|
||||
case OP_MEMORY_END_REC:
|
||||
p += SIZE_MEMNUM; break;
|
||||
p += SIZE_MEMNUM;
|
||||
// A memory (capture) in look-around is found.
|
||||
if (lookaround_nesting != 0) {
|
||||
goto impossible;
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_KEEP:
|
||||
break;
|
||||
|
@ -422,37 +424,60 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
|
|||
goto impossible;
|
||||
case OP_NULL_CHECK_START:
|
||||
p += SIZE_MEMNUM;
|
||||
if (num_cache_opcodes_at_null_check_start != -1) {
|
||||
// A nested OP_NULL_CHECK_START is not yet supported.
|
||||
goto impossible;
|
||||
}
|
||||
num_cache_opcodes_at_null_check_start = num_cache_opcodes;
|
||||
break;
|
||||
case OP_NULL_CHECK_END:
|
||||
case OP_NULL_CHECK_END_MEMST_PUSH:
|
||||
p += SIZE_MEMNUM;
|
||||
num_cache_opcodes_at_null_check_start = -1;
|
||||
break;
|
||||
case OP_NULL_CHECK_END_MEMST:
|
||||
p += SIZE_MEMNUM;
|
||||
// OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST are counted as cache opcodes.
|
||||
if (num_cache_opcodes_at_null_check_start < num_cache_opcodes) {
|
||||
num_cache_opcodes += 2;
|
||||
}
|
||||
num_cache_opcodes_at_null_check_start = -1;
|
||||
break;
|
||||
|
||||
case OP_PUSH_POS:
|
||||
case OP_POP_POS:
|
||||
if (lookaround_nesting < 0) {
|
||||
// A look-around nested in a atomic grouping is found.
|
||||
goto impossible;
|
||||
}
|
||||
lookaround_nesting++;
|
||||
break;
|
||||
case OP_PUSH_POS_NOT:
|
||||
if (lookaround_nesting < 0) {
|
||||
// A look-around nested in a atomic grouping is found.
|
||||
goto impossible;
|
||||
}
|
||||
p += SIZE_RELADDR;
|
||||
lookaround_nesting++;
|
||||
break;
|
||||
case OP_PUSH_LOOK_BEHIND_NOT:
|
||||
if (lookaround_nesting < 0) {
|
||||
// A look-around nested in a atomic grouping is found.
|
||||
goto impossible;
|
||||
}
|
||||
p += SIZE_RELADDR;
|
||||
p += SIZE_LENGTH;
|
||||
lookaround_nesting++;
|
||||
break;
|
||||
case OP_POP_POS:
|
||||
case OP_FAIL_POS:
|
||||
goto impossible;
|
||||
case OP_FAIL_LOOK_BEHIND_NOT:
|
||||
lookaround_nesting--;
|
||||
break;
|
||||
case OP_PUSH_STOP_BT:
|
||||
if (lookaround_nesting != 0) {
|
||||
// A nested atomic grouping is found.
|
||||
goto impossible;
|
||||
}
|
||||
lookaround_nesting++;
|
||||
lookaround_nesting *= -1;
|
||||
break;
|
||||
case OP_POP_STOP_BT:
|
||||
lookaround_nesting *= -1;
|
||||
lookaround_nesting--;
|
||||
break;
|
||||
case OP_LOOK_BEHIND:
|
||||
case OP_PUSH_LOOK_BEHIND_NOT:
|
||||
case OP_FAIL_LOOK_BEHIND_NOT:
|
||||
p += SIZE_LENGTH;
|
||||
break;
|
||||
|
||||
case OP_PUSH_ABSENT_POS:
|
||||
case OP_ABSENT_END:
|
||||
case OP_ABSENT:
|
||||
|
@ -506,19 +531,34 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
|
|||
MemNumType current_repeat_mem = -1;
|
||||
long cache_point = 0;
|
||||
long num_cache_points_at_repeat = 0;
|
||||
int lookaround_nesting = 0;
|
||||
const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
|
||||
OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
|
||||
UChar* null_check_start_addr = NULL;
|
||||
OnigCacheOpcode* cache_opcodes_at_null_check_start = NULL;
|
||||
|
||||
# define INC_CACHE_OPCODES do {\
|
||||
# define INC_CACHE_OPCODES do {\
|
||||
cache_opcodes->addr = pbegin;\
|
||||
cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\
|
||||
cache_opcodes->outer_repeat_mem = current_repeat_mem;\
|
||||
cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\
|
||||
cache_opcodes->num_cache_points_in_outer_repeat = 0;\
|
||||
cache_point++;\
|
||||
cache_opcodes->lookaround_nesting = lookaround_nesting;\
|
||||
cache_point += lookaround_nesting > 0 ? 2 : 1;\
|
||||
cache_opcodes++;\
|
||||
} while (0)
|
||||
} while (0)
|
||||
|
||||
# define INC_LOOKAROUND_NESTING lookaround_nesting++
|
||||
# define DEC_LOOKAROUND_NESTING do {\
|
||||
UChar* match_addr = p - 1;\
|
||||
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
|
||||
while (\
|
||||
cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
|
||||
cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
|
||||
) {\
|
||||
cache_opcodes_in_lookaround->match_addr = match_addr;\
|
||||
cache_opcodes_in_lookaround--;\
|
||||
}\
|
||||
lookaround_nesting--;\
|
||||
} while (0)
|
||||
|
||||
while (p < pend) {
|
||||
pbegin = p;
|
||||
|
@ -620,7 +660,11 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
|
|||
case OP_MEMORY_END_PUSH_REC:
|
||||
case OP_MEMORY_END:
|
||||
case OP_MEMORY_END_REC:
|
||||
p += SIZE_MEMNUM; break;
|
||||
p += SIZE_MEMNUM;
|
||||
if (lookaround_nesting != 0) {
|
||||
goto unexpected_bytecode_error;
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_KEEP:
|
||||
break;
|
||||
|
@ -679,46 +723,45 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
|
|||
case OP_REPEAT_INC_NG_SG:
|
||||
goto unexpected_bytecode_error;
|
||||
case OP_NULL_CHECK_START:
|
||||
if (cache_opcodes_at_null_check_start != NULL) {
|
||||
goto unexpected_bytecode_error;
|
||||
}
|
||||
p += SIZE_MEMNUM;
|
||||
null_check_start_addr = pbegin;
|
||||
cache_opcodes_at_null_check_start = cache_opcodes;
|
||||
break;
|
||||
case OP_NULL_CHECK_END:
|
||||
case OP_NULL_CHECK_END_MEMST_PUSH:
|
||||
p += SIZE_MEMNUM;
|
||||
cache_opcodes_at_null_check_start = NULL;
|
||||
break;
|
||||
case OP_NULL_CHECK_END_MEMST:
|
||||
p += SIZE_MEMNUM;
|
||||
{
|
||||
if (cache_opcodes_at_null_check_start < cache_opcodes) {
|
||||
*cache_opcodes = *(cache_opcodes - 1);
|
||||
cache_opcodes->addr = pbegin;
|
||||
cache_opcodes++;
|
||||
long num_cache_opcodes_in_null_check = (cache_opcodes - cache_opcodes_at_null_check_start);
|
||||
xmemmove(cache_opcodes_at_null_check_start + 1, cache_opcodes_at_null_check_start, sizeof(OnigCacheOpcode) * num_cache_opcodes_in_null_check);
|
||||
cache_opcodes_at_null_check_start->addr = null_check_start_addr;
|
||||
cache_opcodes++;
|
||||
}
|
||||
cache_opcodes_at_null_check_start = NULL;
|
||||
}
|
||||
break;
|
||||
|
||||
case OP_PUSH_POS:
|
||||
case OP_POP_POS:
|
||||
INC_LOOKAROUND_NESTING;
|
||||
break;
|
||||
case OP_PUSH_POS_NOT:
|
||||
p += SIZE_RELADDR;
|
||||
INC_LOOKAROUND_NESTING;
|
||||
break;
|
||||
case OP_PUSH_LOOK_BEHIND_NOT:
|
||||
p += SIZE_RELADDR;
|
||||
p += SIZE_LENGTH;
|
||||
INC_LOOKAROUND_NESTING;
|
||||
break;
|
||||
case OP_POP_POS:
|
||||
case OP_FAIL_POS:
|
||||
goto unexpected_bytecode_error;
|
||||
case OP_FAIL_LOOK_BEHIND_NOT:
|
||||
DEC_LOOKAROUND_NESTING;
|
||||
break;
|
||||
case OP_PUSH_STOP_BT:
|
||||
INC_LOOKAROUND_NESTING;
|
||||
lookaround_nesting *= -1;
|
||||
break;
|
||||
case OP_POP_STOP_BT:
|
||||
lookaround_nesting *= -1;
|
||||
DEC_LOOKAROUND_NESTING;
|
||||
break;
|
||||
case OP_LOOK_BEHIND:
|
||||
case OP_PUSH_LOOK_BEHIND_NOT:
|
||||
case OP_FAIL_LOOK_BEHIND_NOT:
|
||||
case OP_PUSH_ABSENT_POS:
|
||||
p += SIZE_LENGTH;
|
||||
break;
|
||||
|
||||
case OP_ABSENT_END:
|
||||
case OP_ABSENT:
|
||||
goto unexpected_bytecode_error;
|
||||
|
@ -933,31 +976,33 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from)
|
|||
|
||||
/* stack type */
|
||||
/* used by normal-POP */
|
||||
#define STK_ALT 0x0001
|
||||
#define STK_LOOK_BEHIND_NOT 0x0002
|
||||
#define STK_POS_NOT 0x0003
|
||||
#define STK_ALT 0x0001
|
||||
#define STK_LOOK_BEHIND_NOT 0x0002
|
||||
#define STK_POS_NOT 0x0003
|
||||
/* handled by normal-POP */
|
||||
#define STK_MEM_START 0x0100
|
||||
#define STK_MEM_END 0x8200
|
||||
#define STK_REPEAT_INC 0x0300
|
||||
#define STK_STATE_CHECK_MARK 0x1000
|
||||
#define STK_MEM_START 0x0100
|
||||
#define STK_MEM_END 0x8200
|
||||
#define STK_REPEAT_INC 0x0300
|
||||
#define STK_STATE_CHECK_MARK 0x1000
|
||||
/* avoided by normal-POP */
|
||||
#define STK_NULL_CHECK_START 0x3000
|
||||
#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
|
||||
#define STK_MEM_END_MARK 0x8400
|
||||
#define STK_POS 0x0500 /* used when POP-POS */
|
||||
#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
|
||||
#define STK_REPEAT 0x0700
|
||||
#define STK_CALL_FRAME 0x0800
|
||||
#define STK_RETURN 0x0900
|
||||
#define STK_VOID 0x0a00 /* for fill a blank */
|
||||
#define STK_ABSENT_POS 0x0b00 /* for absent */
|
||||
#define STK_ABSENT 0x0c00 /* absent inner loop marker */
|
||||
#define STK_NULL_CHECK_START 0x3000
|
||||
#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
|
||||
#define STK_MEM_END_MARK 0x8400
|
||||
#define STK_POS 0x0500 /* used when POP-POS */
|
||||
#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
|
||||
#define STK_REPEAT 0x0700
|
||||
#define STK_CALL_FRAME 0x0800
|
||||
#define STK_RETURN 0x0900
|
||||
#define STK_VOID 0x0a00 /* for fill a blank */
|
||||
#define STK_ABSENT_POS 0x0b00 /* for absent */
|
||||
#define STK_ABSENT 0x0c00 /* absent inner loop marker */
|
||||
#define STK_MATCH_CACHE_POINT 0x0d00 /* for the match cache optimization */
|
||||
#define STK_ATOMIC_MATCH_CACHE_POINT 0x0e00
|
||||
|
||||
/* stack type check mask */
|
||||
#define STK_MASK_POP_USED 0x00ff
|
||||
#define STK_MASK_TO_VOID_TARGET 0x10ff
|
||||
#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
|
||||
#define STK_MASK_POP_USED 0x00ff
|
||||
#define STK_MASK_TO_VOID_TARGET 0x10ff
|
||||
#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
|
||||
|
||||
#ifdef USE_MATCH_CACHE
|
||||
#define MATCH_ARG_INIT_MATCH_CACHE(msa) do {\
|
||||
|
@ -1387,6 +1432,15 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
STACK_INC;\
|
||||
} while(0)
|
||||
|
||||
#define STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask) do {\
|
||||
STACK_ENSURE(1);\
|
||||
stk->type = STK_MATCH_CACHE_POINT;\
|
||||
stk->null_check = stk == stk_base ? 0 : (stk-1)->null_check;\
|
||||
stk->u.match_cache_point.index = (match_cache_point_index);\
|
||||
stk->u.match_cache_point.mask = (match_cache_point_mask);\
|
||||
STACK_INC;\
|
||||
} while(0)
|
||||
|
||||
|
||||
#ifdef ONIG_DEBUG
|
||||
# define STACK_BASE_CHECK(p, at) \
|
||||
|
@ -1398,6 +1452,42 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
# define STACK_BASE_CHECK(p, at)
|
||||
#endif
|
||||
|
||||
#ifdef ONIG_DEBUG_MATCH_CACHE
|
||||
# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) fprintf(stderr, "MATCH CACHE: memoize (index=%ld mask=%d)\n", stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);
|
||||
#else
|
||||
# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) ((void) 0)
|
||||
#endif
|
||||
|
||||
#ifdef USE_MATCH_CACHE
|
||||
# define INC_NUM_FAILS msa->num_fails++
|
||||
# define MEMOIZE_MATCH_CACHE_POINT do {\
|
||||
if (stk->type == STK_MATCH_CACHE_POINT) {\
|
||||
msa->match_cache_buf[stk->u.match_cache_point.index] |= stk->u.match_cache_point.mask;\
|
||||
MATCH_CACHE_DEBUG_MEMOIZE(stk);\
|
||||
} else if (stk->type == STK_ATOMIC_MATCH_CACHE_POINT) {\
|
||||
memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
|
||||
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
|
||||
}\
|
||||
} while(0)
|
||||
# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stkp) do {\
|
||||
if (stkp->type == STK_MATCH_CACHE_POINT) {\
|
||||
stkp->type = STK_VOID;\
|
||||
memoize_extended_match_cache_point(msa->match_cache_buf, stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);\
|
||||
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
|
||||
}\
|
||||
} while(0)
|
||||
# define MEMOIZE_ATOMIC_MATCH_CACHE_POINT do {\
|
||||
if (stk->type == STK_MATCH_CACHE_POINT) {\
|
||||
memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
|
||||
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
|
||||
}\
|
||||
} while(0)
|
||||
#else
|
||||
# define INC_NUM_FAILS ((void) 0)
|
||||
# define MEMOIZE_MATCH_CACHE_POINT ((void) 0)
|
||||
# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT ((void) 0)
|
||||
#endif
|
||||
|
||||
#define STACK_POP_ONE do {\
|
||||
stk--;\
|
||||
STACK_BASE_CHECK(stk, "STACK_POP_ONE"); \
|
||||
|
@ -1411,6 +1501,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
STACK_BASE_CHECK(stk, "STACK_POP"); \
|
||||
if ((stk->type & STK_MASK_POP_USED) != 0) break;\
|
||||
ELSE_IF_STATE_CHECK_MARK(stk);\
|
||||
MEMOIZE_MATCH_CACHE_POINT;\
|
||||
}\
|
||||
break;\
|
||||
case STACK_POP_LEVEL_MEM_START:\
|
||||
|
@ -1423,6 +1514,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
|
||||
}\
|
||||
ELSE_IF_STATE_CHECK_MARK(stk);\
|
||||
MEMOIZE_MATCH_CACHE_POINT;\
|
||||
}\
|
||||
break;\
|
||||
default:\
|
||||
|
@ -1442,6 +1534,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
|
||||
}\
|
||||
ELSE_IF_STATE_CHECK_MARK(stk);\
|
||||
MEMOIZE_MATCH_CACHE_POINT;\
|
||||
}\
|
||||
break;\
|
||||
}\
|
||||
|
@ -1463,7 +1556,11 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\
|
||||
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
|
||||
}\
|
||||
else if (IS_TO_VOID_TARGET(stk)) {\
|
||||
INC_NUM_FAILS;\
|
||||
}\
|
||||
ELSE_IF_STATE_CHECK_MARK(stk);\
|
||||
MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stk);\
|
||||
}\
|
||||
} while(0)
|
||||
|
||||
|
@ -1520,12 +1617,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
k--;\
|
||||
STACK_BASE_CHECK(k, "STACK_POS_END"); \
|
||||
if (IS_TO_VOID_TARGET(k)) {\
|
||||
INC_NUM_FAILS;\
|
||||
k->type = STK_VOID;\
|
||||
}\
|
||||
else if (k->type == STK_POS) {\
|
||||
k->type = STK_VOID;\
|
||||
break;\
|
||||
}\
|
||||
MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(k);\
|
||||
}\
|
||||
} while(0)
|
||||
|
||||
|
@ -1535,12 +1634,28 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
k--;\
|
||||
STACK_BASE_CHECK(k, "STACK_STOP_BT_END"); \
|
||||
if (IS_TO_VOID_TARGET(k)) {\
|
||||
INC_NUM_FAILS;\
|
||||
k->type = STK_VOID;\
|
||||
}\
|
||||
else if (k->type == STK_STOP_BT) {\
|
||||
k->type = STK_VOID;\
|
||||
break;\
|
||||
}\
|
||||
else if (k->type == STK_MATCH_CACHE_POINT) {\
|
||||
k->type = STK_ATOMIC_MATCH_CACHE_POINT;\
|
||||
}\
|
||||
}\
|
||||
} while(0)
|
||||
|
||||
#define STACK_STOP_BT_FAIL do {\
|
||||
while (1) {\
|
||||
stk--;\
|
||||
STACK_BASE_CHECK(stk, "STACK_STOP_BT_END"); \
|
||||
if (stk->type == STK_STOP_BT) {\
|
||||
stk->type = STK_VOID;\
|
||||
break;\
|
||||
}\
|
||||
MEMOIZE_ATOMIC_MATCH_CACHE_POINT;\
|
||||
}\
|
||||
} while(0)
|
||||
|
||||
|
@ -1579,7 +1694,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
}\
|
||||
} while(0)
|
||||
|
||||
#define STACK_NULL_CHECK_MEMST(isnull,ischange,id,s,reg) do {\
|
||||
#define STACK_NULL_CHECK_MEMST(isnull,id,s,reg) do {\
|
||||
OnigStackType* k = STACK_AT((stk-1)->null_check)+1;\
|
||||
while (1) {\
|
||||
k--;\
|
||||
|
@ -1596,14 +1711,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
|
|||
while (k < stk) {\
|
||||
if (k->type == STK_MEM_START) {\
|
||||
if (k->u.mem.end == INVALID_STACK_INDEX) {\
|
||||
(isnull) = 0; (ischange) = 1; break;\
|
||||
(isnull) = 0; break;\
|
||||
}\
|
||||
if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\
|
||||
endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\
|
||||
else\
|
||||
endp = (UChar* )k->u.mem.end;\
|
||||
if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\
|
||||
(isnull) = 0; (ischange) = 1; break;\
|
||||
(isnull) = 0; break;\
|
||||
}\
|
||||
else if (endp != s) {\
|
||||
(isnull) = -1; /* empty, but position changed */ \
|
||||
|
@ -2008,6 +2123,7 @@ stack_type_str(int stack_type)
|
|||
case STK_VOID: return "Void ";
|
||||
case STK_ABSENT_POS: return "AbsPos";
|
||||
case STK_ABSENT: return "Absent";
|
||||
case STK_MATCH_CACHE_POINT: return "MCache";
|
||||
default: return " ";
|
||||
}
|
||||
}
|
||||
|
@ -2029,7 +2145,7 @@ bsearch_cache_opcodes(const OnigCacheOpcode *cache_opcodes, long num_cache_opcod
|
|||
}
|
||||
|
||||
static long
|
||||
find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
|
||||
find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk, const OnigCacheOpcode **cache_opcode_ptr)
|
||||
{
|
||||
long m;
|
||||
const OnigCacheOpcode* cache_opcode;
|
||||
|
@ -2048,6 +2164,7 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
|
|||
}
|
||||
|
||||
cache_opcode = &cache_opcodes[m];
|
||||
*cache_opcode_ptr = &cache_opcodes[m];
|
||||
cache_point = cache_opcode->cache_point;
|
||||
if (cache_opcode->outer_repeat_mem == -1) {
|
||||
return cache_point;
|
||||
|
@ -2079,54 +2196,23 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
|
|||
cache_point;
|
||||
}
|
||||
|
||||
static void
|
||||
reset_match_cache_on_null_check(regex_t* reg, const UChar* pstart, const UChar* pend, long pos, uint8_t* match_cache_buf, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, long num_cache_points, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
|
||||
{
|
||||
long cache_point_start, cache_point_end;
|
||||
long match_cache_point_start, match_cache_point_end;
|
||||
long match_cache_point_start_index, match_cache_point_end_index;
|
||||
uint8_t match_cache_point_start_bit, match_cache_point_end_bit;
|
||||
|
||||
#ifdef ONIG_DEBUG_MATCH_CACHE
|
||||
fprintf(stderr, "MATCH CACHE: reset start=%p end=%p pos=%ld cache_opcodes=%p num_cache_opcodes=%ld\n", pstart, pend, pos, cache_opcodes, num_cache_opcodes);
|
||||
#endif
|
||||
|
||||
cache_point_start = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pstart, stk, repeat_stk);
|
||||
cache_point_end = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pend, stk, repeat_stk);
|
||||
// Skip to reset if `pstart` or `pend` is not found.
|
||||
// This situation occurs when there is no cache point
|
||||
// between OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST.
|
||||
if (cache_point_start == -1 || cache_point_end == -1) return;
|
||||
|
||||
match_cache_point_start = pos * num_cache_points + cache_point_start;
|
||||
match_cache_point_end = pos * num_cache_points + cache_point_end;
|
||||
match_cache_point_start_index = match_cache_point_start >> 3;
|
||||
match_cache_point_end_index = match_cache_point_end >> 3;
|
||||
match_cache_point_start_bit = match_cache_point_start & 7;
|
||||
match_cache_point_end_bit = match_cache_point_end & 7;
|
||||
|
||||
#ifdef ONIG_DEBUG_MATCH_CACHE
|
||||
fprintf(stderr, "MATCH CACHE: reset start %ld (%ld index=%ld bit=%d)\n", match_cache_point_start, cache_point_start, match_cache_point_start_index, match_cache_point_start_bit);
|
||||
fprintf(stderr, "MATCH CACHE: reset end %ld (%ld index=%ld bit=%d)\n", match_cache_point_end, cache_point_end, match_cache_point_end_index, match_cache_point_end_bit);
|
||||
#endif
|
||||
|
||||
if (match_cache_point_start_index == match_cache_point_end_index) {
|
||||
match_cache_buf[match_cache_point_start_index] &=
|
||||
(((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1)) |
|
||||
((1 << match_cache_point_start_bit) - 1);
|
||||
static int check_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
|
||||
if (match_cache_point_mask & 0x80) {
|
||||
return (match_cache_buf[match_cache_point_index + 1] & 0x01) > 0;
|
||||
} else {
|
||||
if (match_cache_point_start_bit) {
|
||||
match_cache_buf[match_cache_point_start_index] &= (1 << (match_cache_point_start_bit - 1)) - 1;
|
||||
match_cache_point_start_index++;
|
||||
}
|
||||
if (match_cache_point_start_index < match_cache_point_end_index) {
|
||||
xmemset(&match_cache_buf[match_cache_point_start_index], 0, match_cache_point_end_index - match_cache_point_start_index);
|
||||
if (match_cache_point_end_bit) {
|
||||
match_cache_buf[match_cache_point_end_index] &= (((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1));
|
||||
}
|
||||
}
|
||||
return (match_cache_buf[match_cache_point_index] & (match_cache_point_mask << 1)) > 0;
|
||||
}
|
||||
}
|
||||
|
||||
static void memoize_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
|
||||
match_cache_buf[match_cache_point_index] |= match_cache_point_mask;
|
||||
if (match_cache_point_mask & 0x80) {
|
||||
match_cache_buf[match_cache_point_index + 1] |= 0x01;
|
||||
} else {
|
||||
match_cache_buf[match_cache_point_index] |= match_cache_point_mask << 1;
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* USE_MATCH_CACHE */
|
||||
|
||||
/* match data(str - end) from position (sstart). */
|
||||
|
@ -2447,7 +2533,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
|
|||
|
||||
# define CHECK_MATCH_CACHE do {\
|
||||
if (msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {\
|
||||
long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk);\
|
||||
const OnigCacheOpcode *cache_opcode;\
|
||||
long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk, &cache_opcode);\
|
||||
if (cache_point >= 0) {\
|
||||
long match_cache_point = msa->num_cache_points * (long)(s - str) + cache_point;\
|
||||
long match_cache_point_index = match_cache_point >> 3;\
|
||||
|
@ -2455,9 +2542,21 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
|
|||
MATCH_CACHE_DEBUG;\
|
||||
if (msa->match_cache_buf[match_cache_point_index] & match_cache_point_mask) {\
|
||||
MATCH_CACHE_DEBUG_HIT;\
|
||||
goto fail;\
|
||||
if (cache_opcode->lookaround_nesting == 0) goto fail;\
|
||||
else if (cache_opcode->lookaround_nesting < 0) {\
|
||||
if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
|
||||
STACK_STOP_BT_FAIL;\
|
||||
goto fail;\
|
||||
} else goto fail;\
|
||||
} else {\
|
||||
if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
|
||||
p = cache_opcode->match_addr;\
|
||||
MOP_OUT;\
|
||||
JUMP;\
|
||||
} else goto fail;\
|
||||
}\
|
||||
}\
|
||||
msa->match_cache_buf[match_cache_point_index] |= match_cache_point_mask;\
|
||||
STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask);\
|
||||
}\
|
||||
}\
|
||||
} while (0)
|
||||
|
@ -3538,10 +3637,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
|
|||
CASE(OP_NULL_CHECK_END_MEMST) MOP_IN(OP_NULL_CHECK_END_MEMST);
|
||||
{
|
||||
int isnull;
|
||||
int ischanged = 0; // set 1 when a loop is empty but memory status is changed.
|
||||
|
||||
GET_MEMNUM_INC(mem, p); /* mem: null check id */
|
||||
STACK_NULL_CHECK_MEMST(isnull, ischanged, mem, s, reg);
|
||||
STACK_NULL_CHECK_MEMST(isnull, mem, s, reg);
|
||||
if (isnull) {
|
||||
# ifdef ONIG_DEBUG_MATCH
|
||||
fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIuPTR" (%p)\n",
|
||||
|
@ -3550,31 +3648,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
|
|||
if (isnull == -1) goto fail;
|
||||
goto null_check_found;
|
||||
}
|
||||
# ifdef USE_MATCH_CACHE
|
||||
if (ischanged && msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {
|
||||
RelAddrType rel;
|
||||
OnigUChar *null_check_start;
|
||||
OnigUChar *null_check_end = pbegin;
|
||||
MemNumType mem;
|
||||
UChar* tmp = p;
|
||||
switch (*tmp++) {
|
||||
case OP_JUMP:
|
||||
case OP_PUSH:
|
||||
GET_RELADDR_INC(rel, tmp);
|
||||
null_check_start = tmp + rel;
|
||||
break;
|
||||
case OP_REPEAT_INC:
|
||||
case OP_REPEAT_INC_NG:
|
||||
GET_MEMNUM_INC(mem, tmp);
|
||||
// Skip OP_REPEAT (or OP_REPEAT_NG)
|
||||
null_check_start = STACK_AT(repeat_stk[mem])->u.repeat.pcode;
|
||||
break;
|
||||
default:
|
||||
goto unexpected_bytecode_error;
|
||||
}
|
||||
reset_match_cache_on_null_check(reg, null_check_start, null_check_end, (long)(s - str), msa->match_cache_buf, msa->cache_opcodes, msa->num_cache_opcodes, msa->num_cache_points, stk_base, repeat_stk);
|
||||
}
|
||||
# endif
|
||||
}
|
||||
MOP_OUT;
|
||||
JUMP;
|
||||
|
@ -4002,7 +4075,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
|
|||
fprintf(stderr, "MATCH CACHE: #cache points = %ld\n", msa->num_cache_points);
|
||||
fprintf(stderr, "MATCH CACHE: cache opcodes (%p):\n", msa->cache_opcodes);
|
||||
for (int i = 0; i < msa->num_cache_opcodes; i++) {
|
||||
fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat);
|
||||
fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld lookaround_nesting=%d match_addr=%p\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat, msa->cache_opcodes[i].lookaround_nesting, msa->cache_opcodes[i].match_addr);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
8
regint.h
8
regint.h
|
@ -873,6 +873,12 @@ typedef struct _OnigStackType {
|
|||
UChar *abs_pstr; /* absent start position */
|
||||
const UChar *end_pstr; /* end position */
|
||||
} absent_pos;
|
||||
#ifdef USE_MATCH_CACHE
|
||||
struct {
|
||||
long index; /* index of the match cache buffer */
|
||||
uint8_t mask; /* bit-mask for the match cache buffer */
|
||||
} match_cache_point;
|
||||
#endif
|
||||
} u;
|
||||
} OnigStackType;
|
||||
|
||||
|
@ -883,6 +889,8 @@ typedef struct {
|
|||
int outer_repeat_mem;
|
||||
long num_cache_points_at_outer_repeat;
|
||||
long num_cache_points_in_outer_repeat;
|
||||
int lookaround_nesting;
|
||||
UChar *match_addr;
|
||||
} OnigCacheOpcode;
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1826,7 +1826,6 @@ class TestRegexp < Test::Unit::TestCase
|
|||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
|
||||
assert_nil(/^(a*)*$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
@ -1836,7 +1835,6 @@ class TestRegexp < Test::Unit::TestCase
|
|||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
|
||||
assert_nil(/^a*b?a*$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
@ -1846,8 +1844,61 @@ class TestRegexp < Test::Unit::TestCase
|
|||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/^a*?(?>a*a*)$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
||||
assert_nil(/^(?>a?a?)(a|a)*$/ =~ "a" * 1000000 + "x")
|
||||
def test_match_cache_atomic_complex
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/a*(?>a*)ab/ =~ "a" * 1000000 + "b")
|
||||
end;
|
||||
end
|
||||
|
||||
def test_match_cache_positive_look_ahead
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/^a*?(?=a*a*)$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
||||
def test_match_cache_positive_look_ahead_complex
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_equal(/(?:(?=a*)a)*/ =~ "a" * 1000000, 0)
|
||||
end;
|
||||
end
|
||||
|
||||
def test_match_cache_negative_look_ahead
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/^a*?(?!a*a*)$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
||||
def test_match_cache_positive_look_behind
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/(?<=abc|def)(a|a)*$/ =~ "abc" + "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
||||
def test_match_cache_negative_look_behind
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
Regexp.timeout = timeout
|
||||
assert_nil(/(?<!x)(a|a)*$/ =~ "a" * 1000000 + "x")
|
||||
end;
|
||||
end
|
||||
|
||||
|
@ -1864,7 +1915,7 @@ class TestRegexp < Test::Unit::TestCase
|
|||
assert_equal("10:0:0".match(pattern)[0], "10:0:0")
|
||||
end
|
||||
|
||||
def test_bug_19467
|
||||
def test_bug_19467 # [Bug #19467]
|
||||
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
|
||||
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
|
||||
begin;
|
||||
|
@ -1879,7 +1930,7 @@ class TestRegexp < Test::Unit::TestCase
|
|||
assert_equal("123456789".match(/(?:x?\dx?){2,}/)[0], "123456789")
|
||||
end
|
||||
|
||||
def test_bug_19537
|
||||
def test_bug_19537 # [Bug #19537]
|
||||
str = 'aac'
|
||||
re = '^([ab]{1,3})(a?)*$'
|
||||
100.times do
|
||||
|
@ -1894,6 +1945,9 @@ class TestRegexp < Test::Unit::TestCase
|
|||
assert_not_send [Regexp, :linear_time?, /(a)\1/]
|
||||
assert_not_send [Regexp, :linear_time?, "(a)\\1"]
|
||||
|
||||
assert_not_send [Regexp, :linear_time?, /(?=(a))/]
|
||||
assert_not_send [Regexp, :linear_time?, /(?!(a))/]
|
||||
|
||||
assert_raise(TypeError) {Regexp.linear_time?(nil)}
|
||||
assert_raise(TypeError) {Regexp.linear_time?(Regexp.allocate)}
|
||||
end
|
||||
|
|
Загрузка…
Ссылка в новой задаче