Optimize regexp matching for look-around and atomic groups (#7931)

This commit is contained in:
Hiroya Fujinami 2023-10-30 13:10:42 +09:00 коммит произвёл GitHub
Родитель 13c9cbe09e
Коммит 34cb174800
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
4 изменённых файлов: 306 добавлений и 157 удалений

Просмотреть файл

@ -1273,6 +1273,10 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg)
break;
case ENCLOSE_STOP_BACKTRACK:
/* Disable POP_STOP_BT optimization for simple repeat under the match cache */
/* optimization because the match cache optimization pushes an extra item to */
/* the stack and it breaks the assumption for this optimization. */
#ifndef USE_MATCH_CACHE
if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
QtfrNode* qn = NQTFR(node->target);
tlen = compile_length_tree(qn->target, reg);
@ -1282,8 +1286,11 @@ compile_length_enclose_node(EncloseNode* node, regex_t* reg)
+ SIZE_OP_PUSH + tlen + SIZE_OP_POP + SIZE_OP_JUMP;
}
else {
#endif
len = SIZE_OP_PUSH_STOP_BT + tlen + SIZE_OP_POP_STOP_BT;
#ifndef USE_MATCH_CACHE
}
#endif
break;
case ENCLOSE_CONDITION:
@ -1395,6 +1402,10 @@ compile_enclose_node(EncloseNode* node, regex_t* reg)
break;
case ENCLOSE_STOP_BACKTRACK:
/* Disable POP_STOP_BT optimization for simple repeat under the match cache */
/* optimization because the match cache optimization pushes an extra item to */
/* the stack and it breaks the assumption for this optimization. */
#ifndef USE_MATCH_CACHE
if (IS_ENCLOSE_STOP_BT_SIMPLE_REPEAT(node)) {
QtfrNode* qn = NQTFR(node->target);
r = compile_tree_n_times(qn->target, qn->lower, reg);
@ -1413,12 +1424,15 @@ compile_enclose_node(EncloseNode* node, regex_t* reg)
-((int )SIZE_OP_PUSH + len + (int )SIZE_OP_POP + (int )SIZE_OP_JUMP));
}
else {
#endif
r = add_opcode(reg, OP_PUSH_STOP_BT);
if (r) return r;
r = compile_tree(node->target, reg);
if (r) return r;
r = add_opcode(reg, OP_POP_STOP_BT);
#ifndef USE_MATCH_CACHE
}
#endif
break;
case ENCLOSE_CONDITION:

377
regexec.c
Просмотреть файл

@ -242,9 +242,6 @@ The `Regexp#match` optimization by using a cache.
"cache opcode"
A cachable opcode (e.g. `OP_PUSH`, `OP_REPEAT`, etc).
It is corresponding to some cache points.
`OP_NULL_CHECK_START` and `OP_NULL_CHECK_END_MEMST` are exceptions.
They are counted as cache opcodes, but they have no corresponding cache points.
They are used for resetting a match cache buffer when a null loop is detected.
"cache point"
A cachable point on matching.
@ -258,7 +255,7 @@ We encode a match cache point to an integer value by the following equation:
"match cache point" = "position on input string" * "total number of cache points" + "cache point"
"match cache buffer"
A bit-array for recording (caching) match cache points once arrived.
A bit-array for memoizing (recording) match cache points once backtracked.
*/
/* count the total number of cache opcodes for allocating a match cache buffer. */
@ -272,7 +269,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1;
long num_cache_opcodes = 0;
long num_cache_opcodes_at_null_check_start = -1;
int lookaround_nesting = 0;
while (p < pend) {
switch (*p++) {
@ -326,7 +323,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
break;
case OP_ANYCHAR_STAR:
case OP_ANYCHAR_ML_STAR:
num_cache_opcodes++;break;
num_cache_opcodes++; break;
case OP_ANYCHAR_STAR_PEEK_NEXT:
case OP_ANYCHAR_ML_STAR_PEEK_NEXT:
p++; num_cache_opcodes++; break;
@ -370,7 +367,12 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
case OP_MEMORY_END_PUSH_REC:
case OP_MEMORY_END:
case OP_MEMORY_END_REC:
p += SIZE_MEMNUM; break;
p += SIZE_MEMNUM;
// A memory (capture) in look-around is found.
if (lookaround_nesting != 0) {
goto impossible;
}
break;
case OP_KEEP:
break;
@ -422,37 +424,60 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
goto impossible;
case OP_NULL_CHECK_START:
p += SIZE_MEMNUM;
if (num_cache_opcodes_at_null_check_start != -1) {
// A nested OP_NULL_CHECK_START is not yet supported.
goto impossible;
}
num_cache_opcodes_at_null_check_start = num_cache_opcodes;
break;
case OP_NULL_CHECK_END:
case OP_NULL_CHECK_END_MEMST_PUSH:
p += SIZE_MEMNUM;
num_cache_opcodes_at_null_check_start = -1;
break;
case OP_NULL_CHECK_END_MEMST:
p += SIZE_MEMNUM;
// OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST are counted as cache opcodes.
if (num_cache_opcodes_at_null_check_start < num_cache_opcodes) {
num_cache_opcodes += 2;
}
num_cache_opcodes_at_null_check_start = -1;
break;
case OP_PUSH_POS:
case OP_POP_POS:
if (lookaround_nesting < 0) {
// A look-around nested in a atomic grouping is found.
goto impossible;
}
lookaround_nesting++;
break;
case OP_PUSH_POS_NOT:
if (lookaround_nesting < 0) {
// A look-around nested in a atomic grouping is found.
goto impossible;
}
p += SIZE_RELADDR;
lookaround_nesting++;
break;
case OP_PUSH_LOOK_BEHIND_NOT:
if (lookaround_nesting < 0) {
// A look-around nested in a atomic grouping is found.
goto impossible;
}
p += SIZE_RELADDR;
p += SIZE_LENGTH;
lookaround_nesting++;
break;
case OP_POP_POS:
case OP_FAIL_POS:
goto impossible;
case OP_FAIL_LOOK_BEHIND_NOT:
lookaround_nesting--;
break;
case OP_PUSH_STOP_BT:
if (lookaround_nesting != 0) {
// A nested atomic grouping is found.
goto impossible;
}
lookaround_nesting++;
lookaround_nesting *= -1;
break;
case OP_POP_STOP_BT:
lookaround_nesting *= -1;
lookaround_nesting--;
break;
case OP_LOOK_BEHIND:
case OP_PUSH_LOOK_BEHIND_NOT:
case OP_FAIL_LOOK_BEHIND_NOT:
p += SIZE_LENGTH;
break;
case OP_PUSH_ABSENT_POS:
case OP_ABSENT_END:
case OP_ABSENT:
@ -506,19 +531,34 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
MemNumType current_repeat_mem = -1;
long cache_point = 0;
long num_cache_points_at_repeat = 0;
int lookaround_nesting = 0;
const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
UChar* null_check_start_addr = NULL;
OnigCacheOpcode* cache_opcodes_at_null_check_start = NULL;
# define INC_CACHE_OPCODES do {\
# define INC_CACHE_OPCODES do {\
cache_opcodes->addr = pbegin;\
cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\
cache_opcodes->outer_repeat_mem = current_repeat_mem;\
cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\
cache_opcodes->num_cache_points_in_outer_repeat = 0;\
cache_point++;\
cache_opcodes->lookaround_nesting = lookaround_nesting;\
cache_point += lookaround_nesting > 0 ? 2 : 1;\
cache_opcodes++;\
} while (0)
} while (0)
# define INC_LOOKAROUND_NESTING lookaround_nesting++
# define DEC_LOOKAROUND_NESTING do {\
UChar* match_addr = p - 1;\
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
while (\
cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
) {\
cache_opcodes_in_lookaround->match_addr = match_addr;\
cache_opcodes_in_lookaround--;\
}\
lookaround_nesting--;\
} while (0)
while (p < pend) {
pbegin = p;
@ -620,7 +660,11 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
case OP_MEMORY_END_PUSH_REC:
case OP_MEMORY_END:
case OP_MEMORY_END_REC:
p += SIZE_MEMNUM; break;
p += SIZE_MEMNUM;
if (lookaround_nesting != 0) {
goto unexpected_bytecode_error;
}
break;
case OP_KEEP:
break;
@ -679,46 +723,45 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
case OP_REPEAT_INC_NG_SG:
goto unexpected_bytecode_error;
case OP_NULL_CHECK_START:
if (cache_opcodes_at_null_check_start != NULL) {
goto unexpected_bytecode_error;
}
p += SIZE_MEMNUM;
null_check_start_addr = pbegin;
cache_opcodes_at_null_check_start = cache_opcodes;
break;
case OP_NULL_CHECK_END:
case OP_NULL_CHECK_END_MEMST_PUSH:
p += SIZE_MEMNUM;
cache_opcodes_at_null_check_start = NULL;
break;
case OP_NULL_CHECK_END_MEMST:
p += SIZE_MEMNUM;
{
if (cache_opcodes_at_null_check_start < cache_opcodes) {
*cache_opcodes = *(cache_opcodes - 1);
cache_opcodes->addr = pbegin;
cache_opcodes++;
long num_cache_opcodes_in_null_check = (cache_opcodes - cache_opcodes_at_null_check_start);
xmemmove(cache_opcodes_at_null_check_start + 1, cache_opcodes_at_null_check_start, sizeof(OnigCacheOpcode) * num_cache_opcodes_in_null_check);
cache_opcodes_at_null_check_start->addr = null_check_start_addr;
cache_opcodes++;
}
cache_opcodes_at_null_check_start = NULL;
}
break;
case OP_PUSH_POS:
case OP_POP_POS:
INC_LOOKAROUND_NESTING;
break;
case OP_PUSH_POS_NOT:
p += SIZE_RELADDR;
INC_LOOKAROUND_NESTING;
break;
case OP_PUSH_LOOK_BEHIND_NOT:
p += SIZE_RELADDR;
p += SIZE_LENGTH;
INC_LOOKAROUND_NESTING;
break;
case OP_POP_POS:
case OP_FAIL_POS:
goto unexpected_bytecode_error;
case OP_FAIL_LOOK_BEHIND_NOT:
DEC_LOOKAROUND_NESTING;
break;
case OP_PUSH_STOP_BT:
INC_LOOKAROUND_NESTING;
lookaround_nesting *= -1;
break;
case OP_POP_STOP_BT:
lookaround_nesting *= -1;
DEC_LOOKAROUND_NESTING;
break;
case OP_LOOK_BEHIND:
case OP_PUSH_LOOK_BEHIND_NOT:
case OP_FAIL_LOOK_BEHIND_NOT:
case OP_PUSH_ABSENT_POS:
p += SIZE_LENGTH;
break;
case OP_ABSENT_END:
case OP_ABSENT:
goto unexpected_bytecode_error;
@ -933,31 +976,33 @@ onig_region_copy(OnigRegion* to, const OnigRegion* from)
/* stack type */
/* used by normal-POP */
#define STK_ALT 0x0001
#define STK_LOOK_BEHIND_NOT 0x0002
#define STK_POS_NOT 0x0003
#define STK_ALT 0x0001
#define STK_LOOK_BEHIND_NOT 0x0002
#define STK_POS_NOT 0x0003
/* handled by normal-POP */
#define STK_MEM_START 0x0100
#define STK_MEM_END 0x8200
#define STK_REPEAT_INC 0x0300
#define STK_STATE_CHECK_MARK 0x1000
#define STK_MEM_START 0x0100
#define STK_MEM_END 0x8200
#define STK_REPEAT_INC 0x0300
#define STK_STATE_CHECK_MARK 0x1000
/* avoided by normal-POP */
#define STK_NULL_CHECK_START 0x3000
#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
#define STK_MEM_END_MARK 0x8400
#define STK_POS 0x0500 /* used when POP-POS */
#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
#define STK_REPEAT 0x0700
#define STK_CALL_FRAME 0x0800
#define STK_RETURN 0x0900
#define STK_VOID 0x0a00 /* for fill a blank */
#define STK_ABSENT_POS 0x0b00 /* for absent */
#define STK_ABSENT 0x0c00 /* absent inner loop marker */
#define STK_NULL_CHECK_START 0x3000
#define STK_NULL_CHECK_END 0x5000 /* for recursive call */
#define STK_MEM_END_MARK 0x8400
#define STK_POS 0x0500 /* used when POP-POS */
#define STK_STOP_BT 0x0600 /* mark for "(?>...)" */
#define STK_REPEAT 0x0700
#define STK_CALL_FRAME 0x0800
#define STK_RETURN 0x0900
#define STK_VOID 0x0a00 /* for fill a blank */
#define STK_ABSENT_POS 0x0b00 /* for absent */
#define STK_ABSENT 0x0c00 /* absent inner loop marker */
#define STK_MATCH_CACHE_POINT 0x0d00 /* for the match cache optimization */
#define STK_ATOMIC_MATCH_CACHE_POINT 0x0e00
/* stack type check mask */
#define STK_MASK_POP_USED 0x00ff
#define STK_MASK_TO_VOID_TARGET 0x10ff
#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
#define STK_MASK_POP_USED 0x00ff
#define STK_MASK_TO_VOID_TARGET 0x10ff
#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */
#ifdef USE_MATCH_CACHE
#define MATCH_ARG_INIT_MATCH_CACHE(msa) do {\
@ -1387,6 +1432,15 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
STACK_INC;\
} while(0)
#define STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask) do {\
STACK_ENSURE(1);\
stk->type = STK_MATCH_CACHE_POINT;\
stk->null_check = stk == stk_base ? 0 : (stk-1)->null_check;\
stk->u.match_cache_point.index = (match_cache_point_index);\
stk->u.match_cache_point.mask = (match_cache_point_mask);\
STACK_INC;\
} while(0)
#ifdef ONIG_DEBUG
# define STACK_BASE_CHECK(p, at) \
@ -1398,6 +1452,42 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
# define STACK_BASE_CHECK(p, at)
#endif
#ifdef ONIG_DEBUG_MATCH_CACHE
# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) fprintf(stderr, "MATCH CACHE: memoize (index=%ld mask=%d)\n", stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);
#else
# define MATCH_CACHE_DEBUG_MEMOIZE(stkp) ((void) 0)
#endif
#ifdef USE_MATCH_CACHE
# define INC_NUM_FAILS msa->num_fails++
# define MEMOIZE_MATCH_CACHE_POINT do {\
if (stk->type == STK_MATCH_CACHE_POINT) {\
msa->match_cache_buf[stk->u.match_cache_point.index] |= stk->u.match_cache_point.mask;\
MATCH_CACHE_DEBUG_MEMOIZE(stk);\
} else if (stk->type == STK_ATOMIC_MATCH_CACHE_POINT) {\
memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
}\
} while(0)
# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stkp) do {\
if (stkp->type == STK_MATCH_CACHE_POINT) {\
stkp->type = STK_VOID;\
memoize_extended_match_cache_point(msa->match_cache_buf, stkp->u.match_cache_point.index, stkp->u.match_cache_point.mask);\
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
}\
} while(0)
# define MEMOIZE_ATOMIC_MATCH_CACHE_POINT do {\
if (stk->type == STK_MATCH_CACHE_POINT) {\
memoize_extended_match_cache_point(msa->match_cache_buf, stk->u.match_cache_point.index, stk->u.match_cache_point.mask);\
MATCH_CACHE_DEBUG_MEMOIZE(stkp);\
}\
} while(0)
#else
# define INC_NUM_FAILS ((void) 0)
# define MEMOIZE_MATCH_CACHE_POINT ((void) 0)
# define MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT ((void) 0)
#endif
#define STACK_POP_ONE do {\
stk--;\
STACK_BASE_CHECK(stk, "STACK_POP_ONE"); \
@ -1411,6 +1501,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
STACK_BASE_CHECK(stk, "STACK_POP"); \
if ((stk->type & STK_MASK_POP_USED) != 0) break;\
ELSE_IF_STATE_CHECK_MARK(stk);\
MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
case STACK_POP_LEVEL_MEM_START:\
@ -1423,6 +1514,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
ELSE_IF_STATE_CHECK_MARK(stk);\
MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
default:\
@ -1442,6 +1534,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
ELSE_IF_STATE_CHECK_MARK(stk);\
MEMOIZE_MATCH_CACHE_POINT;\
}\
break;\
}\
@ -1463,7 +1556,11 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\
mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\
}\
else if (IS_TO_VOID_TARGET(stk)) {\
INC_NUM_FAILS;\
}\
ELSE_IF_STATE_CHECK_MARK(stk);\
MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(stk);\
}\
} while(0)
@ -1520,12 +1617,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
k--;\
STACK_BASE_CHECK(k, "STACK_POS_END"); \
if (IS_TO_VOID_TARGET(k)) {\
INC_NUM_FAILS;\
k->type = STK_VOID;\
}\
else if (k->type == STK_POS) {\
k->type = STK_VOID;\
break;\
}\
MEMOIZE_LOOKAROUND_MATCH_CACHE_POINT(k);\
}\
} while(0)
@ -1535,12 +1634,28 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
k--;\
STACK_BASE_CHECK(k, "STACK_STOP_BT_END"); \
if (IS_TO_VOID_TARGET(k)) {\
INC_NUM_FAILS;\
k->type = STK_VOID;\
}\
else if (k->type == STK_STOP_BT) {\
k->type = STK_VOID;\
break;\
}\
else if (k->type == STK_MATCH_CACHE_POINT) {\
k->type = STK_ATOMIC_MATCH_CACHE_POINT;\
}\
}\
} while(0)
#define STACK_STOP_BT_FAIL do {\
while (1) {\
stk--;\
STACK_BASE_CHECK(stk, "STACK_STOP_BT_END"); \
if (stk->type == STK_STOP_BT) {\
stk->type = STK_VOID;\
break;\
}\
MEMOIZE_ATOMIC_MATCH_CACHE_POINT;\
}\
} while(0)
@ -1579,7 +1694,7 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
}\
} while(0)
#define STACK_NULL_CHECK_MEMST(isnull,ischange,id,s,reg) do {\
#define STACK_NULL_CHECK_MEMST(isnull,id,s,reg) do {\
OnigStackType* k = STACK_AT((stk-1)->null_check)+1;\
while (1) {\
k--;\
@ -1596,14 +1711,14 @@ stack_double(OnigStackType** arg_stk_base, OnigStackType** arg_stk_end,
while (k < stk) {\
if (k->type == STK_MEM_START) {\
if (k->u.mem.end == INVALID_STACK_INDEX) {\
(isnull) = 0; (ischange) = 1; break;\
(isnull) = 0; break;\
}\
if (BIT_STATUS_AT(reg->bt_mem_end, k->u.mem.num))\
endp = STACK_AT(k->u.mem.end)->u.mem.pstr;\
else\
endp = (UChar* )k->u.mem.end;\
if (STACK_AT(k->u.mem.start)->u.mem.pstr != endp) {\
(isnull) = 0; (ischange) = 1; break;\
(isnull) = 0; break;\
}\
else if (endp != s) {\
(isnull) = -1; /* empty, but position changed */ \
@ -2008,6 +2123,7 @@ stack_type_str(int stack_type)
case STK_VOID: return "Void ";
case STK_ABSENT_POS: return "AbsPos";
case STK_ABSENT: return "Absent";
case STK_MATCH_CACHE_POINT: return "MCache";
default: return " ";
}
}
@ -2029,7 +2145,7 @@ bsearch_cache_opcodes(const OnigCacheOpcode *cache_opcodes, long num_cache_opcod
}
static long
find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, const UChar* p, const OnigStackType *stk, const OnigStackIndex *repeat_stk, const OnigCacheOpcode **cache_opcode_ptr)
{
long m;
const OnigCacheOpcode* cache_opcode;
@ -2048,6 +2164,7 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
}
cache_opcode = &cache_opcodes[m];
*cache_opcode_ptr = &cache_opcodes[m];
cache_point = cache_opcode->cache_point;
if (cache_opcode->outer_repeat_mem == -1) {
return cache_point;
@ -2079,54 +2196,23 @@ find_cache_point(regex_t* reg, const OnigCacheOpcode* cache_opcodes, long num_ca
cache_point;
}
static void
reset_match_cache_on_null_check(regex_t* reg, const UChar* pstart, const UChar* pend, long pos, uint8_t* match_cache_buf, const OnigCacheOpcode* cache_opcodes, long num_cache_opcodes, long num_cache_points, const OnigStackType *stk, const OnigStackIndex *repeat_stk)
{
long cache_point_start, cache_point_end;
long match_cache_point_start, match_cache_point_end;
long match_cache_point_start_index, match_cache_point_end_index;
uint8_t match_cache_point_start_bit, match_cache_point_end_bit;
#ifdef ONIG_DEBUG_MATCH_CACHE
fprintf(stderr, "MATCH CACHE: reset start=%p end=%p pos=%ld cache_opcodes=%p num_cache_opcodes=%ld\n", pstart, pend, pos, cache_opcodes, num_cache_opcodes);
#endif
cache_point_start = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pstart, stk, repeat_stk);
cache_point_end = find_cache_point(reg, cache_opcodes, num_cache_opcodes, pend, stk, repeat_stk);
// Skip to reset if `pstart` or `pend` is not found.
// This situation occurs when there is no cache point
// between OP_NULL_CHECK_START and OP_NULL_CHECK_END_MEMST.
if (cache_point_start == -1 || cache_point_end == -1) return;
match_cache_point_start = pos * num_cache_points + cache_point_start;
match_cache_point_end = pos * num_cache_points + cache_point_end;
match_cache_point_start_index = match_cache_point_start >> 3;
match_cache_point_end_index = match_cache_point_end >> 3;
match_cache_point_start_bit = match_cache_point_start & 7;
match_cache_point_end_bit = match_cache_point_end & 7;
#ifdef ONIG_DEBUG_MATCH_CACHE
fprintf(stderr, "MATCH CACHE: reset start %ld (%ld index=%ld bit=%d)\n", match_cache_point_start, cache_point_start, match_cache_point_start_index, match_cache_point_start_bit);
fprintf(stderr, "MATCH CACHE: reset end %ld (%ld index=%ld bit=%d)\n", match_cache_point_end, cache_point_end, match_cache_point_end_index, match_cache_point_end_bit);
#endif
if (match_cache_point_start_index == match_cache_point_end_index) {
match_cache_buf[match_cache_point_start_index] &=
(((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1)) |
((1 << match_cache_point_start_bit) - 1);
static int check_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
if (match_cache_point_mask & 0x80) {
return (match_cache_buf[match_cache_point_index + 1] & 0x01) > 0;
} else {
if (match_cache_point_start_bit) {
match_cache_buf[match_cache_point_start_index] &= (1 << (match_cache_point_start_bit - 1)) - 1;
match_cache_point_start_index++;
}
if (match_cache_point_start_index < match_cache_point_end_index) {
xmemset(&match_cache_buf[match_cache_point_start_index], 0, match_cache_point_end_index - match_cache_point_start_index);
if (match_cache_point_end_bit) {
match_cache_buf[match_cache_point_end_index] &= (((1 << (8 - match_cache_point_end_bit - 1)) - 1) << (match_cache_point_end_bit + 1));
}
}
return (match_cache_buf[match_cache_point_index] & (match_cache_point_mask << 1)) > 0;
}
}
static void memoize_extended_match_cache_point(uint8_t *match_cache_buf, long match_cache_point_index, uint8_t match_cache_point_mask) {
match_cache_buf[match_cache_point_index] |= match_cache_point_mask;
if (match_cache_point_mask & 0x80) {
match_cache_buf[match_cache_point_index + 1] |= 0x01;
} else {
match_cache_buf[match_cache_point_index] |= match_cache_point_mask << 1;
}
}
#endif /* USE_MATCH_CACHE */
/* match data(str - end) from position (sstart). */
@ -2447,7 +2533,8 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
# define CHECK_MATCH_CACHE do {\
if (msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {\
long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk);\
const OnigCacheOpcode *cache_opcode;\
long cache_point = find_cache_point(reg, msa->cache_opcodes, msa->num_cache_opcodes, pbegin, stk_base, repeat_stk, &cache_opcode);\
if (cache_point >= 0) {\
long match_cache_point = msa->num_cache_points * (long)(s - str) + cache_point;\
long match_cache_point_index = match_cache_point >> 3;\
@ -2455,9 +2542,21 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
MATCH_CACHE_DEBUG;\
if (msa->match_cache_buf[match_cache_point_index] & match_cache_point_mask) {\
MATCH_CACHE_DEBUG_HIT;\
goto fail;\
if (cache_opcode->lookaround_nesting == 0) goto fail;\
else if (cache_opcode->lookaround_nesting < 0) {\
if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
STACK_STOP_BT_FAIL;\
goto fail;\
} else goto fail;\
} else {\
if (check_extended_match_cache_point(msa->match_cache_buf, match_cache_point_index, match_cache_point_mask)) {\
p = cache_opcode->match_addr;\
MOP_OUT;\
JUMP;\
} else goto fail;\
}\
}\
msa->match_cache_buf[match_cache_point_index] |= match_cache_point_mask;\
STACK_PUSH_MATCH_CACHE_POINT(match_cache_point_index, match_cache_point_mask);\
}\
}\
} while (0)
@ -3538,10 +3637,9 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
CASE(OP_NULL_CHECK_END_MEMST) MOP_IN(OP_NULL_CHECK_END_MEMST);
{
int isnull;
int ischanged = 0; // set 1 when a loop is empty but memory status is changed.
GET_MEMNUM_INC(mem, p); /* mem: null check id */
STACK_NULL_CHECK_MEMST(isnull, ischanged, mem, s, reg);
STACK_NULL_CHECK_MEMST(isnull, mem, s, reg);
if (isnull) {
# ifdef ONIG_DEBUG_MATCH
fprintf(stderr, "NULL_CHECK_END_MEMST: skip id:%d, s:%"PRIuPTR" (%p)\n",
@ -3550,31 +3648,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
if (isnull == -1) goto fail;
goto null_check_found;
}
# ifdef USE_MATCH_CACHE
if (ischanged && msa->match_cache_status == MATCH_CACHE_STATUS_ENABLED) {
RelAddrType rel;
OnigUChar *null_check_start;
OnigUChar *null_check_end = pbegin;
MemNumType mem;
UChar* tmp = p;
switch (*tmp++) {
case OP_JUMP:
case OP_PUSH:
GET_RELADDR_INC(rel, tmp);
null_check_start = tmp + rel;
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
GET_MEMNUM_INC(mem, tmp);
// Skip OP_REPEAT (or OP_REPEAT_NG)
null_check_start = STACK_AT(repeat_stk[mem])->u.repeat.pcode;
break;
default:
goto unexpected_bytecode_error;
}
reset_match_cache_on_null_check(reg, null_check_start, null_check_end, (long)(s - str), msa->match_cache_buf, msa->cache_opcodes, msa->num_cache_opcodes, msa->num_cache_points, stk_base, repeat_stk);
}
# endif
}
MOP_OUT;
JUMP;
@ -4002,7 +4075,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
fprintf(stderr, "MATCH CACHE: #cache points = %ld\n", msa->num_cache_points);
fprintf(stderr, "MATCH CACHE: cache opcodes (%p):\n", msa->cache_opcodes);
for (int i = 0; i < msa->num_cache_opcodes; i++) {
fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat);
fprintf(stderr, "MATCH CACHE: [%p] cache_point=%ld outer_repeat_mem=%d num_cache_opcodes_at_outer_repeat=%ld num_cache_opcodes_in_outer_repeat=%ld lookaround_nesting=%d match_addr=%p\n", msa->cache_opcodes[i].addr, msa->cache_opcodes[i].cache_point, msa->cache_opcodes[i].outer_repeat_mem, msa->cache_opcodes[i].num_cache_points_at_outer_repeat, msa->cache_opcodes[i].num_cache_points_in_outer_repeat, msa->cache_opcodes[i].lookaround_nesting, msa->cache_opcodes[i].match_addr);
}
#endif
}

Просмотреть файл

@ -873,6 +873,12 @@ typedef struct _OnigStackType {
UChar *abs_pstr; /* absent start position */
const UChar *end_pstr; /* end position */
} absent_pos;
#ifdef USE_MATCH_CACHE
struct {
long index; /* index of the match cache buffer */
uint8_t mask; /* bit-mask for the match cache buffer */
} match_cache_point;
#endif
} u;
} OnigStackType;
@ -883,6 +889,8 @@ typedef struct {
int outer_repeat_mem;
long num_cache_points_at_outer_repeat;
long num_cache_points_in_outer_repeat;
int lookaround_nesting;
UChar *match_addr;
} OnigCacheOpcode;
#endif

Просмотреть файл

@ -1826,7 +1826,6 @@ class TestRegexp < Test::Unit::TestCase
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/^(a*)*$/ =~ "a" * 1000000 + "x")
end;
end
@ -1836,7 +1835,6 @@ class TestRegexp < Test::Unit::TestCase
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/^a*b?a*$/ =~ "a" * 1000000 + "x")
end;
end
@ -1846,8 +1844,61 @@ class TestRegexp < Test::Unit::TestCase
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/^a*?(?>a*a*)$/ =~ "a" * 1000000 + "x")
end;
end
assert_nil(/^(?>a?a?)(a|a)*$/ =~ "a" * 1000000 + "x")
def test_match_cache_atomic_complex
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/a*(?>a*)ab/ =~ "a" * 1000000 + "b")
end;
end
def test_match_cache_positive_look_ahead
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/^a*?(?=a*a*)$/ =~ "a" * 1000000 + "x")
end;
end
def test_match_cache_positive_look_ahead_complex
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_equal(/(?:(?=a*)a)*/ =~ "a" * 1000000, 0)
end;
end
def test_match_cache_negative_look_ahead
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/^a*?(?!a*a*)$/ =~ "a" * 1000000 + "x")
end;
end
def test_match_cache_positive_look_behind
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/(?<=abc|def)(a|a)*$/ =~ "abc" + "a" * 1000000 + "x")
end;
end
def test_match_cache_negative_look_behind
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
Regexp.timeout = timeout
assert_nil(/(?<!x)(a|a)*$/ =~ "a" * 1000000 + "x")
end;
end
@ -1864,7 +1915,7 @@ class TestRegexp < Test::Unit::TestCase
assert_equal("10:0:0".match(pattern)[0], "10:0:0")
end
def test_bug_19467
def test_bug_19467 # [Bug #19467]
assert_separately([], "#{<<-"begin;"}\n#{<<-'end;'}")
timeout = #{ EnvUtil.apply_timeout_scale(10).inspect }
begin;
@ -1879,7 +1930,7 @@ class TestRegexp < Test::Unit::TestCase
assert_equal("123456789".match(/(?:x?\dx?){2,}/)[0], "123456789")
end
def test_bug_19537
def test_bug_19537 # [Bug #19537]
str = 'aac'
re = '^([ab]{1,3})(a?)*$'
100.times do
@ -1894,6 +1945,9 @@ class TestRegexp < Test::Unit::TestCase
assert_not_send [Regexp, :linear_time?, /(a)\1/]
assert_not_send [Regexp, :linear_time?, "(a)\\1"]
assert_not_send [Regexp, :linear_time?, /(?=(a))/]
assert_not_send [Regexp, :linear_time?, /(?!(a))/]
assert_raise(TypeError) {Regexp.linear_time?(nil)}
assert_raise(TypeError) {Regexp.linear_time?(Regexp.allocate)}
end