merge revision(s) 3e6e3ca2627b1aa71b17de902cc1b8188246a828: [Backport #20207] (#10299)

Correctly handle consecutive lookarounds (#9738)

	Fix [Bug #20207]
	Fix [Bug #20212]

	Handling consecutive lookarounds in init_cache_opcodes is buggy, so it
	causes invalid memory access reported in [Bug #20207] and [Bug #20212].
	This fixes it by using recursive functions to detected lookarounds
	nesting correctly.
This commit is contained in:
NARUSE, Yui 2024-03-21 02:13:59 +09:00 коммит произвёл GitHub
Родитель c626c201e4
Коммит 00cb72157a
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
3 изменённых файлов: 151 добавлений и 86 удалений

223
regexec.c
Просмотреть файл

@ -258,18 +258,19 @@ We encode a match cache point to an integer value by the following equation:
A bit-array for memoizing (recording) match cache points once backtracked.
*/
/* count the total number of cache opcodes for allocating a match cache buffer. */
static OnigPosition
count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
static OnigPosition count_num_cache_opcodes_inner(
const regex_t* reg,
MemNumType current_repeat_mem, int lookaround_nesting,
UChar** pp, long* num_cache_opcodes_ptr
)
{
UChar* p = reg->p;
UChar* pend = p + reg->used;
UChar* p = *pp;
UChar* pend = reg->p + reg->used;
LengthType len;
MemNumType repeat_mem;
OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1;
long num_cache_opcodes = 0;
int lookaround_nesting = 0;
long num_cache_opcodes = *num_cache_opcodes_ptr;
OnigPosition result;
while (p < pend) {
switch (*p++) {
@ -402,7 +403,16 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
if (reg->repeat_range[repeat_mem].lower == 0) {
num_cache_opcodes++;
}
current_repeat_mem = repeat_mem;
result = count_num_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
{
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
num_cache_opcodes++;
}
}
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
@ -411,14 +421,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A lone or invalid OP_REPEAT_INC is found.
goto impossible;
}
{
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
num_cache_opcodes++;
}
current_repeat_mem = -1;
}
break;
goto exit;
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
goto impossible;
@ -438,7 +441,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A look-around nested in a atomic grouping is found.
goto impossible;
}
lookaround_nesting++;
result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break;
case OP_PUSH_POS_NOT:
if (lookaround_nesting < 0) {
@ -446,7 +452,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
goto impossible;
}
p += SIZE_RELADDR;
lookaround_nesting++;
result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break;
case OP_PUSH_LOOK_BEHIND_NOT:
if (lookaround_nesting < 0) {
@ -455,25 +464,26 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
}
p += SIZE_RELADDR;
p += SIZE_LENGTH;
lookaround_nesting++;
break;
case OP_POP_POS:
case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT:
lookaround_nesting--;
result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break;
case OP_PUSH_STOP_BT:
if (lookaround_nesting != 0) {
// A nested atomic grouping is found.
goto impossible;
}
lookaround_nesting++;
lookaround_nesting *= -1;
result = count_num_cache_opcodes_inner(reg, current_repeat_mem, -1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break;
case OP_POP_POS:
case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT:
case OP_POP_STOP_BT:
lookaround_nesting *= -1;
lookaround_nesting--;
break;
goto exit;
case OP_LOOK_BEHIND:
p += SIZE_LENGTH;
break;
@ -507,9 +517,15 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
}
}
exit:
*pp = p;
*num_cache_opcodes_ptr = num_cache_opcodes;
return 0;
fail:
*num_cache_opcodes_ptr = num_cache_opcodes;
return result;
impossible:
*num_cache_opcodes_ptr = NUM_CACHE_OPCODES_IMPOSSIBLE;
return 0;
@ -518,48 +534,49 @@ bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE;
}
/* collect cache opcodes from the given regex program, and compute the total number of cache points. */
/* count the total number of cache opcodes for allocating a match cache buffer. */
static OnigPosition
init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num_cache_points_ptr)
count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
{
UChar* pbegin;
UChar* p = reg->p;
UChar* pend = p + reg->used;
*num_cache_opcodes_ptr = 0;
OnigPosition result = count_num_cache_opcodes_inner(reg, -1, 0, &p, num_cache_opcodes_ptr);
if (result == 0 && *num_cache_opcodes_ptr >= 0 && p != reg->p + reg->used) {
return ONIGERR_UNDEFINED_BYTECODE;
}
return result;
}
static OnigPosition
init_cache_opcodes_inner(
const regex_t* reg,
MemNumType current_repeat_mem, int lookaround_nesting,
OnigCacheOpcode** cache_opcodes_ptr, UChar** pp, long* num_cache_points_ptr
)
{
UChar* p = *pp;
UChar* pend = reg->p + reg->used;
UChar* pbegin;
LengthType len;
MemNumType repeat_mem;
OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1;
long cache_point = 0;
long num_cache_points_at_repeat = 0;
int lookaround_nesting = 0;
const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
long cache_point = *num_cache_points_ptr;
OnigCacheOpcode *cache_opcodes = *cache_opcodes_ptr;
OnigPosition result;
# define INC_CACHE_OPCODES do {\
cache_opcodes->addr = pbegin;\
cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\
cache_opcodes->cache_point = cache_point;\
cache_opcodes->outer_repeat_mem = current_repeat_mem;\
cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\
cache_opcodes->num_cache_points_at_outer_repeat = 0;\
cache_opcodes->num_cache_points_in_outer_repeat = 0;\
cache_opcodes->lookaround_nesting = lookaround_nesting;\
cache_opcodes->match_addr = NULL;\
cache_point += lookaround_nesting != 0 ? 2 : 1;\
cache_opcodes++;\
} while (0)
# define INC_LOOKAROUND_NESTING lookaround_nesting++
# define DEC_LOOKAROUND_NESTING do {\
UChar* match_addr = p - 1;\
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
while (\
cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
) {\
cache_opcodes_in_lookaround->match_addr = match_addr;\
cache_opcodes_in_lookaround--;\
}\
lookaround_nesting--;\
} while (0)
while (p < pend) {
pbegin = p;
switch (*p++) {
@ -692,33 +709,31 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
if (reg->repeat_range[repeat_mem].lower == 0) {
INC_CACHE_OPCODES;
}
current_repeat_mem = repeat_mem;
num_cache_points_at_repeat = cache_point;
cache_opcodes_at_repeat = cache_opcodes;
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
GET_MEMNUM_INC(repeat_mem, p);
{
long num_cache_points_in_repeat = cache_point - num_cache_points_at_repeat;
long num_cache_points_in_repeat = 0;
long num_cache_points_at_repeat = cache_point;
OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes;
result = init_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &cache_opcodes, &p, &num_cache_points_in_repeat);
if (result != 0) {
goto fail;
}
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
INC_CACHE_OPCODES;
cache_point -= lookaround_nesting != 0 ? 2 : 1;
}
cache_point -= num_cache_points_in_repeat;
int repeat_bounds = repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower;
cache_point += num_cache_points_in_repeat * repeat_range->lower + (num_cache_points_in_repeat + (lookaround_nesting != 0 ? 2 : 1)) * repeat_bounds;
OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes - 1;
while (cache_opcodes_at_repeat <= cache_opcodes_in_repeat) {
for (; cache_opcodes_in_repeat < cache_opcodes; cache_opcodes_in_repeat++) {
cache_opcodes_in_repeat->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;
cache_opcodes_in_repeat->num_cache_points_in_outer_repeat = num_cache_points_in_repeat;
cache_opcodes_in_repeat--;
}
current_repeat_mem = -1;
num_cache_points_at_repeat = 0;
cache_opcodes_at_repeat = NULL;
}
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
p += SIZE_MEMNUM;
goto exit;
case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG:
goto unexpected_bytecode_error;
@ -734,33 +749,51 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
break;
case OP_PUSH_POS:
INC_LOOKAROUND_NESTING;
lookaround:
{
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes;
result = init_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &cache_opcodes, &p, &cache_point);
if (result != 0) {
goto fail;
}
UChar* match_addr = p - 1;
for (; cache_opcodes_in_lookaround < cache_opcodes; cache_opcodes_in_lookaround++) {
if (cache_opcodes_in_lookaround->match_addr == NULL) {
cache_opcodes_in_lookaround->match_addr = match_addr;
}
}
}
break;
case OP_PUSH_POS_NOT:
p += SIZE_RELADDR;
INC_LOOKAROUND_NESTING;
break;
goto lookaround;
case OP_PUSH_LOOK_BEHIND_NOT:
p += SIZE_RELADDR;
p += SIZE_LENGTH;
INC_LOOKAROUND_NESTING;
goto lookaround;
case OP_PUSH_STOP_BT:
{
OnigCacheOpcode* cache_opcodes_in_atomic = cache_opcodes;
result = init_cache_opcodes_inner(reg, current_repeat_mem, -1, &cache_opcodes, &p, &cache_point);
if (result != 0) {
goto fail;
}
UChar* match_addr = p - 1;
for (; cache_opcodes_in_atomic < cache_opcodes; cache_opcodes_in_atomic++) {
if (cache_opcodes_in_atomic->match_addr == NULL) {
cache_opcodes_in_atomic->match_addr = match_addr;
}
}
}
break;
case OP_POP_POS:
case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT:
DEC_LOOKAROUND_NESTING;
break;
case OP_PUSH_STOP_BT:
INC_LOOKAROUND_NESTING;
lookaround_nesting *= -1;
break;
case OP_POP_STOP_BT:
lookaround_nesting *= -1;
DEC_LOOKAROUND_NESTING;
break;
goto exit;
case OP_LOOK_BEHIND:
p += SIZE_LENGTH;
break;
break;
case OP_ABSENT_END:
case OP_ABSENT:
@ -790,15 +823,35 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
}
}
exit:
*cache_opcodes_ptr = cache_opcodes;
*pp = p;
*num_cache_points_ptr = cache_point;
return 0;
fail:
return result;
unexpected_bytecode_error:
return ONIGERR_UNEXPECTED_BYTECODE;
bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE;
}
/* collect cache opcodes from the given regex program, and compute the total number of cache points. */
static OnigPosition
init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes_ptr, long* num_cache_points_ptr)
{
UChar* p = reg->p;
*num_cache_points_ptr = 0;
OnigPosition result = init_cache_opcodes_inner(reg, -1, 0, &cache_opcodes_ptr, &p, num_cache_points_ptr);
if (result == 0 && p != reg->p + reg->used) {
return ONIGERR_UNDEFINED_BYTECODE;
}
return result;
}
#else
static OnigPosition
count_num_cache_opcodes(regex_t* reg, long* num_cache_opcodes)

Просмотреть файл

@ -2033,6 +2033,18 @@ class TestRegexp < Test::Unit::TestCase
assert /^(?:.+){2,4}?b|b/.match? "aaaabaa"
end
def test_bug_20207 # [Bug #20207]
assert(!'clan'.match?(/(?=.*a)(?!.*n)/))
end
def test_bug_20212 # [Bug #20212]
regex = Regexp.new(
/\A((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+)\Z/x
)
string = "www.google.com"
100.times.each { assert(regex.match?(string)) }
end
def test_linear_time_p
assert_send [Regexp, :linear_time?, /a/]
assert_send [Regexp, :linear_time?, 'a']

Просмотреть файл

@ -11,7 +11,7 @@
# define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR
#define RUBY_VERSION_TEENY 0
#define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR
#define RUBY_PATCHLEVEL 16
#define RUBY_PATCHLEVEL 17
#include "ruby/version.h"
#include "ruby/internal/abi.h"