Add function rb_reg_onig_match

rb_reg_onig_match performs preparation, error handling, and cleanup for
matching a regex against a string. This reduces repetitive code and
removes the need for StringScanner to access internal data of regex.
This commit is contained in:
Peter Zhu 2023-07-26 15:57:03 -04:00
Родитель e5effa4bd0
Коммит 7193b404a1
5 изменённых файлов: 185 добавлений и 191 удалений

Просмотреть файл

@ -3,6 +3,7 @@ require 'mkmf'
if RUBY_ENGINE == 'ruby'
$INCFLAGS << " -I$(top_srcdir)" if $extmk
have_func("onig_region_memsize", "ruby.h")
have_func("rb_reg_onig_match", "ruby.h")
create_makefile 'strscan'
else
File.write('Makefile', dummy_makefile("").join)

Просмотреть файл

@ -539,6 +539,68 @@ adjust_register_position(struct strscanner *p, long position)
}
}
/* rb_reg_onig_match is available in Ruby 3.3 and later. */
#ifndef HAVE_RB_REG_ONIG_MATCH
/*
 * Local implementation of rb_reg_onig_match, compiled only when
 * HAVE_RB_REG_ONIG_MATCH is not defined.
 *
 * Prepares `re` for use on `str`, invokes the `match` callback with the
 * prepared pattern buffer, then settles ownership of that buffer.
 *
 *   re    - regexp object to match with
 *   str   - string passed to rb_reg_prepare_re() and on to `match`
 *   match - callback that performs the actual match/search
 *   args  - opaque pointer forwarded unchanged to `match`
 *   regs  - registers forwarded unchanged to `match`
 *
 * Returns the callback's result when it is non-negative or ONIG_MISMATCH.
 * Any other negative result raises ScanError.
 */
static OnigPosition
rb_reg_onig_match(VALUE re, VALUE str,
                  OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args),
                  void *args, struct re_registers *regs)
{
    regex_t *prepared = rb_reg_prepare_re(re, str);
    /* true when rb_reg_prepare_re() handed back the regexp's own buffer */
    const bool shared = (prepared == RREGEXP_PTR(re));
    OnigPosition pos;

    /* usecnt is consulted below before the regexp's own buffer is freed */
    if (shared) {
        RREGEXP(re)->usecnt++;
    }

    pos = match(prepared, str, regs, args);

    if (shared) {
        RREGEXP(re)->usecnt--;
    }
    else if (RREGEXP(re)->usecnt) {
        /* the regexp's own buffer is still counted as in use: drop the temporary */
        onig_free(prepared);
    }
    else {
        /* nobody is using the old buffer: replace it with the freshly prepared one */
        onig_free(RREGEXP_PTR(re));
        RREGEXP_PTR(re) = prepared;
    }

    if (pos < 0 && pos != ONIG_MISMATCH) {
        rb_raise(ScanError, "regexp buffer overflow");
    }

    return pos;
}
#endif
/*
 * Callback for rb_reg_onig_match that runs onig_match() on the scanner
 * passed through `args_ptr`.
 *
 * The `str` argument is not used: the subject comes from match_target() and
 * the bounds from the scanner's current pointer and remaining length.
 * Returns the value of onig_match() unchanged.
 */
static OnigPosition
strscan_match(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr)
{
    struct strscanner *scanner = args_ptr;
    UChar *at = (UChar *)CURPTR(scanner);
    UChar *end = (UChar *)(CURPTR(scanner) + S_RESTLEN(scanner));

    return onig_match(reg, match_target(scanner), end, at, regs, ONIG_OPTION_NONE);
}
/*
 * Callback for rb_reg_onig_match that runs onig_search() on the scanner
 * passed through `args_ptr`.
 *
 * The `str` argument is not used: the subject comes from match_target().
 * The search starts at the scanner's current pointer, and both the end of
 * the subject and the search range are current pointer + remaining length.
 * Returns the value of onig_search() unchanged.
 */
static OnigPosition
strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr)
{
    struct strscanner *scanner = args_ptr;
    UChar *start = (UChar *)CURPTR(scanner);
    UChar *end = (UChar *)(CURPTR(scanner) + S_RESTLEN(scanner));
    UChar *range = (UChar *)(CURPTR(scanner) + S_RESTLEN(scanner));

    return onig_search(reg, match_target(scanner), end, start, range, regs, ONIG_OPTION_NONE);
}
static VALUE
strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly)
{
@ -560,47 +622,14 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly
}
if (RB_TYPE_P(pattern, T_REGEXP)) {
regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
regex_t *re;
long ret;
int tmpreg;
p->regex = pattern;
re = rb_reg_prepare_re(pattern, p->str);
tmpreg = re != RREGEXP_PTR(pattern);
if (!tmpreg) RREGEXP(pattern)->usecnt++;
OnigPosition ret = rb_reg_onig_match(pattern,
p->str,
headonly ? strscan_match : strscan_search,
(void *)p,
&(p->regs));
if (headonly) {
ret = onig_match(re,
match_target(p),
(UChar* )(CURPTR(p) + S_RESTLEN(p)),
(UChar* )CURPTR(p),
&(p->regs),
ONIG_OPTION_NONE);
}
else {
ret = onig_search(re,
match_target(p),
(UChar* )(CURPTR(p) + S_RESTLEN(p)),
(UChar* )CURPTR(p),
(UChar* )(CURPTR(p) + S_RESTLEN(p)),
&(p->regs),
ONIG_OPTION_NONE);
}
if (!tmpreg) RREGEXP(pattern)->usecnt--;
if (tmpreg) {
if (RREGEXP(pattern)->usecnt) {
onig_free(re);
}
else {
onig_free(RREGEXP_PTR(pattern));
RREGEXP_PTR(pattern) = re;
}
}
if (ret == -2) rb_raise(ScanError, "regexp buffer overflow");
if (ret < 0) {
/* not matched */
if (ret == ONIG_MISMATCH) {
return Qnil;
}
}

Просмотреть файл

@ -18,6 +18,7 @@
#include <stdio.h>
#include "ruby/onigmo.h"
#include "ruby/regex.h"
#include "ruby/internal/core/rmatch.h"
#include "ruby/internal/dllexport.h"
@ -105,25 +106,28 @@ long rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int dir);
VALUE rb_reg_quote(VALUE str);
/**
* Exercises various checks and preprocesses so that the given regular
* expression can be applied to the given string. The preprocess here includes
* (but not limited to) for instance encoding conversion.
* Runs a regular expression match using function `match`. Performs preparation,
* error handling, and memory cleanup.
*
* @param[in] re Target regular expression.
* @param[in] str What `re` is about to run on.
* @param[in] match The function to run to match `str` against `re`.
* @param[in] args Pointer to arguments to pass into `match`.
* @param[out] regs Registers on a successful match.
* @exception rb_eArgError `re` does not fit for `str`.
* @exception rb_eEncCompatError `re` and `str` are incompatible.
* @exception rb_eRegexpError `re` is malformed.
* @return A preprocessesed pattern buffer ready to be applied to `str`.
* @note The return value is manages by our GC. Don't free.
* @return Match position on a successful match, `ONIG_MISMATCH` otherwise.
*
* @internal
*
* The return type, `regex_t *`, is defined in `<ruby/onigmo.h>`, _and_
* The type `regex_t *` is defined in `<ruby/onigmo.h>`, _and_
* _conflicts_ with POSIX's `<regex.h>`. We can no longer save the situation
* at this point. Just don't mix the two.
*/
regex_t *rb_reg_prepare_re(VALUE re, VALUE str);
OnigPosition rb_reg_onig_match(VALUE re, VALUE str,
OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args),
void *args, struct re_registers *regs);
/**
* Duplicates a match data. This is roughly the same as `onig_region_copy()`,

242
re.c
Просмотреть файл

@ -1575,8 +1575,8 @@ rb_reg_prepare_enc(VALUE re, VALUE str, int warn)
return enc;
}
regex_t *
rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err)
static regex_t *
rb_reg_prepare_re(VALUE re, VALUE str, onig_errmsg_buffer err)
{
regex_t *reg = RREGEXP_PTR(re);
int r;
@ -1620,11 +1620,40 @@ rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err)
return reg;
}
regex_t *
rb_reg_prepare_re(VALUE re, VALUE str)
OnigPosition
rb_reg_onig_match(VALUE re, VALUE str,
OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args),
void *args, struct re_registers *regs)
{
onig_errmsg_buffer err = "";
return rb_reg_prepare_re0(re, str, err);
regex_t *reg = rb_reg_prepare_re(re, str, err);
bool tmpreg = reg != RREGEXP_PTR(re);
if (!tmpreg) RREGEXP(re)->usecnt++;
OnigPosition result = match(reg, str, regs, args);
if (!tmpreg) RREGEXP(re)->usecnt--;
if (tmpreg) {
if (RREGEXP(re)->usecnt) {
onig_free(reg);
}
else {
onig_free(RREGEXP_PTR(re));
RREGEXP_PTR(re) = reg;
}
}
if (result < 0) {
onig_region_free(regs, 0);
if (result != ONIG_MISMATCH) {
onig_error_code_to_str((UChar*)err, (int)result);
rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
}
}
return result;
}
long
@ -1658,65 +1687,52 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse)
return pos;
}
/* Arguments handed to reg_onig_search() through its opaque `void *` parameter. */
struct reg_onig_search_args {
/* Byte offset from the start of the string at which the search starts. */
long pos;
/* Byte offset from the start of the string passed as the search range. */
long range;
};
/*
 * Callback for rb_reg_onig_match that runs onig_search() over the bytes of
 * `str`.
 *
 * `args_ptr` must point to a struct reg_onig_search_args; its `pos` and
 * `range` members are byte offsets from the start of `str` and are passed as
 * the search start and search range respectively.
 * Returns the value of onig_search() unchanged.
 */
static OnigPosition
reg_onig_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr)
{
    struct reg_onig_search_args *bounds = args_ptr;
    const char *base;
    long nbytes;

    RSTRING_GETMEM(str, base, nbytes);

    UChar *text = (UChar *)base;
    UChar *text_end = (UChar *)(base + nbytes);
    UChar *start = (UChar *)(base + bounds->pos);
    UChar *range = (UChar *)(base + bounds->range);

    return onig_search(reg, text, text_end, start, range, regs, ONIG_OPTION_NONE);
}
/* returns byte offset */
static long
rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_backref_str, VALUE *set_match)
{
long result;
VALUE match;
struct re_registers regi, *regs = &regi;
char *start, *range;
long len;
regex_t *reg;
int tmpreg;
onig_errmsg_buffer err = "";
RSTRING_GETMEM(str, start, len);
range = start;
long len = RSTRING_LEN(str);
if (pos > len || pos < 0) {
rb_backref_set(Qnil);
return -1;
}
reg = rb_reg_prepare_re0(re, str, err);
tmpreg = reg != RREGEXP_PTR(re);
if (!tmpreg) RREGEXP(re)->usecnt++;
struct reg_onig_search_args args = {
.pos = pos,
.range = reverse ? 0 : len,
};
MEMZERO(regs, struct re_registers, 1);
if (!reverse) {
range += len;
}
result = onig_search(reg,
(UChar*)start,
((UChar*)(start + len)),
((UChar*)(start + pos)),
((UChar*)range),
regs, ONIG_OPTION_NONE);
if (!tmpreg) RREGEXP(re)->usecnt--;
if (tmpreg) {
if (RREGEXP(re)->usecnt) {
onig_free(reg);
}
else {
onig_free(RREGEXP_PTR(re));
RREGEXP_PTR(re) = reg;
}
}
if (result < 0) {
if (regs == &regi)
onig_region_free(regs, 0);
if (result == ONIG_MISMATCH) {
rb_backref_set(Qnil);
return result;
}
else {
onig_error_code_to_str((UChar*)err, (int)result);
rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
}
}
VALUE match = match_alloc(rb_cMatch);
struct re_registers *regs = RMATCH_REGS(match);
match = match_alloc(rb_cMatch);
memcpy(RMATCH_REGS(match), regs, sizeof(struct re_registers));
OnigPosition result = rb_reg_onig_match(re, str, reg_onig_search, &args, regs);
if (result == ONIG_MISMATCH) {
rb_backref_set(Qnil);
return ONIG_MISMATCH;
}
if (set_backref_str) {
RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str));
@ -1748,69 +1764,35 @@ rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
return rb_reg_search0(re, str, pos, reverse, 1);
}
bool
rb_reg_start_with_p(VALUE re, VALUE str)
static OnigPosition
reg_onig_match(regex_t *reg, VALUE str, struct re_registers *regs, void *_)
{
long result;
VALUE match;
struct re_registers regi, *regs = &regi;
regex_t *reg;
int tmpreg;
onig_errmsg_buffer err = "";
reg = rb_reg_prepare_re0(re, str, err);
tmpreg = reg != RREGEXP_PTR(re);
if (!tmpreg) RREGEXP(re)->usecnt++;
match = rb_backref_get();
if (!NIL_P(match)) {
if (FL_TEST(match, MATCH_BUSY)) {
match = Qnil;
}
else {
regs = RMATCH_REGS(match);
}
}
if (NIL_P(match)) {
MEMZERO(regs, struct re_registers, 1);
}
const char *ptr;
long len;
RSTRING_GETMEM(str, ptr, len);
result = onig_match(reg,
(UChar*)(ptr),
((UChar*)(ptr + len)),
(UChar*)(ptr),
regs, ONIG_OPTION_NONE);
if (!tmpreg) RREGEXP(re)->usecnt--;
if (tmpreg) {
if (RREGEXP(re)->usecnt) {
onig_free(reg);
}
else {
onig_free(RREGEXP_PTR(re));
RREGEXP_PTR(re) = reg;
}
}
if (result < 0) {
if (regs == &regi)
onig_region_free(regs, 0);
if (result == ONIG_MISMATCH) {
rb_backref_set(Qnil);
return false;
}
else {
onig_error_code_to_str((UChar*)err, (int)result);
rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
}
return onig_match(
reg,
(UChar *)ptr,
(UChar *)(ptr + len),
(UChar *)ptr,
regs,
ONIG_OPTION_NONE);
}
bool
rb_reg_start_with_p(VALUE re, VALUE str)
{
VALUE match = rb_backref_get();
if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) {
match = match_alloc(rb_cMatch);
}
if (NIL_P(match)) {
int err;
match = match_alloc(rb_cMatch);
err = rb_reg_region_copy(RMATCH_REGS(match), regs);
onig_region_free(regs, 0);
if (err) rb_memerror();
struct re_registers *regs = RMATCH_REGS(match);
if (rb_reg_onig_match(re, str, reg_onig_match, NULL, regs) == ONIG_MISMATCH) {
rb_backref_set(Qnil);
return false;
}
RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str));
@ -3784,12 +3766,6 @@ rb_reg_match_m_p(int argc, VALUE *argv, VALUE re)
VALUE
rb_reg_match_p(VALUE re, VALUE str, long pos)
{
regex_t *reg;
onig_errmsg_buffer err = "";
OnigPosition result;
const UChar *start, *end;
int tmpreg;
if (NIL_P(str)) return Qfalse;
str = SYMBOL_P(str) ? rb_sym2str(str) : StringValue(str);
if (pos) {
@ -3804,33 +3780,13 @@ rb_reg_match_p(VALUE re, VALUE str, long pos)
pos = beg - RSTRING_PTR(str);
}
}
reg = rb_reg_prepare_re0(re, str, err);
tmpreg = reg != RREGEXP_PTR(re);
if (!tmpreg) RREGEXP(re)->usecnt++;
start = ((UChar*)RSTRING_PTR(str));
end = start + RSTRING_LEN(str);
result = onig_search(reg, start, end, start + pos, end,
NULL, ONIG_OPTION_NONE);
if (!tmpreg) RREGEXP(re)->usecnt--;
if (tmpreg) {
if (RREGEXP(re)->usecnt) {
onig_free(reg);
}
else {
onig_free(RREGEXP_PTR(re));
RREGEXP_PTR(re) = reg;
}
}
if (result < 0) {
if (result == ONIG_MISMATCH) {
return Qfalse;
}
else {
onig_error_code_to_str((UChar*)err, (int)result);
rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re);
}
}
return Qtrue;
struct reg_onig_search_args args = {
.pos = pos,
.range = RSTRING_LEN(str),
};
return rb_reg_onig_match(re, str, reg_onig_search, &args, NULL) == ONIG_MISMATCH ? Qfalse : Qtrue;
}
/*

Просмотреть файл

@ -888,12 +888,16 @@ onig_region_free(OnigRegion* r, int free_self)
if (r->allocated > 0) {
xfree(r->beg);
xfree(r->end);
r->allocated = 0;
}
#ifdef USE_CAPTURE_HISTORY
history_root_free(r);
#endif
if (free_self) xfree(r);
if (free_self) {
xfree(r);
}
else {
memset(r, 0, sizeof(OnigRegion));
}
}
}