From 7193b404a1a56e50f8046d0382914907020c1559 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Wed, 26 Jul 2023 15:57:03 -0400 Subject: [PATCH] Add function rb_reg_onig_match rb_reg_onig_match performs preparation, error handling, and cleanup for matching a regex against a string. This reduces repetitive code and removes the need for StringScanner to access internal data of regex. --- ext/strscan/extconf.rb | 1 + ext/strscan/strscan.c | 107 +++++++++++------- include/ruby/re.h | 18 +-- re.c | 242 +++++++++++++++++------------------------ regexec.c | 8 +- 5 files changed, 185 insertions(+), 191 deletions(-) diff --git a/ext/strscan/extconf.rb b/ext/strscan/extconf.rb index b53b63e455..bd65606a4e 100644 --- a/ext/strscan/extconf.rb +++ b/ext/strscan/extconf.rb @@ -3,6 +3,7 @@ require 'mkmf' if RUBY_ENGINE == 'ruby' $INCFLAGS << " -I$(top_srcdir)" if $extmk have_func("onig_region_memsize", "ruby.h") + have_func("rb_reg_onig_match", "ruby.h") create_makefile 'strscan' else File.write('Makefile', dummy_makefile("").join) diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 9f24287e35..c8e8ef6be9 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -539,6 +539,68 @@ adjust_register_position(struct strscanner *p, long position) } } +/* rb_reg_onig_match is available in Ruby 3.3 and later. */ +#ifndef HAVE_RB_REG_ONIG_MATCH +static OnigPosition +rb_reg_onig_match(VALUE re, VALUE str, + OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), + void *args, struct re_registers *regs) +{ + regex_t *reg = rb_reg_prepare_re(re, str); + + bool tmpreg = reg != RREGEXP_PTR(re); + if (!tmpreg) RREGEXP(re)->usecnt++; + + OnigPosition result = match(reg, str, regs, args); + + if (!tmpreg) RREGEXP(re)->usecnt--; + if (tmpreg) { + if (RREGEXP(re)->usecnt) { + onig_free(reg); + } + else { + onig_free(RREGEXP_PTR(re)); + RREGEXP_PTR(re) = reg; + } + } + + if (result < 0) { + if (result != ONIG_MISMATCH) { + rb_raise(ScanError, "regexp buffer overflow"); + } + } + + return result; +} +#endif + +static OnigPosition +strscan_match(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_match(reg, + match_target(p), + (UChar* )(CURPTR(p) + S_RESTLEN(p)), + (UChar* )CURPTR(p), + regs, + ONIG_OPTION_NONE); +} + +static OnigPosition +strscan_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct strscanner *p = (struct strscanner *)args_ptr; + + return onig_search(reg, + match_target(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + (UChar *)CURPTR(p), + (UChar *)(CURPTR(p) + S_RESTLEN(p)), + regs, + ONIG_OPTION_NONE); +} + static VALUE strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly) { @@ -560,47 +622,14 @@ strscan_do_scan(VALUE self, VALUE pattern, int succptr, int getstr, int headonly } if (RB_TYPE_P(pattern, T_REGEXP)) { - regex_t *rb_reg_prepare_re(VALUE re, VALUE str); - regex_t *re; - long ret; - int tmpreg; - p->regex = pattern; - re = rb_reg_prepare_re(pattern, p->str); - tmpreg = re != RREGEXP_PTR(pattern); - if (!tmpreg) RREGEXP(pattern)->usecnt++; + OnigPosition ret = rb_reg_onig_match(pattern, + p->str, + headonly ? strscan_match : strscan_search, + (void *)p, + &(p->regs)); - if (headonly) { - ret = onig_match(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - &(p->regs), - ONIG_OPTION_NONE); - } - else { - ret = onig_search(re, - match_target(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - (UChar* )CURPTR(p), - (UChar* )(CURPTR(p) + S_RESTLEN(p)), - &(p->regs), - ONIG_OPTION_NONE); - } - if (!tmpreg) RREGEXP(pattern)->usecnt--; - if (tmpreg) { - if (RREGEXP(pattern)->usecnt) { - onig_free(re); - } - else { - onig_free(RREGEXP_PTR(pattern)); - RREGEXP_PTR(pattern) = re; - } - } - - if (ret == -2) rb_raise(ScanError, "regexp buffer overflow"); - if (ret < 0) { - /* not matched */ + if (ret == ONIG_MISMATCH) { return Qnil; } } diff --git a/include/ruby/re.h b/include/ruby/re.h index 3892d6e7f2..52faeb1e98 100644 --- a/include/ruby/re.h +++ b/include/ruby/re.h @@ -18,6 +18,7 @@ #include +#include "ruby/onigmo.h" #include "ruby/regex.h" #include "ruby/internal/core/rmatch.h" #include "ruby/internal/dllexport.h" @@ -105,25 +106,28 @@ long rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int dir); VALUE rb_reg_quote(VALUE str); /** - * Exercises various checks and preprocesses so that the given regular - * expression can be applied to the given string. The preprocess here includes - * (but not limited to) for instance encoding conversion. + * Runs a regular expression match using function `match`. Performs preparation, + * error handling, and memory cleanup. * * @param[in] re Target regular expression. * @param[in] str What `re` is about to run on. + * @param[in] match The function to run to match `str` against `re`. + * @param[in] args Pointer to arguments to pass into `match`. + * @param[out] regs Registers on a successful match. * @exception rb_eArgError `re` does not fit for `str`. * @exception rb_eEncCompatError `re` and `str` are incompatible. * @exception rb_eRegexpError `re` is malformed. - * @return A preprocessesed pattern buffer ready to be applied to `str`. - * @note The return value is manages by our GC. Don't free. + * @return Match position on a successful match, `ONIG_MISMATCH` otherwise. * * @internal * - * The return type, `regex_t *`, is defined in ``, _and_ + * The type `regex_t *` is defined in ``, _and_ * _conflicts_ with POSIX's ``. We can no longer save the situation * at this point. Just don't mix the two. */ -regex_t *rb_reg_prepare_re(VALUE re, VALUE str); +OnigPosition rb_reg_onig_match(VALUE re, VALUE str, + OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), + void *args, struct re_registers *regs); /** * Duplicates a match data. This is roughly the same as `onig_region_copy()`, diff --git a/re.c b/re.c index 6581218b8e..35cdc2d810 100644 --- a/re.c +++ b/re.c @@ -1575,8 +1575,8 @@ rb_reg_prepare_enc(VALUE re, VALUE str, int warn) return enc; } -regex_t * -rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err) +static regex_t * +rb_reg_prepare_re(VALUE re, VALUE str, onig_errmsg_buffer err) { regex_t *reg = RREGEXP_PTR(re); int r; @@ -1620,11 +1620,40 @@ rb_reg_prepare_re0(VALUE re, VALUE str, onig_errmsg_buffer err) return reg; } -regex_t * -rb_reg_prepare_re(VALUE re, VALUE str) +OnigPosition +rb_reg_onig_match(VALUE re, VALUE str, + OnigPosition (*match)(regex_t *reg, VALUE str, struct re_registers *regs, void *args), + void *args, struct re_registers *regs) { onig_errmsg_buffer err = ""; - return rb_reg_prepare_re0(re, str, err); + regex_t *reg = rb_reg_prepare_re(re, str, err); + + bool tmpreg = reg != RREGEXP_PTR(re); + if (!tmpreg) RREGEXP(re)->usecnt++; + + OnigPosition result = match(reg, str, regs, args); + + if (!tmpreg) RREGEXP(re)->usecnt--; + if (tmpreg) { + if (RREGEXP(re)->usecnt) { + onig_free(reg); + } + else { + onig_free(RREGEXP_PTR(re)); + RREGEXP_PTR(re) = reg; + } + } + + if (result < 0) { + onig_region_free(regs, 0); + + if (result != ONIG_MISMATCH) { + onig_error_code_to_str((UChar*)err, (int)result); + rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); + } + } + + return result; } long @@ -1658,65 +1687,52 @@ rb_reg_adjust_startpos(VALUE re, VALUE str, long pos, int reverse) return pos; } +struct reg_onig_search_args { + long pos; + long range; +}; + +static OnigPosition +reg_onig_search(regex_t *reg, VALUE str, struct re_registers *regs, void *args_ptr) +{ + struct reg_onig_search_args *args = (struct reg_onig_search_args *)args_ptr; + const char *ptr; + long len; + RSTRING_GETMEM(str, ptr, len); + + return onig_search( + reg, + (UChar *)ptr, + (UChar *)(ptr + len), + (UChar *)(ptr + args->pos), + (UChar *)(ptr + args->range), + regs, + ONIG_OPTION_NONE); +} + /* returns byte offset */ static long rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_backref_str, VALUE *set_match) { - long result; - VALUE match; - struct re_registers regi, *regs = ®i; - char *start, *range; - long len; - regex_t *reg; - int tmpreg; - onig_errmsg_buffer err = ""; - - RSTRING_GETMEM(str, start, len); - range = start; + long len = RSTRING_LEN(str); if (pos > len || pos < 0) { rb_backref_set(Qnil); return -1; } - reg = rb_reg_prepare_re0(re, str, err); - tmpreg = reg != RREGEXP_PTR(re); - if (!tmpreg) RREGEXP(re)->usecnt++; + struct reg_onig_search_args args = { + .pos = pos, + .range = reverse ? 0 : len, + }; - MEMZERO(regs, struct re_registers, 1); - if (!reverse) { - range += len; - } - result = onig_search(reg, - (UChar*)start, - ((UChar*)(start + len)), - ((UChar*)(start + pos)), - ((UChar*)range), - regs, ONIG_OPTION_NONE); - if (!tmpreg) RREGEXP(re)->usecnt--; - if (tmpreg) { - if (RREGEXP(re)->usecnt) { - onig_free(reg); - } - else { - onig_free(RREGEXP_PTR(re)); - RREGEXP_PTR(re) = reg; - } - } - if (result < 0) { - if (regs == ®i) - onig_region_free(regs, 0); - if (result == ONIG_MISMATCH) { - rb_backref_set(Qnil); - return result; - } - else { - onig_error_code_to_str((UChar*)err, (int)result); - rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); - } - } + VALUE match = match_alloc(rb_cMatch); + struct re_registers *regs = RMATCH_REGS(match); - match = match_alloc(rb_cMatch); - memcpy(RMATCH_REGS(match), regs, sizeof(struct re_registers)); + OnigPosition result = rb_reg_onig_match(re, str, reg_onig_search, &args, regs); + if (result == ONIG_MISMATCH) { + rb_backref_set(Qnil); + return ONIG_MISMATCH; + } if (set_backref_str) { RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str)); @@ -1748,69 +1764,35 @@ rb_reg_search(VALUE re, VALUE str, long pos, int reverse) return rb_reg_search0(re, str, pos, reverse, 1); } -bool -rb_reg_start_with_p(VALUE re, VALUE str) +static OnigPosition +reg_onig_match(regex_t *reg, VALUE str, struct re_registers *regs, void *_) { - long result; - VALUE match; - struct re_registers regi, *regs = ®i; - regex_t *reg; - int tmpreg; - onig_errmsg_buffer err = ""; - - reg = rb_reg_prepare_re0(re, str, err); - tmpreg = reg != RREGEXP_PTR(re); - if (!tmpreg) RREGEXP(re)->usecnt++; - - match = rb_backref_get(); - if (!NIL_P(match)) { - if (FL_TEST(match, MATCH_BUSY)) { - match = Qnil; - } - else { - regs = RMATCH_REGS(match); - } - } - if (NIL_P(match)) { - MEMZERO(regs, struct re_registers, 1); - } const char *ptr; long len; RSTRING_GETMEM(str, ptr, len); - result = onig_match(reg, - (UChar*)(ptr), - ((UChar*)(ptr + len)), - (UChar*)(ptr), - regs, ONIG_OPTION_NONE); - if (!tmpreg) RREGEXP(re)->usecnt--; - if (tmpreg) { - if (RREGEXP(re)->usecnt) { - onig_free(reg); - } - else { - onig_free(RREGEXP_PTR(re)); - RREGEXP_PTR(re) = reg; - } - } - if (result < 0) { - if (regs == ®i) - onig_region_free(regs, 0); - if (result == ONIG_MISMATCH) { - rb_backref_set(Qnil); - return false; - } - else { - onig_error_code_to_str((UChar*)err, (int)result); - rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); - } + + return onig_match( + reg, + (UChar *)ptr, + (UChar *)(ptr + len), + (UChar *)ptr, + regs, + ONIG_OPTION_NONE); +} + +bool +rb_reg_start_with_p(VALUE re, VALUE str) +{ + VALUE match = rb_backref_get(); + if (NIL_P(match) || FL_TEST(match, MATCH_BUSY)) { + match = match_alloc(rb_cMatch); } - if (NIL_P(match)) { - int err; - match = match_alloc(rb_cMatch); - err = rb_reg_region_copy(RMATCH_REGS(match), regs); - onig_region_free(regs, 0); - if (err) rb_memerror(); + struct re_registers *regs = RMATCH_REGS(match); + + if (rb_reg_onig_match(re, str, reg_onig_match, NULL, regs) == ONIG_MISMATCH) { + rb_backref_set(Qnil); + return false; } RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str)); @@ -3784,12 +3766,6 @@ rb_reg_match_m_p(int argc, VALUE *argv, VALUE re) VALUE rb_reg_match_p(VALUE re, VALUE str, long pos) { - regex_t *reg; - onig_errmsg_buffer err = ""; - OnigPosition result; - const UChar *start, *end; - int tmpreg; - if (NIL_P(str)) return Qfalse; str = SYMBOL_P(str) ? rb_sym2str(str) : StringValue(str); if (pos) { @@ -3804,33 +3780,13 @@ rb_reg_match_p(VALUE re, VALUE str, long pos) pos = beg - RSTRING_PTR(str); } } - reg = rb_reg_prepare_re0(re, str, err); - tmpreg = reg != RREGEXP_PTR(re); - if (!tmpreg) RREGEXP(re)->usecnt++; - start = ((UChar*)RSTRING_PTR(str)); - end = start + RSTRING_LEN(str); - result = onig_search(reg, start, end, start + pos, end, - NULL, ONIG_OPTION_NONE); - if (!tmpreg) RREGEXP(re)->usecnt--; - if (tmpreg) { - if (RREGEXP(re)->usecnt) { - onig_free(reg); - } - else { - onig_free(RREGEXP_PTR(re)); - RREGEXP_PTR(re) = reg; - } - } - if (result < 0) { - if (result == ONIG_MISMATCH) { - return Qfalse; - } - else { - onig_error_code_to_str((UChar*)err, (int)result); - rb_reg_raise(RREGEXP_SRC_PTR(re), RREGEXP_SRC_LEN(re), err, re); - } - } - return Qtrue; + + struct reg_onig_search_args args = { + .pos = pos, + .range = RSTRING_LEN(str), + }; + + return rb_reg_onig_match(re, str, reg_onig_search, &args, NULL) == ONIG_MISMATCH ? Qfalse : Qtrue; } /* diff --git a/regexec.c b/regexec.c index b0f73e41d5..ba0e7c2cd2 100644 --- a/regexec.c +++ b/regexec.c @@ -888,12 +888,16 @@ onig_region_free(OnigRegion* r, int free_self) if (r->allocated > 0) { xfree(r->beg); xfree(r->end); - r->allocated = 0; } #ifdef USE_CAPTURE_HISTORY history_root_free(r); #endif - if (free_self) xfree(r); + if (free_self) { + xfree(r); + } + else { + memset(r, 0, sizeof(OnigRegion)); + } } }