2023-09-27 19:24:48 +03:00
|
|
|
#include "prism/regexp.h"
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* This is the parser that is going to handle parsing regular expressions.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
typedef struct {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** A pointer to the start of the source that we are parsing. */
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *start;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A pointer to the current position in the source. */
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *cursor;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A pointer to the end of the source that we are parsing. */
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *end;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A list of named captures that we've found. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_string_list_t *named_captures;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** Whether the encoding has changed from the default. */
|
2023-08-03 20:25:38 +03:00
|
|
|
bool encoding_changed;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** The encoding of the source. */
|
2023-12-01 04:59:00 +03:00
|
|
|
const pm_encoding_t *encoding;
|
2023-09-27 19:24:48 +03:00
|
|
|
} pm_regexp_parser_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* This initializes a new parser with the given source.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static void
|
2023-12-01 04:59:00 +03:00
|
|
|
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
|
2023-09-27 19:24:48 +03:00
|
|
|
*parser = (pm_regexp_parser_t) {
|
2023-06-20 18:53:02 +03:00
|
|
|
.start = start,
|
|
|
|
.cursor = start,
|
|
|
|
.end = end,
|
2023-08-03 20:25:38 +03:00
|
|
|
.named_captures = named_captures,
|
|
|
|
.encoding_changed = encoding_changed,
|
|
|
|
.encoding = encoding
|
2023-06-20 18:53:02 +03:00
|
|
|
};
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* This appends a new string to the list of named captures.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static void
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
|
|
|
|
pm_string_t string;
|
|
|
|
pm_string_shared_init(&string, start, end);
|
|
|
|
pm_string_list_append(parser->named_captures, &string);
|
|
|
|
pm_string_free(&string);
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Returns true if the next character is the end of the source.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static inline bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return parser->cursor >= parser->end;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Optionally accept a char and consume it if it exists.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static inline bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
|
|
|
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
2023-06-20 18:53:02 +03:00
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Expect a character to be present and consume it.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static inline bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
|
|
|
|
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
|
2023-06-20 18:53:02 +03:00
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* This advances the current token to the next instance of the given character.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
|
|
|
|
if (pm_regexp_char_is_eof(parser)) {
|
2023-06-23 17:43:28 +03:00
|
|
|
return false;
|
|
|
|
}
|
2023-08-03 20:25:38 +03:00
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
const uint8_t *end = (const uint8_t *) pm_memchr(parser->cursor, value, (size_t) (parser->end - parser->cursor), parser->encoding_changed, parser->encoding);
|
2023-06-20 18:53:02 +03:00
|
|
|
if (end == NULL) {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
parser->cursor = end + 1;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Range quantifiers are a special class of quantifiers that look like
|
|
|
|
*
|
|
|
|
* * {digit}
|
|
|
|
* * {digit,}
|
|
|
|
* * {digit,digit}
|
|
|
|
* * {,digit}
|
|
|
|
*
|
|
|
|
* Unfortunately, if there are any spaces in between, then this just becomes a
|
|
|
|
* regular character match expression and we have to backtrack. So when this
|
|
|
|
* function first starts running, we'll create a "save" point and then attempt
|
|
|
|
* to parse the quantifier. If it fails, we'll restore the save point and
|
|
|
|
* return.
|
|
|
|
*
|
|
|
|
* The properly track everything, we're going to build a little state machine.
|
|
|
|
* It looks something like the following:
|
|
|
|
*
|
2023-11-04 20:14:26 +03:00
|
|
|
* +-------+ +---------+ ------------+
|
|
|
|
* ---- lbrace ---> | start | ---- digit ---> | minimum | |
|
|
|
|
* +-------+ +---------+ <--- digit -+
|
|
|
|
* | | |
|
|
|
|
* +-------+ | | rbrace
|
|
|
|
* | comma | <----- comma +---- comma -------+ |
|
|
|
|
* +-------+ V V
|
|
|
|
* | +---------+ +---------+
|
|
|
|
* +-- digit --> | maximum | -- rbrace --> || final ||
|
|
|
|
* +---------+ +---------+
|
|
|
|
* | ^
|
|
|
|
* +- digit -+
|
2023-10-31 18:35:56 +03:00
|
|
|
*
|
|
|
|
* Note that by the time we've hit this function, the lbrace has already been
|
|
|
|
* consumed so we're in the start state.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *savepoint = parser->cursor;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
enum {
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_REGEXP_RANGE_QUANTIFIER_STATE_START,
|
|
|
|
PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM,
|
|
|
|
PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM,
|
|
|
|
PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA
|
|
|
|
} state = PM_REGEXP_RANGE_QUANTIFIER_STATE_START;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
switch (state) {
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_START:
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM;
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
case ',':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA;
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
parser->cursor = savepoint;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MINIMUM:
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
|
|
|
parser->cursor++;
|
|
|
|
break;
|
|
|
|
case ',':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
case '}':
|
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
parser->cursor = savepoint;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_COMMA:
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
state = PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM;
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
default:
|
|
|
|
parser->cursor = savepoint;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_RANGE_QUANTIFIER_STATE_MAXIMUM:
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9':
|
|
|
|
parser->cursor++;
|
|
|
|
break;
|
|
|
|
case '}':
|
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
default:
|
|
|
|
parser->cursor = savepoint;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* quantifier : star-quantifier
|
|
|
|
* | plus-quantifier
|
|
|
|
* | optional-quantifier
|
|
|
|
* | range-quantifier
|
|
|
|
* | <empty>
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
|
2023-10-23 21:31:30 +03:00
|
|
|
if (pm_regexp_char_is_eof(parser)) return true;
|
|
|
|
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '*':
|
|
|
|
case '+':
|
|
|
|
case '?':
|
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
case '{':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_range_quantifier(parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
default:
|
|
|
|
// In this case there is no quantifier.
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
|
|
|
|
if (!pm_regexp_char_expect(parser, ':')) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_accept(parser, '^');
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
return (
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_find(parser, ':') &&
|
|
|
|
pm_regexp_char_expect(parser, ']') &&
|
|
|
|
pm_regexp_char_expect(parser, ']')
|
2023-06-20 18:53:02 +03:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Forward declaration because character sets can be nested.
|
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* match-char-set : '[' '^'? (match-range | match-char)* ']'
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
|
|
|
|
pm_regexp_char_accept(parser, '^');
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ']') {
|
2023-06-20 18:53:02 +03:00
|
|
|
switch (*parser->cursor++) {
|
|
|
|
case '[':
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_lbracket(parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
case '\\':
|
2023-09-27 19:24:48 +03:00
|
|
|
if (!pm_regexp_char_is_eof(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
parser->cursor++;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
// do nothing, we've already advanced the cursor
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_char_expect(parser, ']');
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* A left bracket can either mean a POSIX class or a character set.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *reset = parser->cursor;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
if ((parser->cursor + 2 < parser->end) && parser->cursor[0] == '[' && parser->cursor[1] == ':') {
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
if (pm_regexp_parse_posix_class(parser)) return true;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
parser->cursor = reset;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_character_set(parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Forward declaration here since parsing groups needs to go back up the grammar
|
|
|
|
// to parse expressions within them.
|
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
 * The states that a single option letter can be in while the options of a
 * regular expression (or of a group within it) are being parsed.
 */
typedef enum {
    /** The letter is not an option; it can be neither added nor removed. */
    PM_REGEXP_OPTION_STATE_INVALID,

    /** The option has not been seen yet and may be added or removed. */
    PM_REGEXP_OPTION_STATE_TOGGLEABLE,

    /** The option has not been seen yet and may be added but not removed. */
    PM_REGEXP_OPTION_STATE_ADDABLE,

    /** The option has been added. */
    PM_REGEXP_OPTION_STATE_ADDED,

    /** The option has been removed; it cannot be added again. */
    PM_REGEXP_OPTION_STATE_REMOVED
} pm_regexp_option_state_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-06-26 16:51:24 +03:00
|
|
|
// Option letters are tracked in a table with one slot per character in the
// inclusive range from the minimum to the maximum slot character. A letter's
// slot is its offset from the minimum.
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
|
2023-06-26 16:51:24 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* This is the set of options that are configurable on the regular expression.
|
|
|
|
*/
|
2023-07-31 21:17:17 +03:00
|
|
|
typedef struct {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The current state of each option. */
|
2023-09-27 19:24:48 +03:00
|
|
|
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
|
|
|
|
} pm_regexp_options_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Initialize a new set of options to their default values.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static void
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_options_init(pm_regexp_options_t *options) {
|
|
|
|
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
|
|
|
|
options->values['i' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
|
|
|
options->values['m' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
|
|
|
options->values['x' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_TOGGLEABLE;
|
|
|
|
options->values['d' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
|
|
|
options->values['a' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
|
|
|
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Attempt to add the given option to the set of options. Returns true if it was
|
|
|
|
* added, false if it was already present.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
|
|
|
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
|
|
|
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
2023-06-26 16:51:24 +03:00
|
|
|
|
|
|
|
switch (options->values[key]) {
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_OPTION_STATE_INVALID:
|
|
|
|
case PM_REGEXP_OPTION_STATE_REMOVED:
|
2023-06-26 16:51:24 +03:00
|
|
|
return false;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
|
|
|
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
|
|
|
options->values[key] = PM_REGEXP_OPTION_STATE_ADDED;
|
2023-06-26 16:51:24 +03:00
|
|
|
return true;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_OPTION_STATE_ADDED:
|
2023-06-26 16:51:24 +03:00
|
|
|
return true;
|
|
|
|
}
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
2023-06-26 16:51:24 +03:00
|
|
|
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Attempt to remove the given option from the set of options. Returns true if
|
|
|
|
* it was removed, false if it was already absent.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
|
|
|
|
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
|
|
|
|
key = (uint8_t) (key - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM);
|
2023-06-26 16:51:24 +03:00
|
|
|
|
|
|
|
switch (options->values[key]) {
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_OPTION_STATE_INVALID:
|
|
|
|
case PM_REGEXP_OPTION_STATE_ADDABLE:
|
2023-06-26 16:51:24 +03:00
|
|
|
return false;
|
2023-09-27 19:24:48 +03:00
|
|
|
case PM_REGEXP_OPTION_STATE_TOGGLEABLE:
|
|
|
|
case PM_REGEXP_OPTION_STATE_ADDED:
|
|
|
|
case PM_REGEXP_OPTION_STATE_REMOVED:
|
|
|
|
options->values[key] = PM_REGEXP_OPTION_STATE_REMOVED;
|
2023-06-26 16:51:24 +03:00
|
|
|
return true;
|
|
|
|
}
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
2023-06-26 16:51:24 +03:00
|
|
|
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Groups can have quite a few different patterns for syntax. They basically
|
|
|
|
* just wrap a set of expressions, but they can potentially have options after a
|
|
|
|
* question mark. If there _isn't_ a question mark, then it's just a set of
|
|
|
|
* expressions. If there _is_, then here are the options:
|
|
|
|
*
|
|
|
|
* * (?#...) - inline comments
|
|
|
|
* * (?:subexp) - non-capturing group
|
|
|
|
* * (?=subexp) - positive lookahead
|
|
|
|
* * (?!subexp) - negative lookahead
|
|
|
|
* * (?>subexp) - atomic group
|
|
|
|
* * (?~subexp) - absence operator
|
|
|
|
* * (?<=subexp) - positive lookbehind
|
|
|
|
* * (?<!subexp) - negative lookbehind
|
|
|
|
* * (?<name>subexp) - named capturing group
|
|
|
|
* * (?'name'subexp) - named capturing group
|
|
|
|
* * (?(cond)yes-subexp) - conditional expression
|
|
|
|
* * (?(cond)yes-subexp|no-subexp) - conditional expression
|
|
|
|
* * (?imxdau-imx) - turn on and off configuration
|
|
|
|
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
|
2023-06-20 18:53:02 +03:00
|
|
|
// First, parse any options for the group.
|
2023-09-27 19:24:48 +03:00
|
|
|
if (pm_regexp_char_accept(parser, '?')) {
|
|
|
|
if (pm_regexp_char_is_eof(parser)) {
|
2023-06-23 12:53:38 +03:00
|
|
|
return false;
|
|
|
|
}
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_options_t options;
|
|
|
|
pm_regexp_options_init(&options);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '#': { // inline comments
|
2023-08-03 20:38:54 +03:00
|
|
|
if (parser->encoding_changed && parser->encoding->multibyte) {
|
2023-08-03 22:49:29 +03:00
|
|
|
bool escaped = false;
|
|
|
|
|
2023-08-03 20:38:54 +03:00
|
|
|
// Here we're going to take a slow path and iterate through
|
|
|
|
// each multibyte character to find the close paren. We do
|
|
|
|
// this because \ can be a trailing byte in some encodings.
|
|
|
|
while (parser->cursor < parser->end) {
|
2023-08-03 22:49:29 +03:00
|
|
|
if (!escaped && *parser->cursor == ')') {
|
2023-08-03 20:38:54 +03:00
|
|
|
parser->cursor++;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
size_t width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
|
|
|
if (width == 0) return false;
|
|
|
|
|
2023-08-03 22:49:29 +03:00
|
|
|
escaped = (width == 1) && (*parser->cursor == '\\');
|
2023-08-03 20:38:54 +03:00
|
|
|
parser->cursor += width;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
} else {
|
|
|
|
// Here we can take the fast path and use memchr to find the
|
|
|
|
// next ) because we are safe checking backward for \ since
|
|
|
|
// it cannot be a trailing character.
|
2023-09-27 19:24:48 +03:00
|
|
|
bool found = pm_regexp_char_find(parser, ')');
|
2023-08-03 20:38:54 +03:00
|
|
|
|
|
|
|
while (found && (parser->start <= parser->cursor - 2) && (*(parser->cursor - 2) == '\\')) {
|
2023-09-27 19:24:48 +03:00
|
|
|
found = pm_regexp_char_find(parser, ')');
|
2023-08-03 20:38:54 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
return found;
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
case ':': // non-capturing group
|
|
|
|
case '=': // positive lookahead
|
|
|
|
case '!': // negative lookahead
|
|
|
|
case '>': // atomic group
|
|
|
|
case '~': // absence operator
|
|
|
|
parser->cursor++;
|
|
|
|
break;
|
|
|
|
case '<':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
if (pm_regexp_char_is_eof(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
switch (*parser->cursor) {
|
|
|
|
case '=': // positive lookbehind
|
|
|
|
case '!': // negative lookbehind
|
|
|
|
parser->cursor++;
|
|
|
|
break;
|
|
|
|
default: { // named capture group
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *start = parser->cursor;
|
2023-09-27 19:24:48 +03:00
|
|
|
if (!pm_regexp_char_find(parser, '>')) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case '\'': { // named capture group
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *start = ++parser->cursor;
|
2023-09-27 19:24:48 +03:00
|
|
|
if (!pm_regexp_char_find(parser, '\'')) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parser_named_capture(parser, start, parser->cursor - 1);
|
2023-06-20 18:53:02 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
case '(': // conditional expression
|
2023-09-27 19:24:48 +03:00
|
|
|
if (!pm_regexp_char_find(parser, ')')) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
case 'i': case 'm': case 'x': case 'd': case 'a': case 'u': // options
|
2023-09-27 19:24:48 +03:00
|
|
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != '-' && *parser->cursor != ':' && *parser->cursor != ')') {
|
|
|
|
if (!pm_regexp_options_add(&options, *parser->cursor)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
parser->cursor++;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
if (pm_regexp_char_is_eof(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// If we hit a -, then we're done parsing options.
|
|
|
|
if (*parser->cursor != '-') break;
|
|
|
|
|
|
|
|
// Otherwise, fallthrough to the - case.
|
|
|
|
/* fallthrough */
|
|
|
|
case '-':
|
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ':' && *parser->cursor != ')') {
|
|
|
|
if (!pm_regexp_options_remove(&options, *parser->cursor)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
parser->cursor++;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
if (pm_regexp_char_is_eof(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
break;
|
|
|
|
default:
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now, parse the expressions within this group.
|
2023-09-27 19:24:48 +03:00
|
|
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')') {
|
|
|
|
if (!pm_regexp_parse_expression(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_accept(parser, '|');
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
// Finally, make sure we have a closing parenthesis.
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_char_expect(parser, ')');
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* item : anchor
|
|
|
|
* | match-posix-class
|
|
|
|
* | match-char-set
|
|
|
|
* | match-char-class
|
|
|
|
* | match-char-prop
|
|
|
|
* | match-char
|
|
|
|
* | match-any
|
|
|
|
* | group
|
|
|
|
* | quantified
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
|
2024-02-19 00:36:16 +03:00
|
|
|
switch (*parser->cursor) {
|
2023-06-20 18:53:02 +03:00
|
|
|
case '^':
|
|
|
|
case '$':
|
2024-02-19 00:36:16 +03:00
|
|
|
parser->cursor++;
|
2023-06-20 18:53:02 +03:00
|
|
|
return true;
|
|
|
|
case '\\':
|
2024-02-19 00:36:16 +03:00
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
if (!pm_regexp_char_is_eof(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
parser->cursor++;
|
|
|
|
}
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_quantifier(parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
case '(':
|
2024-02-19 00:36:16 +03:00
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_group(parser) && pm_regexp_parse_quantifier(parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
case '[':
|
2024-02-19 00:36:16 +03:00
|
|
|
parser->cursor++;
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_lbracket(parser) && pm_regexp_parse_quantifier(parser);
|
2024-02-19 00:36:16 +03:00
|
|
|
default: {
|
|
|
|
size_t width;
|
|
|
|
if (!parser->encoding_changed) {
|
|
|
|
width = pm_encoding_utf_8_char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
|
|
|
} else {
|
|
|
|
width = parser->encoding->char_width(parser->cursor, (ptrdiff_t) (parser->end - parser->cursor));
|
|
|
|
}
|
|
|
|
|
|
|
|
if (width == 0) return false; // TODO: add appropriate error
|
|
|
|
parser->cursor += width;
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
return pm_regexp_parse_quantifier(parser);
|
2024-02-19 00:36:16 +03:00
|
|
|
}
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* expression : item+
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
|
|
|
|
if (!pm_regexp_parse_item(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
while (!pm_regexp_char_is_eof(parser) && *parser->cursor != ')' && *parser->cursor != '|') {
|
|
|
|
if (!pm_regexp_parse_item(parser)) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* pattern : EOF
|
|
|
|
* | expression EOF
|
|
|
|
* | expression '|' pattern
|
|
|
|
* ;
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
static bool
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
|
2023-06-20 18:53:02 +03:00
|
|
|
return (
|
|
|
|
(
|
|
|
|
// Exit early if the pattern is empty.
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_is_eof(parser) ||
|
2023-06-20 18:53:02 +03:00
|
|
|
// Parse the first expression in the pattern.
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parse_expression(parser)
|
2023-06-20 18:53:02 +03:00
|
|
|
) &&
|
|
|
|
(
|
|
|
|
// Return now if we've parsed the entire pattern.
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_char_is_eof(parser) ||
|
2023-06-20 18:53:02 +03:00
|
|
|
// Otherwise, we should have a pipe character.
|
2023-09-27 19:24:48 +03:00
|
|
|
(pm_regexp_char_expect(parser, '|') && pm_regexp_parse_pattern(parser))
|
2023-06-20 18:53:02 +03:00
|
|
|
)
|
|
|
|
);
|
|
|
|
}
|
|
|
|
|
2023-10-31 18:35:56 +03:00
|
|
|
/**
|
|
|
|
* Parse a regular expression and extract the names of all of the named capture
|
|
|
|
* groups.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PRISM_EXPORTED_FUNCTION bool
|
2023-12-01 04:59:00 +03:00
|
|
|
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, const pm_encoding_t *encoding) {
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_regexp_parser_t parser;
|
|
|
|
pm_regexp_parser_init(&parser, source, source + size, named_captures, encoding_changed, encoding);
|
|
|
|
return pm_regexp_parse_pattern(&parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
}
|