[ruby/prism] Documentation for diagnostics and regexp

https://github.com/ruby/prism/commit/16e0579044
This commit is contained in:
Kevin Newton 2023-10-31 11:35:56 -04:00
Родитель affa6714bc
Коммит 1de05631b5
4 изменённых файлов: 225 добавлений и 152 удалений

Просмотреть файл

@ -1,56 +1,55 @@
#include "prism/diagnostic.h"
/*
## Message composition
When composing an error message, use sentence fragments.
Try describing the property of the code that caused the error, rather than the rule that is being
violated. It may help to use a fragment that completes a sentence beginning, "The parser
encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful
context) after a semicolon.
For example:, instead of "Control escape sequence cannot be doubled", prefer:
> "Invalid control escape sequence; control cannot be repeated"
In some cases, where the failure is more general or syntax expectations are violated, it may make
more sense to use a fragment that completes a sentence beginning, "The parser ...".
For example:
> "Expected an expression after `(`"
> "Cannot parse the expression"
## Message style guide
- Use articles like "a", "an", and "the" when appropriate.
- e.g., prefer "Cannot parse the expression" to "Cannot parse expression".
- Use the common name for tokens and nodes.
- e.g., prefer "keyword splat" to "assoc splat"
- e.g., prefer "embedded document" to "embdoc"
- Capitalize the initial word of the message.
- Use back ticks around token literals
- e.g., "Expected a `=>` between the hash key and value"
- Do not use `.` or other punctuation at the end of the message.
- Do not use contractions like "can't". Prefer "cannot" to "can not".
- For tokens that can have multiple meanings, reference the token and its meaning.
- e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument"
## Error names (PM_ERR_*)
- When appropriate, prefer node name to token name.
- e.g., prefer "SPLAT" to "STAR" in the context of argument parsing.
- Prefer token name to common name.
- e.g., prefer "STAR" to "ASTERISK".
- Try to order the words in the name from more general to more specific,
- e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER".
- When in doubt, look for similar patterns and name them so that they are grouped when lexically
sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example.
*/
/**
* ## Message composition
*
* When composing an error message, use sentence fragments.
*
* Try describing the property of the code that caused the error, rather than the rule that is being
* violated. It may help to use a fragment that completes a sentence beginning, "The parser
* encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful
* context) after a semicolon.
*
* For example, instead of "Control escape sequence cannot be doubled", prefer:
*
* > "Invalid control escape sequence; control cannot be repeated"
*
* In some cases, where the failure is more general or syntax expectations are violated, it may make
* more sense to use a fragment that completes a sentence beginning, "The parser ...".
*
* For example:
*
* > "Expected an expression after `(`"
* > "Cannot parse the expression"
*
*
* ## Message style guide
*
* - Use articles like "a", "an", and "the" when appropriate.
* - e.g., prefer "Cannot parse the expression" to "Cannot parse expression".
* - Use the common name for tokens and nodes.
* - e.g., prefer "keyword splat" to "assoc splat"
* - e.g., prefer "embedded document" to "embdoc"
* - Capitalize the initial word of the message.
* - Use back ticks around token literals
* - e.g., "Expected a `=>` between the hash key and value"
* - Do not use `.` or other punctuation at the end of the message.
* - Do not use contractions like "can't". Prefer "cannot" to "can not".
* - For tokens that can have multiple meanings, reference the token and its meaning.
* - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument"
*
*
* ## Error names (PM_ERR_*)
*
* - When appropriate, prefer node name to token name.
* - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing.
* - Prefer token name to common name.
* - e.g., prefer "STAR" to "ASTERISK".
* - Try to order the words in the name from more general to more specific,
* - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER".
* - When in doubt, look for similar patterns and name them so that they are grouped when lexically
* sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example.
*/
static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
[PM_ERR_ALIAS_ARGUMENT] = "Invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable",
[PM_ERR_AMPAMPEQ_MULTI_ASSIGN] = "Unexpected `&&=` in a multiple assignment",
@ -263,7 +262,9 @@ pm_diagnostic_message(pm_diagnostic_id_t diag_id) {
return message;
}
// Append an error to the given list of diagnostic.
/**
* Append a diagnostic to the given list of diagnostics.
*/
bool
pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) calloc(sizeof(pm_diagnostic_t), 1);
@ -274,7 +275,9 @@ pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *
return true;
}
// Deallocate the internal state of the given diagnostic list.
/**
* Deallocate the internal state of the given diagnostic list.
*/
void
pm_diagnostic_list_free(pm_list_t *list) {
pm_list_node_t *node, *next;

Просмотреть файл

@ -20,6 +20,10 @@ typedef struct {
const char *message;
} pm_diagnostic_t;
/**
* The diagnostic IDs of all of the diagnostics, used to communicate the types
* of errors between the parser and the user.
*/
typedef enum {
PM_ERR_ALIAS_ARGUMENT,
PM_ERR_AMPAMPEQ_MULTI_ASSIGN,
@ -223,14 +227,27 @@ typedef enum {
PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS,
PM_WARN_AMBIGUOUS_PREFIX_STAR,
PM_WARN_AMBIGUOUS_SLASH,
/* This must be the last member. */
PM_DIAGNOSTIC_ID_LEN,
} pm_diagnostic_id_t;
// Append a diagnostic to the given list of diagnostics.
/**
* Append a diagnostic to the given list of diagnostics.
*
* @param list The list to append to.
* @param start The start of the diagnostic.
* @param end The end of the diagnostic.
* @param diag_id The diagnostic ID.
* @return Whether the diagnostic was successfully appended.
*/
bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id);
// Deallocate the internal state of the given diagnostic list.
/**
* Deallocate the internal state of the given diagnostic list.
*
* @param list The list to deallocate.
*/
void pm_diagnostic_list_free(pm_list_t *list);
#endif

Просмотреть файл

@ -1,6 +1,8 @@
#include "prism/regexp.h"
// This is the parser that is going to handle parsing regular expressions.
/**
* This is the parser that is going to handle parsing regular expressions.
*/
typedef struct {
const uint8_t *start;
const uint8_t *cursor;
@ -10,7 +12,9 @@ typedef struct {
pm_encoding_t *encoding;
} pm_regexp_parser_t;
// This initializes a new parser with the given source.
/**
* This initializes a new parser with the given source.
*/
static void
pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
*parser = (pm_regexp_parser_t) {
@ -23,7 +27,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
};
}
// This appends a new string to the list of named captures.
/**
* This appends a new string to the list of named captures.
*/
static void
pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
pm_string_t string;
@ -32,13 +38,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
pm_string_free(&string);
}
// Returns true if the next character is the end of the source.
/**
 * Check whether the parser has run out of input, i.e. whether its cursor has
 * reached or moved past the end pointer.
 *
 * @param parser The regular expression parser to inspect.
 * @return false while the cursor is still before the end, true otherwise.
 */
static inline bool
pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
    if (parser->cursor < parser->end) {
        return false;
    }

    return true;
}
// Optionally accept a char and consume it if it exists.
/**
* Optionally accept a char and consume it if it exists.
*/
static inline bool
pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@ -48,7 +58,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
return false;
}
// Expect a character to be present and consume it.
/**
* Expect a character to be present and consume it.
*/
static inline bool
pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@ -58,7 +70,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
return false;
}
// This advances the current token to the next instance of the given character.
/**
* This advances the current token to the next instance of the given character.
*/
static bool
pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
if (pm_regexp_char_is_eof(parser)) {
@ -74,37 +88,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
return true;
}
// Range quantifiers are a special class of quantifiers that look like
//
// * {digit}
// * {digit,}
// * {digit,digit}
// * {,digit}
//
// Unfortunately, if there are any spaces in between, then this just becomes a
// regular character match expression and we have to backtrack. So when this
// function first starts running, we'll create a "save" point and then attempt
// to parse the quantifier. If it fails, we'll restore the save point and
// return.
//
// The properly track everything, we're going to build a little state machine.
// It looks something like the following:
//
// ┌───────┐ ┌─────────┐ ────────────┐
// ──── lbrace ───> │ start │ ──── digit ───> │ minimum │ │
// └───────┘ └─────────┘ <─── digit ─┘
// │ │ │
// ┌───────┐ │ │ rbrace
// │ comma │ <───── comma ┌──── comma ───────┘ │
// └───────┘ V V
// │ ┌─────────┐ ┌─────────┐
// └── digit ──> │ maximum │ ── rbrace ──> │| final |│
// └─────────┘ └─────────┘
// │ ^
// └─ digit ─┘
//
// Note that by the time we've hit this function, the lbrace has already been
// consumed so we're in the start state.
/**
* Range quantifiers are a special class of quantifiers that look like
*
* * {digit}
* * {digit,}
* * {digit,digit}
* * {,digit}
*
* Unfortunately, if there are any spaces in between, then this just becomes a
* regular character match expression and we have to backtrack. So when this
* function first starts running, we'll create a "save" point and then attempt
* to parse the quantifier. If it fails, we'll restore the save point and
* return.
*
* To properly track everything, we're going to build a little state machine.
* It looks something like the following:
*
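*                  ┌───────┐                 ┌─────────┐ ────────────┐
* ──── lbrace ───> │ start │ ──── digit ───> │ minimum │             │
*                  └───────┘                 └─────────┘ <─── digit ─┘
*                      │                       │     │
* ┌───────┐            │                       │  rbrace
* │ comma │ <── comma ─┘       ┌──── comma ────┘     │
* └───────┘                    V                     V
*     │                   ┌─────────┐             ┌─────────┐
*     └───── digit ─────> │ maximum │ ─ rbrace ─> │| final |│
*                         └─────────┘             └─────────┘
*                         │         ^
*                         └─ digit ─┘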
*
* Note that by the time we've hit this function, the lbrace has already been
* consumed so we're in the start state.
*/
static bool
pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
const uint8_t *savepoint = parser->cursor;
@ -180,12 +196,14 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
return true;
}
// quantifier : star-quantifier
// | plus-quantifier
// | optional-quantifier
// | range-quantifier
// | <empty>
// ;
/**
* quantifier : star-quantifier
* | plus-quantifier
* | optional-quantifier
* | range-quantifier
* | <empty>
* ;
*/
static bool
pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
if (pm_regexp_char_is_eof(parser)) return true;
@ -205,8 +223,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
}
}
// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
// ;
/**
* match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
* ;
*/
static bool
pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
if (!pm_regexp_char_expect(parser, ':')) {
@ -226,8 +246,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
static bool
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);
// match-char-set : '[' '^'? (match-range | match-char)* ']'
// ;
/**
* match-char-set : '[' '^'? (match-range | match-char)* ']'
* ;
*/
static bool
pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
pm_regexp_char_accept(parser, '^');
@ -251,7 +273,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
return pm_regexp_char_expect(parser, ']');
}
// A left bracket can either mean a POSIX class or a character set.
/**
* A left bracket can either mean a POSIX class or a character set.
*/
static bool
pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
const uint8_t *reset = parser->cursor;
@ -271,8 +295,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
static bool
pm_regexp_parse_expression(pm_regexp_parser_t *parser);
// These are the states of the options that are configurable on the regular
// expression (or from within a group).
/**
* These are the states of the options that are configurable on the regular
* expression (or from within a group).
*/
typedef enum {
PM_REGEXP_OPTION_STATE_INVALID,
PM_REGEXP_OPTION_STATE_TOGGLEABLE,
@ -283,16 +309,21 @@ typedef enum {
// These are the options that are configurable on the regular expression (or
// from within a group).
/** The lowest option character that has a slot in the option state table. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
/** The highest option character that has a slot in the option state table. */
#define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
/**
 * The number of slots needed to cover every character from the minimum through
 * the maximum, inclusive. A character's slot index is its value minus
 * PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM.
 */
#define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)
// This is the set of options that are configurable on the regular expression.
/**
 * This is the set of options that are configurable on the regular expression.
 * Each option character in the supported range owns one slot that records the
 * state of that option.
 */
typedef struct {
/**
 * One state value per option character, indexed by the character minus
 * PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM. Slots hold PM_REGEXP_OPTION_STATE_*
 * values.
 */
uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
} pm_regexp_options_t;
// Initialize a new set of options to their default values.
/**
* Initialize a new set of options to their default values.
*/
static void
pm_regexp_options_init(pm_regexp_options_t *options) {
memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
@ -304,8 +335,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
}
// Attempt to add the given option to the set of options. Returns true if it was
// added, false if it was already present.
/**
* Attempt to add the given option to the set of options. Returns true if it was
* added, false if it was already present.
*/
static bool
pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@ -327,8 +360,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
return false;
}
// Attempt to remove the given option from the set of options. Returns true if
// it was removed, false if it was already absent.
/**
* Attempt to remove the given option from the set of options. Returns true if
* it was removed, false if it was already absent.
*/
static bool
pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@ -349,26 +384,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
return false;
}
// Groups can have quite a few different patterns for syntax. They basically
// just wrap a set of expressions, but they can potentially have options after a
// question mark. If there _isn't_ a question mark, then it's just a set of
// expressions. If there _is_, then here are the options:
//
// * (?#...) - inline comments
// * (?:subexp) - non-capturing group
// * (?=subexp) - positive lookahead
// * (?!subexp) - negative lookahead
// * (?>subexp) - atomic group
// * (?~subexp) - absence operator
// * (?<=subexp) - positive lookbehind
// * (?<!subexp) - negative lookbehind
// * (?<name>subexp) - named capturing group
// * (?'name'subexp) - named capturing group
// * (?(cond)yes-subexp) - conditional expression
// * (?(cond)yes-subexp|no-subexp) - conditional expression
// * (?imxdau-imx) - turn on and off configuration
// * (?imxdau-imx:subexp) - turn on and off configuration for an expression
//
/**
* Groups can have quite a few different patterns for syntax. They basically
* just wrap a set of expressions, but they can potentially have options after a
* question mark. If there _isn't_ a question mark, then it's just a set of
* expressions. If there _is_, then here are the options:
*
* * (?#...) - inline comments
* * (?:subexp) - non-capturing group
* * (?=subexp) - positive lookahead
* * (?!subexp) - negative lookahead
* * (?>subexp) - atomic group
* * (?~subexp) - absence operator
* * (?<=subexp) - positive lookbehind
* * (?<!subexp) - negative lookbehind
* * (?<name>subexp) - named capturing group
* * (?'name'subexp) - named capturing group
* * (?(cond)yes-subexp) - conditional expression
* * (?(cond)yes-subexp|no-subexp) - conditional expression
* * (?imxdau-imx) - turn on and off configuration
* * (?imxdau-imx:subexp) - turn on and off configuration for an expression
*/
static bool
pm_regexp_parse_group(pm_regexp_parser_t *parser) {
// First, parse any options for the group.
@ -503,16 +539,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
return pm_regexp_char_expect(parser, ')');
}
// item : anchor
// | match-posix-class
// | match-char-set
// | match-char-class
// | match-char-prop
// | match-char
// | match-any
// | group
// | quantified
// ;
/**
* item : anchor
* | match-posix-class
* | match-char-set
* | match-char-class
* | match-char-prop
* | match-char
* | match-any
* | group
* | quantified
* ;
*/
static bool
pm_regexp_parse_item(pm_regexp_parser_t *parser) {
switch (*parser->cursor++) {
@ -533,8 +571,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
}
}
// expression : item+
// ;
/**
* expression : item+
* ;
*/
static bool
pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
if (!pm_regexp_parse_item(parser)) {
@ -550,10 +590,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
return true;
}
// pattern : EOF
// | expression EOF
// | expression '|' pattern
// ;
/**
* pattern : EOF
* | expression EOF
* | expression '|' pattern
* ;
*/
static bool
pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
return (
@ -572,8 +614,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
);
}
// Parse a regular expression and extract the names of all of the named capture
// groups.
/**
* Parse a regular expression and extract the names of all of the named capture
* groups.
*/
PRISM_EXPORTED_FUNCTION bool
pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
pm_regexp_parser_t parser;

Просмотреть файл

@ -12,8 +12,17 @@
#include <stddef.h>
#include <string.h>
// Parse a regular expression and extract the names of all of the named capture
// groups.
/**
* Parse a regular expression and extract the names of all of the named capture
* groups.
*
* @param source The source code to parse.
* @param size The size of the source code.
* @param named_captures The list to add the names of the named capture groups.
* @param encoding_changed Whether or not the encoding changed from the default.
* @param encoding The encoding of the source code.
* @return Whether or not the parsing was successful.
*/
PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding);
#endif