[ruby/prism] Documentation for diagnostics and regexp

https://github.com/ruby/prism/commit/16e0579044
2023-10-31 11:35:56 -04:00 · 2023-10-31 11:35:56 -04:00 · 1de05631b5
--- a/prism/diagnostic.c
+++ b/prism/diagnostic.c
@ -1,56 +1,55 @@
 #include "prism/diagnostic.h"

-/*
-  ## Message composition
-
-  When composing an error message, use sentence fragments.
-
-  Try describing the property of the code that caused the error, rather than the rule that is being
-  violated. It may help to use a fragment that completes a sentence beginning, "The parser
-  encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful
-  context) after a semicolon.
-
-  For example:, instead of "Control escape sequence cannot be doubled", prefer:
-
-  > "Invalid control escape sequence; control cannot be repeated"
-
-  In some cases, where the failure is more general or syntax expectations are violated, it may make
-  more sense to use a fragment that completes a sentence beginning, "The parser ...".
-
-  For example:
-
-  > "Expected an expression after `(`"
-  > "Cannot parse the expression"
-
-
-  ## Message style guide
-
-  - Use articles like "a", "an", and "the" when appropriate.
-    - e.g., prefer "Cannot parse the expression" to "Cannot parse expression".
-  - Use the common name for tokens and nodes.
-    - e.g., prefer "keyword splat" to "assoc splat"
-    - e.g., prefer "embedded document" to "embdoc"
-  - Capitalize the initial word of the message.
-  - Use back ticks around token literals
-    - e.g., "Expected a `=>` between the hash key and value"
-  - Do not use `.` or other punctuation at the end of the message.
-  - Do not use contractions like "can't". Prefer "cannot" to "can not".
-  - For tokens that can have multiple meanings, reference the token and its meaning.
-    - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument"
-
-
-  ## Error names (PM_ERR_*)
-
-  - When appropriate, prefer node name to token name.
-    - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing.
-  - Prefer token name to common name.
-    - e.g., prefer "STAR" to "ASTERISK".
-  - Try to order the words in the name from more general to more specific,
-    - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER".
-    - When in doubt, look for similar patterns and name them so that they are grouped when lexically
-      sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example.
-*/
-
+/**
+ * ## Message composition
+ *
+ * When composing an error message, use sentence fragments.
+ *
+ * Try describing the property of the code that caused the error, rather than the rule that is being
+ * violated. It may help to use a fragment that completes a sentence beginning, "The parser
+ * encountered (a) ...". If appropriate, add a description of the rule violation (or other helpful
+ * context) after a semicolon.
+ *
+ * For example, instead of "Control escape sequence cannot be doubled", prefer:
+ *
+ * > "Invalid control escape sequence; control cannot be repeated"
+ *
+ * In some cases, where the failure is more general or syntax expectations are violated, it may make
+ * more sense to use a fragment that completes a sentence beginning, "The parser ...".
+ *
+ * For example:
+ *
+ * > "Expected an expression after `(`"
+ * > "Cannot parse the expression"
+ *
+ *
+ * ## Message style guide
+ *
+ * - Use articles like "a", "an", and "the" when appropriate.
+ *   - e.g., prefer "Cannot parse the expression" to "Cannot parse expression".
+ * - Use the common name for tokens and nodes.
+ *   - e.g., prefer "keyword splat" to "assoc splat"
+ *   - e.g., prefer "embedded document" to "embdoc"
+ * - Capitalize the initial word of the message.
+ * - Use backticks around token literals.
+ *   - e.g., "Expected a `=>` between the hash key and value"
+ * - Do not use `.` or other punctuation at the end of the message.
+ * - Do not use contractions like "can't". Prefer "cannot" to "can not".
+ * - For tokens that can have multiple meanings, reference the token and its meaning.
+ *   - e.g., "`*` splat argument" is clearer and more complete than "splat argument" or "`*` argument"
+ *
+ *
+ * ## Error names (PM_ERR_*)
+ *
+ * - When appropriate, prefer node name to token name.
+ *   - e.g., prefer "SPLAT" to "STAR" in the context of argument parsing.
+ * - Prefer token name to common name.
+ *   - e.g., prefer "STAR" to "ASTERISK".
+ * - Try to order the words in the name from more general to more specific.
+ *   - e.g., "INVALID_NUMBER_DECIMAL" is better than "DECIMAL_INVALID_NUMBER".
+ *   - When in doubt, look for similar patterns and name them so that they are grouped when lexically
+ *     sorted. See PM_ERR_ARGUMENT_NO_FORWARDING_* for an example.
+ */
 static const char* const diagnostic_messages[PM_DIAGNOSTIC_ID_LEN] = {
    [PM_ERR_ALIAS_ARGUMENT]                     = "Invalid argument being passed to `alias`; expected a bare word, symbol, constant, or global variable",
    [PM_ERR_AMPAMPEQ_MULTI_ASSIGN]              = "Unexpected `&&=` in a multiple assignment",
@ -263,7 +262,9 @@ pm_diagnostic_message(pm_diagnostic_id_t diag_id) {
    return message;
 }

-// Append an error to the given list of diagnostic.
+/**
+ * Append a diagnostic to the given list of diagnostics.
+ */
 bool
 pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id) {
    pm_diagnostic_t *diagnostic = (pm_diagnostic_t *) calloc(sizeof(pm_diagnostic_t), 1);
@ -274,7 +275,9 @@ pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *
    return true;
 }

-// Deallocate the internal state of the given diagnostic list.
+/**
+ * Deallocate the internal state of the given diagnostic list.
+ */
 void
 pm_diagnostic_list_free(pm_list_t *list) {
    pm_list_node_t *node, *next;
--- a/prism/diagnostic.h
+++ b/prism/diagnostic.h
@ -20,6 +20,10 @@ typedef struct {
    const char *message;
 } pm_diagnostic_t;

+/**
+ * The diagnostic IDs of all of the diagnostics, used to communicate the types
+ * of errors between the parser and the user.
+ */
 typedef enum {
    PM_ERR_ALIAS_ARGUMENT,
    PM_ERR_AMPAMPEQ_MULTI_ASSIGN,
@ -223,14 +227,27 @@ typedef enum {
    PM_WARN_AMBIGUOUS_FIRST_ARGUMENT_PLUS,
    PM_WARN_AMBIGUOUS_PREFIX_STAR,
    PM_WARN_AMBIGUOUS_SLASH,
+
    /* This must be the last member. */
    PM_DIAGNOSTIC_ID_LEN,
 } pm_diagnostic_id_t;

-// Append a diagnostic to the given list of diagnostics.
+/**
+ * Append a diagnostic to the given list of diagnostics.
+ *
+ * @param list The list to append to.
+ * @param start The start of the diagnostic.
+ * @param end The end of the diagnostic.
+ * @param diag_id The diagnostic ID.
+ * @return Whether the diagnostic was successfully appended.
+ */
 bool pm_diagnostic_list_append(pm_list_t *list, const uint8_t *start, const uint8_t *end, pm_diagnostic_id_t diag_id);

-// Deallocate the internal state of the given diagnostic list.
+/**
+ * Deallocate the internal state of the given diagnostic list.
+ *
+ * @param list The list to deallocate.
+ */
 void pm_diagnostic_list_free(pm_list_t *list);

 #endif
--- a/prism/regexp.c
+++ b/prism/regexp.c
@ -1,6 +1,8 @@
 #include "prism/regexp.h"

-// This is the parser that is going to handle parsing regular expressions.
+/**
+ * This is the parser that is going to handle parsing regular expressions.
+ */
 typedef struct {
    const uint8_t *start;
    const uint8_t *cursor;
@ -10,7 +12,9 @@ typedef struct {
    pm_encoding_t *encoding;
 } pm_regexp_parser_t;

-// This initializes a new parser with the given source.
+/**
+ * This initializes a new parser with the given source.
+ */
 static void
 pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
    *parser = (pm_regexp_parser_t) {
@ -23,7 +27,9 @@ pm_regexp_parser_init(pm_regexp_parser_t *parser, const uint8_t *start, const ui
    };
 }

-// This appends a new string to the list of named captures.
+/**
+ * This appends a new string to the list of named captures.
+ */
 static void
 pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start, const uint8_t *end) {
    pm_string_t string;
@ -32,13 +38,17 @@ pm_regexp_parser_named_capture(pm_regexp_parser_t *parser, const uint8_t *start,
    pm_string_free(&string);
 }

-// Returns true if the next character is the end of the source.
+/**
+ * Returns true if the next character is the end of the source.
+ */
 static inline bool
 pm_regexp_char_is_eof(pm_regexp_parser_t *parser) {
    return parser->cursor >= parser->end;
 }

-// Optionally accept a char and consume it if it exists.
+/**
+ * Optionally accept a char and consume it if it exists.
+ */
 static inline bool
 pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
    if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@ -48,7 +58,9 @@ pm_regexp_char_accept(pm_regexp_parser_t *parser, uint8_t value) {
    return false;
 }

-// Expect a character to be present and consume it.
+/**
+ * Expect a character to be present and consume it.
+ */
 static inline bool
 pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
    if (!pm_regexp_char_is_eof(parser) && *parser->cursor == value) {
@ -58,7 +70,9 @@ pm_regexp_char_expect(pm_regexp_parser_t *parser, uint8_t value) {
    return false;
 }

-// This advances the current token to the next instance of the given character.
+/**
+ * This advances the current token to the next instance of the given character.
+ */
 static bool
 pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
    if (pm_regexp_char_is_eof(parser)) {
@ -74,37 +88,39 @@ pm_regexp_char_find(pm_regexp_parser_t *parser, uint8_t value) {
    return true;
 }

-// Range quantifiers are a special class of quantifiers that look like
-//
-// * {digit}
-// * {digit,}
-// * {digit,digit}
-// * {,digit}
-//
-// Unfortunately, if there are any spaces in between, then this just becomes a
-// regular character match expression and we have to backtrack. So when this
-// function first starts running, we'll create a "save" point and then attempt
-// to parse the quantifier. If it fails, we'll restore the save point and
-// return.
-//
-// The properly track everything, we're going to build a little state machine.
-// It looks something like the following:
-//
-//                  ┌───────┐                 ┌─────────┐ ────────────┐
-// ──── lbrace ───> │ start │ ──── digit ───> │ minimum │             │
-//                  └───────┘                 └─────────┘ <─── digit ─┘
-//                      │                       │    │
-//   ┌───────┐          │                       │  rbrace
-//   │ comma │ <───── comma  ┌──── comma ───────┘    │
-//   └───────┘               V                       V
-//      │             ┌─────────┐               ┌─────────┐
-//      └── digit ──> │ maximum │ ── rbrace ──> │| final |│
-//                    └─────────┘               └─────────┘
-//                    │         ^
-//                    └─ digit ─┘
-//
-// Note that by the time we've hit this function, the lbrace has already been
-// consumed so we're in the start state.
+/**
+ * Range quantifiers are a special class of quantifiers that look like
+ *
+ * * {digit}
+ * * {digit,}
+ * * {digit,digit}
+ * * {,digit}
+ *
+ * Unfortunately, if there are any spaces in between, then this just becomes a
+ * regular character match expression and we have to backtrack. So when this
+ * function first starts running, we'll create a "save" point and then attempt
+ * to parse the quantifier. If it fails, we'll restore the save point and
+ * return.
+ *
+ * To properly track everything, we're going to build a little state machine.
+ * It looks something like the following:
+ *
+ *                  ┌───────┐                 ┌─────────┐ ────────────┐
+ * ──── lbrace ───> │ start │ ──── digit ───> │ minimum │             │
+ *                  └───────┘                 └─────────┘ <─── digit ─┘
+ *                      │                       │    │
+ *   ┌───────┐          │                       │  rbrace
+ *   │ comma │ <───── comma  ┌──── comma ───────┘    │
+ *   └───────┘               V                       V
+ *      │             ┌─────────┐               ┌─────────┐
+ *      └── digit ──> │ maximum │ ── rbrace ──> │| final |│
+ *                    └─────────┘               └─────────┘
+ *                    │         ^
+ *                    └─ digit ─┘
+ *
+ * Note that by the time we've hit this function, the lbrace has already been
+ * consumed so we're in the start state.
+ */
 static bool
 pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
    const uint8_t *savepoint = parser->cursor;
@ -180,12 +196,14 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
    return true;
 }

-// quantifier : star-quantifier
-//            | plus-quantifier
-//            | optional-quantifier
-//            | range-quantifier
-//            | <empty>
-//            ;
+/**
+ * quantifier : star-quantifier
+ *            | plus-quantifier
+ *            | optional-quantifier
+ *            | range-quantifier
+ *            | <empty>
+ *            ;
+ */
 static bool
 pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
    if (pm_regexp_char_is_eof(parser)) return true;
@ -205,8 +223,10 @@ pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
    }
 }

-// match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
-//                   ;
+/**
+ * match-posix-class : '[' '[' ':' '^'? CHAR+ ':' ']' ']'
+ *                   ;
+ */
 static bool
 pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
    if (!pm_regexp_char_expect(parser, ':')) {
@ -226,8 +246,10 @@ pm_regexp_parse_posix_class(pm_regexp_parser_t *parser) {
 static bool
 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser);

-// match-char-set : '[' '^'? (match-range | match-char)* ']'
-//                ;
+/**
+ * match-char-set : '[' '^'? (match-range | match-char)* ']'
+ *                ;
+ */
 static bool
 pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
    pm_regexp_char_accept(parser, '^');
@ -251,7 +273,9 @@ pm_regexp_parse_character_set(pm_regexp_parser_t *parser) {
    return pm_regexp_char_expect(parser, ']');
 }

-// A left bracket can either mean a POSIX class or a character set.
+/**
+ * A left bracket can either mean a POSIX class or a character set.
+ */
 static bool
 pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
    const uint8_t *reset = parser->cursor;
@ -271,8 +295,10 @@ pm_regexp_parse_lbracket(pm_regexp_parser_t *parser) {
 static bool
 pm_regexp_parse_expression(pm_regexp_parser_t *parser);

-// These are the states of the options that are configurable on the regular
-// expression (or from within a group).
+/**
+ * These are the states of the options that are configurable on the regular
+ * expression (or from within a group).
+ */
 typedef enum {
    PM_REGEXP_OPTION_STATE_INVALID,
    PM_REGEXP_OPTION_STATE_TOGGLEABLE,
@ -283,16 +309,21 @@ typedef enum {

 // These are the options that are configurable on the regular expression (or
 // from within a group).
+
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM 'a'
 #define PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM 'x'
 #define PRISM_REGEXP_OPTION_STATE_SLOTS (PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM + 1)

-// This is the set of options that are configurable on the regular expression.
+/**
+ * This is the set of options that are configurable on the regular expression.
+ */
 typedef struct {
    uint8_t values[PRISM_REGEXP_OPTION_STATE_SLOTS];
 } pm_regexp_options_t;

-// Initialize a new set of options to their default values.
+/**
+ * Initialize a new set of options to their default values.
+ */
 static void
 pm_regexp_options_init(pm_regexp_options_t *options) {
    memset(options, PM_REGEXP_OPTION_STATE_INVALID, sizeof(uint8_t) * PRISM_REGEXP_OPTION_STATE_SLOTS);
@ -304,8 +335,10 @@ pm_regexp_options_init(pm_regexp_options_t *options) {
    options->values['u' - PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM] = PM_REGEXP_OPTION_STATE_ADDABLE;
 }

-// Attempt to add the given option to the set of options. Returns true if it was
-// added, false if it was already present.
+/**
+ * Attempt to add the given option to the set of options. Returns true if it was
+ * added, false if it was already present.
+ */
 static bool
 pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
    if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@ -327,8 +360,10 @@ pm_regexp_options_add(pm_regexp_options_t *options, uint8_t key) {
    return false;
 }

-// Attempt to remove the given option from the set of options. Returns true if
-// it was removed, false if it was already absent.
+/**
+ * Attempt to remove the given option from the set of options. Returns true if
+ * it was removed, false if it was already absent.
+ */
 static bool
 pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
    if (key >= PRISM_REGEXP_OPTION_STATE_SLOT_MINIMUM && key <= PRISM_REGEXP_OPTION_STATE_SLOT_MAXIMUM) {
@ -349,26 +384,27 @@ pm_regexp_options_remove(pm_regexp_options_t *options, uint8_t key) {
    return false;
 }

-// Groups can have quite a few different patterns for syntax. They basically
-// just wrap a set of expressions, but they can potentially have options after a
-// question mark. If there _isn't_ a question mark, then it's just a set of
-// expressions. If there _is_, then here are the options:
-//
-// * (?#...)                       - inline comments
-// * (?:subexp)                    - non-capturing group
-// * (?=subexp)                    - positive lookahead
-// * (?!subexp)                    - negative lookahead
-// * (?>subexp)                    - atomic group
-// * (?~subexp)                    - absence operator
-// * (?<=subexp)                   - positive lookbehind
-// * (?<!subexp)                   - negative lookbehind
-// * (?<name>subexp)               - named capturing group
-// * (?'name'subexp)               - named capturing group
-// * (?(cond)yes-subexp)           - conditional expression
-// * (?(cond)yes-subexp|no-subexp) - conditional expression
-// * (?imxdau-imx)                 - turn on and off configuration
-// * (?imxdau-imx:subexp)          - turn on and off configuration for an expression
-//
+/**
+ * Groups can have quite a few different patterns for syntax. They basically
+ * just wrap a set of expressions, but they can potentially have options after a
+ * question mark. If there _isn't_ a question mark, then it's just a set of
+ * expressions. If there _is_, then here are the options:
+ *
+ * * (?#...)                       - inline comments
+ * * (?:subexp)                    - non-capturing group
+ * * (?=subexp)                    - positive lookahead
+ * * (?!subexp)                    - negative lookahead
+ * * (?>subexp)                    - atomic group
+ * * (?~subexp)                    - absence operator
+ * * (?<=subexp)                   - positive lookbehind
+ * * (?<!subexp)                   - negative lookbehind
+ * * (?<name>subexp)               - named capturing group
+ * * (?'name'subexp)               - named capturing group
+ * * (?(cond)yes-subexp)           - conditional expression
+ * * (?(cond)yes-subexp|no-subexp) - conditional expression
+ * * (?imxdau-imx)                 - turn on and off configuration
+ * * (?imxdau-imx:subexp)          - turn on and off configuration for an expression
+ */
 static bool
 pm_regexp_parse_group(pm_regexp_parser_t *parser) {
    // First, parse any options for the group.
@ -503,16 +539,18 @@ pm_regexp_parse_group(pm_regexp_parser_t *parser) {
    return pm_regexp_char_expect(parser, ')');
 }

-// item : anchor
-//      | match-posix-class
-//      | match-char-set
-//      | match-char-class
-//      | match-char-prop
-//      | match-char
-//      | match-any
-//      | group
-//      | quantified
-//      ;
+/**
+ * item : anchor
+ *      | match-posix-class
+ *      | match-char-set
+ *      | match-char-class
+ *      | match-char-prop
+ *      | match-char
+ *      | match-any
+ *      | group
+ *      | quantified
+ *      ;
+ */
 static bool
 pm_regexp_parse_item(pm_regexp_parser_t *parser) {
    switch (*parser->cursor++) {
@ -533,8 +571,10 @@ pm_regexp_parse_item(pm_regexp_parser_t *parser) {
    }
 }

-// expression : item+
-//            ;
+/**
+ * expression : item+
+ *            ;
+ */
 static bool
 pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
    if (!pm_regexp_parse_item(parser)) {
@ -550,10 +590,12 @@ pm_regexp_parse_expression(pm_regexp_parser_t *parser) {
    return true;
 }

-// pattern : EOF
-//         | expression EOF
-//         | expression '|' pattern
-//         ;
+/**
+ * pattern : EOF
+ *         | expression EOF
+ *         | expression '|' pattern
+ *         ;
+ */
 static bool
 pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
    return (
@ -572,8 +614,10 @@ pm_regexp_parse_pattern(pm_regexp_parser_t *parser) {
    );
 }

-// Parse a regular expression and extract the names of all of the named capture
-// groups.
+/**
+ * Parse a regular expression and extract the names of all of the named capture
+ * groups.
+ */
 PRISM_EXPORTED_FUNCTION bool
 pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding) {
    pm_regexp_parser_t parser;
--- a/prism/regexp.h
+++ b/prism/regexp.h
@ -12,8 +12,17 @@
 #include <stddef.h>
 #include <string.h>

-// Parse a regular expression and extract the names of all of the named capture
-// groups.
+/**
+ * Parse a regular expression and extract the names of all of the named capture
+ * groups.
+ *
+ * @param source The source code to parse.
+ * @param size The size of the source code.
+ * @param named_captures The list to add the names of the named capture groups to.
+ * @param encoding_changed Whether or not the encoding changed from the default.
+ * @param encoding The encoding of the source code.
+ * @return Whether or not the parsing was successful.
+ */
 PRISM_EXPORTED_FUNCTION bool pm_regexp_named_capture_group_names(const uint8_t *source, size_t size, pm_string_list_t *named_captures, bool encoding_changed, pm_encoding_t *encoding);

 #endif