2023-10-31 20:26:31 +03:00
|
|
|
/**
 * @file parser.h
 *
 * The parser used to parse Ruby source.
 */
|
2023-09-27 19:24:48 +03:00
|
|
|
#ifndef PRISM_PARSER_H
|
|
|
|
#define PRISM_PARSER_H
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-09-27 19:24:48 +03:00
|
|
|
#include "prism/ast.h"
|
|
|
|
#include "prism/defines.h"
|
2023-11-30 19:36:10 +03:00
|
|
|
#include "prism/encoding.h"
|
2024-01-02 19:18:29 +03:00
|
|
|
#include "prism/options.h"
|
2023-09-27 19:24:48 +03:00
|
|
|
#include "prism/util/pm_constant_pool.h"
|
|
|
|
#include "prism/util/pm_list.h"
|
|
|
|
#include "prism/util/pm_newline_list.h"
|
|
|
|
#include "prism/util/pm_state_stack.h"
|
2023-10-07 05:00:01 +03:00
|
|
|
#include "prism/util/pm_string.h"
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-06-30 21:30:24 +03:00
|
|
|
#include <stdbool.h>
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
 * This enum provides various bits that represent different kinds of states that
 * the lexer can track. This is used to determine which kind of token to return
 * based on the context of the parser.
 *
 * Each enumerator is a bit position (starting at 0 and increasing by one), not
 * a mask. The corresponding masks are the values of pm_lex_state_t, which are
 * built as `1 << PM_LEX_STATE_BIT_*`.
 */
typedef enum {
    PM_LEX_STATE_BIT_BEG,
    PM_LEX_STATE_BIT_END,
    PM_LEX_STATE_BIT_ENDARG,
    PM_LEX_STATE_BIT_ENDFN,
    PM_LEX_STATE_BIT_ARG,
    PM_LEX_STATE_BIT_CMDARG,
    PM_LEX_STATE_BIT_MID,
    PM_LEX_STATE_BIT_FNAME,
    PM_LEX_STATE_BIT_DOT,
    PM_LEX_STATE_BIT_CLASS,
    PM_LEX_STATE_BIT_LABEL,
    PM_LEX_STATE_BIT_LABELED,
    PM_LEX_STATE_BIT_FITEM
} pm_lex_state_bit_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This enum combines the various bits from the above enum into individual
|
|
|
|
* values that represent the various states of the lexer.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
typedef enum {
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_STATE_NONE = 0,
|
|
|
|
PM_LEX_STATE_BEG = (1 << PM_LEX_STATE_BIT_BEG),
|
|
|
|
PM_LEX_STATE_END = (1 << PM_LEX_STATE_BIT_END),
|
|
|
|
PM_LEX_STATE_ENDARG = (1 << PM_LEX_STATE_BIT_ENDARG),
|
|
|
|
PM_LEX_STATE_ENDFN = (1 << PM_LEX_STATE_BIT_ENDFN),
|
|
|
|
PM_LEX_STATE_ARG = (1 << PM_LEX_STATE_BIT_ARG),
|
|
|
|
PM_LEX_STATE_CMDARG = (1 << PM_LEX_STATE_BIT_CMDARG),
|
|
|
|
PM_LEX_STATE_MID = (1 << PM_LEX_STATE_BIT_MID),
|
|
|
|
PM_LEX_STATE_FNAME = (1 << PM_LEX_STATE_BIT_FNAME),
|
|
|
|
PM_LEX_STATE_DOT = (1 << PM_LEX_STATE_BIT_DOT),
|
|
|
|
PM_LEX_STATE_CLASS = (1 << PM_LEX_STATE_BIT_CLASS),
|
|
|
|
PM_LEX_STATE_LABEL = (1 << PM_LEX_STATE_BIT_LABEL),
|
|
|
|
PM_LEX_STATE_LABELED = (1 << PM_LEX_STATE_BIT_LABELED),
|
|
|
|
PM_LEX_STATE_FITEM = (1 << PM_LEX_STATE_BIT_FITEM),
|
|
|
|
PM_LEX_STATE_BEG_ANY = PM_LEX_STATE_BEG | PM_LEX_STATE_MID | PM_LEX_STATE_CLASS,
|
|
|
|
PM_LEX_STATE_ARG_ANY = PM_LEX_STATE_ARG | PM_LEX_STATE_CMDARG,
|
|
|
|
PM_LEX_STATE_END_ANY = PM_LEX_STATE_END | PM_LEX_STATE_ENDARG | PM_LEX_STATE_ENDFN
|
|
|
|
} pm_lex_state_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
 * The type of quote that a heredoc uses.
 *
 * Apart from PM_HEREDOC_QUOTE_NONE (which is 0), each value is the character
 * code of the quote character itself.
 */
typedef enum {
    /** No quote character. */
    PM_HEREDOC_QUOTE_NONE,

    /** A single quote ('). */
    PM_HEREDOC_QUOTE_SINGLE = '\'',

    /** A double quote ("). */
    PM_HEREDOC_QUOTE_DOUBLE = '"',

    /** A backtick (`). */
    PM_HEREDOC_QUOTE_BACKTICK = '`',
} pm_heredoc_quote_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
 * The type of indentation that a heredoc uses.
 */
typedef enum {
    /** No indentation marker. */
    PM_HEREDOC_INDENT_NONE,

    /** The dash form (presumably the `<<-` heredoc opener; confirm in the lexer). */
    PM_HEREDOC_INDENT_DASH,

    /** The tilde form (presumably the `<<~` heredoc opener; confirm in the lexer). */
    PM_HEREDOC_INDENT_TILDE,
} pm_heredoc_indent_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When lexing Ruby source, the lexer has a small amount of state to tell which
|
|
|
|
* kind of token it is currently lexing. For example, when we find the start of
|
|
|
|
* a string, the first token that we return is a TOKEN_STRING_BEGIN token. After
|
|
|
|
* that the lexer is now in the PM_LEX_STRING mode, and will return tokens that
|
|
|
|
* are found as part of a string.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
typedef struct pm_lex_mode {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The type of this lex mode. */
|
2023-06-20 18:53:02 +03:00
|
|
|
enum {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This state is used when any given token is being lexed. */
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_DEFAULT,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This state is used when we're lexing as normal but inside an embedded
|
|
|
|
* expression of a string.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_EMBEXPR,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This state is used when we're lexing a variable that is embedded
|
|
|
|
* directly inside of a string with the # shorthand.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_EMBVAR,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This state is used when you are inside the content of a heredoc. */
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_HEREDOC,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This state is used when we are lexing a list of tokens, as in a %w
|
|
|
|
* word list literal or a %i symbol list literal.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_LIST,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This state is used when a regular expression has been begun and we
|
|
|
|
* are looking for the terminator.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_REGEXP,
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This state is used when we are lexing a string or a string-like
|
|
|
|
* token, as in string content with either quote or an xstring.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
PM_LEX_STRING
|
2023-06-20 18:53:02 +03:00
|
|
|
} mode;
|
|
|
|
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The data associated with this type of lex mode. */
|
2023-06-20 18:53:02 +03:00
|
|
|
union {
|
|
|
|
struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This keeps track of the nesting level of the list. */
|
2023-06-20 18:53:02 +03:00
|
|
|
size_t nesting;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** Whether or not interpolation is allowed in this list. */
|
2023-06-20 18:53:02 +03:00
|
|
|
bool interpolation;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When lexing a list, it takes into account balancing the
|
|
|
|
* terminator if the terminator is one of (), [], {}, or <>.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t incrementor;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This is the terminator of the list literal. */
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t terminator;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the character set that should be used to delimit the
|
|
|
|
* tokens within the list.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t breakpoints[11];
|
2023-06-20 18:53:02 +03:00
|
|
|
} list;
|
|
|
|
|
|
|
|
struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This keeps track of the nesting level of the regular expression.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
size_t nesting;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When lexing a regular expression, it takes into account balancing
|
|
|
|
* the terminator if the terminator is one of (), [], {}, or <>.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t incrementor;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This is the terminator of the regular expression. */
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t terminator;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the character set that should be used to delimit the
|
|
|
|
* tokens within the regular expression.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t breakpoints[6];
|
2023-06-20 18:53:02 +03:00
|
|
|
} regexp;
|
|
|
|
|
|
|
|
struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This keeps track of the nesting level of the string. */
|
2023-06-20 18:53:02 +03:00
|
|
|
size_t nesting;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** Whether or not interpolation is allowed in this string. */
|
2023-06-20 18:53:02 +03:00
|
|
|
bool interpolation;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* Whether or not at the end of the string we should allow a :,
|
|
|
|
* which would indicate this was a dynamic symbol instead of a
|
|
|
|
* string.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
bool label_allowed;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When lexing a string, it takes into account balancing the
|
|
|
|
* terminator if the terminator is one of (), [], {}, or <>.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t incrementor;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the terminator of the string. It is typically either a
|
|
|
|
* single or double quote.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t terminator;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the character set that should be used to delimit the
|
|
|
|
* tokens within the string.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
uint8_t breakpoints[6];
|
2023-06-20 18:53:02 +03:00
|
|
|
} string;
|
|
|
|
|
|
|
|
struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** A pointer to the start of the heredoc identifier. */
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *ident_start;
|
2023-10-31 19:54:54 +03:00
|
|
|
|
|
|
|
/** The length of the heredoc identifier. */
|
2023-06-20 18:53:02 +03:00
|
|
|
size_t ident_length;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The type of quote that the heredoc uses. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_heredoc_quote_t quote;
|
2023-10-31 19:54:54 +03:00
|
|
|
|
|
|
|
/** The type of indentation that the heredoc uses. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_heredoc_indent_t indent;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the pointer to the character where lexing should resume
|
|
|
|
* once the heredoc has been completely processed.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *next_start;
|
2023-10-11 19:39:41 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is used to track the amount of common whitespace on each
|
|
|
|
* line so that we know how much to dedent each line in the case of
|
|
|
|
* a tilde heredoc.
|
|
|
|
*/
|
2023-10-11 19:39:41 +03:00
|
|
|
size_t common_whitespace;
|
2023-06-20 18:53:02 +03:00
|
|
|
} heredoc;
|
|
|
|
} as;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The previous lex state so that it knows how to pop. */
|
2023-09-27 19:24:48 +03:00
|
|
|
struct pm_lex_mode *prev;
|
|
|
|
} pm_lex_mode_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
 * We pre-allocate a certain number of lex states in order to avoid having to
 * call malloc too many times while parsing. You really shouldn't need more than
 * this because you only really nest deeply when doing string interpolation.
 */
#define PM_LEX_STACK_SIZE 4
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 20:26:31 +03:00
|
|
|
/**
 * The parser used to parse Ruby source.
 *
 * This is a forward declaration so that the callback and helper types declared
 * before the full `struct pm_parser` definition can refer to the parser by
 * pointer.
 */
typedef struct pm_parser pm_parser_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
 * While parsing, we keep track of a stack of contexts. This is helpful for
 * error recovery so that we can pop back to a previous context when we hit a
 * token that is understood by a parent context but not by the current context.
 */
typedef enum {
    /** a null context, used for returning a value from a function */
    PM_CONTEXT_NONE = 0,

    /** a begin statement */
    PM_CONTEXT_BEGIN,

    /** expressions in block arguments using braces */
    PM_CONTEXT_BLOCK_BRACES,

    /** expressions in block arguments using do..end */
    PM_CONTEXT_BLOCK_KEYWORDS,

    /** a case when statement */
    PM_CONTEXT_CASE_WHEN,

    /** a case in statement */
    PM_CONTEXT_CASE_IN,

    /** a class declaration */
    PM_CONTEXT_CLASS,

    /** a method definition */
    PM_CONTEXT_DEF,

    /** a method definition's parameters */
    PM_CONTEXT_DEF_PARAMS,

    /** a method definition's default parameter */
    PM_CONTEXT_DEFAULT_PARAMS,

    /** an else clause */
    PM_CONTEXT_ELSE,

    /** an elsif clause */
    PM_CONTEXT_ELSIF,

    /** an interpolated expression */
    PM_CONTEXT_EMBEXPR,

    /** an ensure statement */
    PM_CONTEXT_ENSURE,

    /** an ensure statement within a method definition */
    PM_CONTEXT_ENSURE_DEF,

    /** a for loop */
    PM_CONTEXT_FOR,

    /** a for loop's index */
    PM_CONTEXT_FOR_INDEX,

    /** an if statement */
    PM_CONTEXT_IF,

    /** a lambda expression with braces */
    PM_CONTEXT_LAMBDA_BRACES,

    /** a lambda expression with do..end */
    PM_CONTEXT_LAMBDA_DO_END,

    /** the top level context */
    PM_CONTEXT_MAIN,

    /** a module declaration */
    PM_CONTEXT_MODULE,

    /** a parenthesized expression */
    PM_CONTEXT_PARENS,

    /** an END block */
    PM_CONTEXT_POSTEXE,

    /** a predicate inside an if/elsif/unless statement */
    PM_CONTEXT_PREDICATE,

    /** a BEGIN block */
    PM_CONTEXT_PREEXE,

    /** a rescue else statement */
    PM_CONTEXT_RESCUE_ELSE,

    /** a rescue else statement within a method definition */
    PM_CONTEXT_RESCUE_ELSE_DEF,

    /** a rescue statement */
    PM_CONTEXT_RESCUE,

    /** a rescue statement within a method definition */
    PM_CONTEXT_RESCUE_DEF,

    /** a singleton class definition */
    PM_CONTEXT_SCLASS,

    /** an unless statement */
    PM_CONTEXT_UNLESS,

    /** an until statement */
    PM_CONTEXT_UNTIL,

    /** a while statement */
    PM_CONTEXT_WHILE,
} pm_context_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This is a node in a linked list of contexts. */
|
2023-09-27 19:24:48 +03:00
|
|
|
typedef struct pm_context_node {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The context that this node represents. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_context_t context;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A pointer to the previous context in the linked list. */
|
2023-09-27 19:24:48 +03:00
|
|
|
struct pm_context_node *prev;
|
|
|
|
} pm_context_node_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This is the type of a comment that we've found while parsing. */
typedef enum {
    /** An inline comment (presumably one introduced by `#`; confirm in the lexer). */
    PM_COMMENT_INLINE,

    /** An embedded document comment (presumably `=begin`/`=end`; confirm in the lexer). */
    PM_COMMENT_EMBDOC
} pm_comment_type_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 04:58:48 +03:00
|
|
|
/**
|
|
|
|
* This is a node in the linked list of comments that we've found while parsing.
|
|
|
|
*
|
|
|
|
* @extends pm_list_node_t
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
typedef struct pm_comment {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The embedded base node. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_list_node_t node;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
2023-12-02 00:00:56 +03:00
|
|
|
/** The location of the comment in the source. */
|
|
|
|
pm_location_t location;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** The type of comment that we've found. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_comment_type_t type;
|
|
|
|
} pm_comment_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 04:58:48 +03:00
|
|
|
/**
|
|
|
|
* This is a node in the linked list of magic comments that we've found while
|
|
|
|
* parsing.
|
|
|
|
*
|
|
|
|
* @extends pm_list_node_t
|
|
|
|
*/
|
2023-10-13 19:16:11 +03:00
|
|
|
typedef struct {
|
2023-10-31 20:26:31 +03:00
|
|
|
/** The embedded base node. */
|
2023-10-13 19:16:11 +03:00
|
|
|
pm_list_node_t node;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A pointer to the start of the key in the source. */
|
2023-10-13 19:16:11 +03:00
|
|
|
const uint8_t *key_start;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** A pointer to the start of the value in the source. */
|
2023-10-13 19:16:11 +03:00
|
|
|
const uint8_t *value_start;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** The length of the key in the source. */
|
2023-10-13 19:16:11 +03:00
|
|
|
uint32_t key_length;
|
2023-10-31 20:26:31 +03:00
|
|
|
|
|
|
|
/** The length of the value in the source. */
|
2023-10-13 19:16:11 +03:00
|
|
|
uint32_t value_length;
|
|
|
|
} pm_magic_comment_t;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When the encoding that is being used to parse the source is changed by prism,
|
|
|
|
* we provide the ability here to call out to a user-defined function.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
typedef void (*pm_encoding_changed_callback_t)(pm_parser_t *parser);
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When you are lexing through a file, the lexer needs all of the information
|
|
|
|
* that the parser additionally provides (for example, the local table). So if
|
|
|
|
* you want to properly lex Ruby, you need to actually lex it in the context of
|
|
|
|
* the parser. In order to provide this functionality, we optionally allow a
|
|
|
|
* struct to be attached to the parser that calls back out to a user-provided
|
|
|
|
* callback when each token is lexed.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
typedef struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This opaque pointer is used to provide whatever information the user
|
|
|
|
* deemed necessary to the callback. In our case we use it to pass the array
|
|
|
|
* that the tokens get appended into.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
void *data;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the callback that is called when a token is lexed. It is passed
|
|
|
|
* the opaque data pointer, the parser, and the token that was lexed.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
void (*callback)(void *data, pm_parser_t *parser, pm_token_t *token);
|
|
|
|
} pm_lex_callback_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This struct represents a node in a linked list of scopes. Some scopes can see
|
|
|
|
* into their parent scopes, while others cannot.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
typedef struct pm_scope {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The IDs of the locals in the given scope. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_constant_id_list_t locals;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** A pointer to the previous scope in the linked list. */
|
2023-09-27 19:24:48 +03:00
|
|
|
struct pm_scope *previous;
|
2023-09-13 17:23:56 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* A boolean indicating whether or not this scope can see into its parent.
|
|
|
|
* If closed is true, then the scope cannot see into its parent.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
bool closed;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* A boolean indicating whether or not this scope has explicit parameters.
|
|
|
|
* This is necessary to determine whether or not numbered parameters are
|
|
|
|
* allowed.
|
|
|
|
*/
|
2023-09-13 17:23:56 +03:00
|
|
|
bool explicit_params;
|
2023-09-13 18:39:09 +03:00
|
|
|
|
2024-01-26 02:08:44 +03:00
|
|
|
/**
|
|
|
|
* Booleans indicating whether the parameters for this scope have declared
|
|
|
|
* forwarding parameters.
|
|
|
|
*
|
|
|
|
* For example, some combinations of:
|
|
|
|
* def foo(*); end
|
|
|
|
* def foo(**); end
|
|
|
|
* def foo(&); end
|
|
|
|
* def foo(...); end
|
|
|
|
*/
|
|
|
|
|
|
|
|
uint8_t forwarding_params;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
2023-11-28 23:42:33 +03:00
|
|
|
* An integer indicating the number of numbered parameters on this scope.
|
2023-10-31 19:54:54 +03:00
|
|
|
* This is necessary to determine if child blocks are allowed to use
|
2023-11-28 23:42:33 +03:00
|
|
|
* numbered parameters, and to pass information to consumers of the AST
|
|
|
|
* about how many numbered parameters exist.
|
2023-10-31 19:54:54 +03:00
|
|
|
*/
|
2023-12-01 04:47:08 +03:00
|
|
|
uint8_t numbered_parameters;
|
2023-09-27 19:24:48 +03:00
|
|
|
} pm_scope_t;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2024-01-26 02:08:44 +03:00
|
|
|
/*
 * Single-bit flags describing declared forwarding parameters. They are
 * presumably meant to be OR-ed together into pm_scope_t's forwarding_params
 * field, whose documentation lists the `*`, `**`, `&` and `...` parameter
 * forms; the mapping noted on each flag is inferred from its name.
 */

/** Flag for positional forwarding (presumably `def foo(*)`). */
static const uint8_t PM_FORWARDING_POSITIONALS = 0x1;

/** Flag for keyword forwarding (presumably `def foo(**)`). */
static const uint8_t PM_FORWARDING_KEYWORDS = 0x2;

/** Flag for block forwarding (presumably `def foo(&)`). */
static const uint8_t PM_FORWARDING_BLOCK = 0x4;

/** Flag for forwarding everything (presumably `def foo(...)`). */
static const uint8_t PM_FORWARDING_ALL = 0x8;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This struct represents the overall parser. It contains a reference to the
|
|
|
|
* source file, as well as pointers that indicate where in the source it's
|
|
|
|
* currently parsing. It also contains the most recent and current token that
|
|
|
|
* it's considering.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
struct pm_parser {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The current state of the lexer. */
|
|
|
|
pm_lex_state_t lex_state;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** Tracks the current nesting of (), [], and {}. */
|
|
|
|
int enclosure_nesting;
|
|
|
|
|
|
|
|
/**
|
|
|
|
* Used to temporarily track the nesting of enclosures to determine if a {
|
|
|
|
* is the beginning of a lambda following the parameters of a lambda.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
int lambda_enclosure_nesting;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* Used to track the nesting of braces to ensure we get the correct value
|
|
|
|
* when we are interpolating blocks with braces.
|
|
|
|
*/
|
2023-06-20 18:53:02 +03:00
|
|
|
int brace_nesting;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* The stack used to determine if a do keyword belongs to the predicate of a
|
|
|
|
* while, until, or for loop.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_state_stack_t do_loop_stack;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* The stack used to determine if a do keyword belongs to the beginning of a
|
|
|
|
* block.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_state_stack_t accepts_block_stack;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 20:26:31 +03:00
|
|
|
/** A stack of lex modes. */
|
2023-06-20 18:53:02 +03:00
|
|
|
struct {
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The current mode of the lexer. */
|
|
|
|
pm_lex_mode_t *current;
|
|
|
|
|
|
|
|
/** The stack of lexer modes. */
|
|
|
|
pm_lex_mode_t stack[PM_LEX_STACK_SIZE];
|
|
|
|
|
|
|
|
/** The current index into the lexer mode stack. */
|
|
|
|
size_t index;
|
2023-06-20 18:53:02 +03:00
|
|
|
} lex_modes;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The pointer to the start of the source. */
|
|
|
|
const uint8_t *start;
|
|
|
|
|
|
|
|
/** The pointer to the end of the source. */
|
|
|
|
const uint8_t *end;
|
|
|
|
|
|
|
|
/** The previous token we were considering. */
|
|
|
|
pm_token_t previous;
|
|
|
|
|
|
|
|
/** The current token we're considering. */
|
|
|
|
pm_token_t current;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is a special field set on the parser when we need the parser to jump
|
|
|
|
* to a specific location when lexing the next token, as opposed to just
|
|
|
|
* using the end of the previous token. Normally this is NULL.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *next_start;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This field indicates the end of a heredoc whose identifier was found on
|
|
|
|
* the current line. If another heredoc is found on the same line, then this
|
|
|
|
* will be moved forward to the end of that heredoc. If no heredocs are
|
|
|
|
* found on a line then this is NULL.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *heredoc_end;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The list of comments that have been found while parsing. */
|
|
|
|
pm_list_t comment_list;
|
|
|
|
|
|
|
|
/** The list of magic comments that have been found while parsing. */
|
|
|
|
pm_list_t magic_comment_list;
|
|
|
|
|
2024-01-09 22:02:17 +03:00
|
|
|
/**
|
|
|
|
* An optional location that represents the location of the __END__ marker
|
|
|
|
* and the rest of the content of the file. This content is loaded into the
|
|
|
|
* DATA constant when the file being parsed is the main file being executed.
|
|
|
|
*/
|
2023-11-27 22:17:02 +03:00
|
|
|
pm_location_t data_loc;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The list of warnings that have been found while parsing. */
|
|
|
|
pm_list_t warning_list;
|
|
|
|
|
|
|
|
/** The list of errors that have been found while parsing. */
|
|
|
|
pm_list_t error_list;
|
|
|
|
|
|
|
|
/** The current local scope. */
|
|
|
|
pm_scope_t *current_scope;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** The current parsing context. */
|
|
|
|
pm_context_node_t *current_context;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* The encoding functions for the current file is attached to the parser as
|
|
|
|
* it's parsing so that it can change with a magic comment.
|
|
|
|
*/
|
2023-12-04 20:51:22 +03:00
|
|
|
const pm_encoding_t *encoding;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* When the encoding that is being used to parse the source is changed by
|
|
|
|
* prism, we provide the ability here to call out to a user-defined
|
|
|
|
* function.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_encoding_changed_callback_t encoding_changed_callback;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This pointer indicates where a comment must start if it is to be
|
|
|
|
* considered an encoding comment.
|
|
|
|
*/
|
2023-08-29 17:48:20 +03:00
|
|
|
const uint8_t *encoding_comment_start;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is an optional callback that can be attached to the parser that will
|
|
|
|
* be called whenever a new token is lexed by the parser.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_lex_callback_t *lex_callback;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This is the path of the file being parsed. We use the filepath when
|
|
|
|
* constructing SourceFileNodes.
|
|
|
|
*/
|
2024-02-01 23:16:15 +03:00
|
|
|
pm_string_t filepath;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This constant pool keeps all of the constants defined throughout the file
|
|
|
|
* so that we can reference them later.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_constant_pool_t constant_pool;
|
2023-06-20 18:53:02 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This is the list of newline offsets in the source file. */
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_newline_list_t newline_list;
|
2023-09-11 19:05:14 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* We want to add a flag to integer nodes that indicates their base. We only
|
|
|
|
* want to parse these once, but we don't have space on the token itself to
|
|
|
|
* communicate this information. So we store it here and pass it through
|
|
|
|
* when we find tokens that we need it for.
|
|
|
|
*/
|
2023-09-27 19:24:48 +03:00
|
|
|
pm_node_flags_t integer_base;
|
2023-09-12 19:31:50 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This string is used to pass information from the lexer to the parser. It
|
|
|
|
* is particularly necessary because of escape sequences.
|
|
|
|
*/
|
2023-10-07 05:00:01 +03:00
|
|
|
pm_string_t current_string;
|
|
|
|
|
2023-11-02 22:06:50 +03:00
|
|
|
/**
|
|
|
|
* The line number at the start of the parse. This will be used to offset
|
|
|
|
* the line numbers of all of the locations.
|
|
|
|
*/
|
2023-11-29 13:46:33 +03:00
|
|
|
int32_t start_line;
|
2023-11-02 22:06:50 +03:00
|
|
|
|
2023-12-04 20:51:22 +03:00
|
|
|
/**
|
|
|
|
* When a string-like expression is being lexed, any byte or escape sequence
|
|
|
|
* that resolves to a value whose top bit is set (i.e., >= 0x80) will
|
|
|
|
* explicitly set the encoding to the same encoding as the source.
|
|
|
|
* Alternatively, if a unicode escape sequence is used (e.g., \\u{80}) that
|
|
|
|
* resolves to a value whose top bit is set, then the encoding will be
|
|
|
|
* explicitly set to UTF-8.
|
|
|
|
*
|
|
|
|
* The _next_ time this happens, if the encoding that is about to become the
|
|
|
|
* explicitly set encoding does not match the previously set explicit
|
|
|
|
* encoding, a mixed encoding error will be emitted.
|
|
|
|
*
|
|
|
|
* When the expression is finished being lexed, the explicit encoding
|
|
|
|
* controls the encoding of the expression. For the most part this means
|
|
|
|
* that the expression will either be encoded in the source encoding or
|
|
|
|
* UTF-8. This holds for all encodings except US-ASCII. If the source is
|
|
|
|
* US-ASCII and an explicit encoding was set that was _not_ UTF-8, then the
|
|
|
|
* expression will be encoded as ASCII-8BIT.
|
|
|
|
*
|
|
|
|
* Note that if the expression is a list, different elements within the same
|
|
|
|
* list can have different encodings, so this will get reset between each
|
|
|
|
* element. Furthermore all of this only applies to lists that support
|
|
|
|
* interpolation, because otherwise escapes that could change the encoding
|
|
|
|
* are ignored.
|
|
|
|
*
|
|
|
|
* At first glance, it may make more sense for this to live on the lexer
|
|
|
|
* mode, but we need it here to communicate back to the parser for character
|
|
|
|
* literals that do not push a new lexer mode.
|
|
|
|
*/
|
|
|
|
const pm_encoding_t *explicit_encoding;
|
|
|
|
|
2024-01-02 19:18:29 +03:00
|
|
|
/** The current parameter name id on parsing its default value. */
|
|
|
|
pm_constant_id_t current_param_name;
|
|
|
|
|
|
|
|
/** The version of prism that we should use to parse. */
|
|
|
|
pm_options_version_t version;
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** Whether or not we're at the beginning of a command. */
|
2023-09-13 17:10:47 +03:00
|
|
|
bool command_start;
|
2023-09-12 19:31:50 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** Whether or not we're currently recovering from a syntax error. */
|
2023-09-13 17:10:47 +03:00
|
|
|
bool recovering;
|
2023-09-12 19:31:50 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* Whether or not the encoding has been changed by a magic comment. We use
|
|
|
|
* this to provide a fast path for the lexer instead of going through the
|
|
|
|
* function pointer.
|
|
|
|
*/
|
2023-09-13 17:10:47 +03:00
|
|
|
bool encoding_changed;
|
2023-09-12 19:31:50 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* This flag indicates that we are currently parsing a pattern matching
|
|
|
|
* expression and impacts that calculation of newlines.
|
|
|
|
*/
|
2023-09-13 17:10:47 +03:00
|
|
|
bool pattern_matching_newlines;
|
2023-09-12 19:31:50 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/** This flag indicates that we are currently parsing a keyword argument. */
|
2023-09-13 17:10:47 +03:00
|
|
|
bool in_keyword_arg;
|
2023-09-12 19:53:24 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* Whether or not the parser has seen a token that has semantic meaning
|
|
|
|
* (i.e., a token that is not a comment or whitespace).
|
|
|
|
*/
|
2023-09-13 17:10:47 +03:00
|
|
|
bool semantic_token_seen;
|
2023-09-12 19:53:24 +03:00
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
/**
|
|
|
|
* Whether or not we have found a frozen_string_literal magic comment with
|
|
|
|
* a true value.
|
|
|
|
*/
|
2023-09-13 17:10:47 +03:00
|
|
|
bool frozen_string_literal;
|
2023-06-20 18:53:02 +03:00
|
|
|
};
|
|
|
|
|
2023-10-31 19:54:54 +03:00
|
|
|
#endif
|