datamations/R/parse_pipeline.R

78 строки
3.0 KiB
R

#' Parse a tidyverse pipeline
#'
#' Takes a tidyverse pipeline written as a single string, splits it into its
#' individual steps, and parses each step into an expression so that later
#' stages of the datamations process can work with them.
#'
#' @param pipeline Input pipeline, as a string.
#' @param supported_tidy_functions Names of the tidyverse functions to look
#'   for when splitting the pipeline. Defaults to \code{group_by},
#'   \code{summarize}, \code{filter} and \code{count}.
#' @return A list with one parsed expression per pipeline step, as produced
#'   by \code{rlang::parse_expr()}.
#' @noRd
#'
#' @examples
#' "small_salary %>% group_by(Degree) %>% summarize(mean = mean(Salary))" %>%
#'   datamations:::parse_pipeline()
#'
#' "group_by(small_salary, Degree) %>% summarize(mean = mean(Salary))" %>%
#'   datamations:::parse_pipeline()
parse_pipeline <- function(pipeline, supported_tidy_functions = c("group_by", "summarize", "filter", "count")) {
  # First break the string into one character element per step ...
  steps <- split_pipeline(
    pipeline,
    supported_tidy_functions = supported_tidy_functions
  )

  # ... then turn every step into an unevaluated expression
  purrr::map(steps, rlang::parse_expr)
}
#' Split pipeline into components
#'
#' Splits a pipeline string at every magrittr pipe operator, trims the
#' whitespace around each step, normalises calls to \code{summarise()} to the
#' \code{summarize()} spelling, and moves data that was passed as the first
#' argument of the first function into its own leading step.
#'
#' @param pipeline Input pipeline, as a string. Only the first element of the
#'   vector is used.
#' @param supported_tidy_functions Function names forwarded to
#'   \code{parse_data_from_first_function()}.
#' @return A character vector with one element per pipeline step.
#' @noRd
split_pipeline <- function(pipeline, supported_tidy_functions = c("group_by", "summarize", "filter", "count")) {
  steps <- stringr::str_split(pipeline, "%>%")
  steps <- purrr::pluck(steps, 1)
  steps <- stringr::str_trim(steps)

  # Convert summarise to summarize. Only rewrite the name where it is the
  # function being called (not preceded by another name character, and
  # followed by an opening parenthesis), and rewrite every such call, so that
  # identifiers which merely contain "summarise" are left untouched.
  steps <- stringr::str_replace_all(
    steps,
    "(?<![\\w.])summarise(?=\\s*\\()",
    "summarize"
  )

  # Extract out data if it's the first argument of the first function
  parse_data_from_first_function(
    steps,
    supported_tidy_functions = supported_tidy_functions
  )
}
#' Parse out data from the first function in a pipeline
#'
#' When the first step of a pipeline is a call to one of the supported
#' functions (optionally namespace-qualified, e.g. \code{dplyr::group_by(}),
#' the data is taken to be that call's first argument. The argument is
#' checked to evaluate to a data frame, removed from the call, and inserted
#' as a new first step. Pipelines whose first step is not such a call are
#' returned unchanged.
#'
#' @param pipeline Character vector of pipeline steps.
#' @param supported_tidy_functions Names of the functions whose first argument
#'   may hold the data. As before, the names are used as regular expressions.
#' @return The character vector of steps, with the data as the first element
#'   when it was embedded in the first call.
#' @noRd
parse_data_from_first_function <- function(pipeline, supported_tidy_functions = c("group_by", "summarize", "filter", "count")) {
  # Nothing to inspect without steps or without function names to look for
  if (length(pipeline) == 0 || length(supported_tidy_functions) == 0) {
    return(pipeline)
  }

  first_step <- pipeline[[1]]

  # The first step only embeds the data when it *is a call to* a supported
  # function. Matching the name anywhere in the step would misclassify data
  # objects such as `filtered_df` or `count_data`.
  call_pattern <- paste0(
    "^\\s*(?:[\\w.]+:{2,3})?(?:",
    paste(supported_tidy_functions, collapse = "|"),
    ")\\s*\\("
  )
  if (!isTRUE(stringr::str_detect(first_step, call_pattern))) {
    return(pipeline)
  }

  # Locate the data: everything between the first ( and the following ,
  data_span <- stringr::str_locate(first_step, "(?<=\\()(.*?)(?=,)")
  data_start <- data_span[1, "start"]
  data_end <- data_span[1, "end"]

  if (is.na(data_start)) {
    stop("No data detected in pipeline.", call. = FALSE)
  }

  first_function_data <- stringr::str_sub(first_step, data_start, data_end)

  if (!nzchar(stringr::str_trim(first_function_data))) {
    stop("No data detected in pipeline.", call. = FALSE)
  }

  # Look the data up starting from this function's enclosing environment
  # rather than its own frame, so that a data object which shares a name with
  # one of the local variables here (e.g. `pipeline`) is not shadowed.
  lookup_env <- parent.env(environment())

  # Parse and evaluate once; any failure means there is no usable data.
  # NOTE(review): this evaluates text taken from the pipeline string, so the
  # string should come from a trusted source.
  data <- tryCatch(
    eval(rlang::parse_expr(first_function_data), envir = lookup_env),
    error = function(e) {
      stop("No data detected in pipeline.", call. = FALSE)
    }
  )

  # Check that the data is a data frame
  if (!is.data.frame(data)) {
    stop("Passed data is not a data frame or tibble.", call. = FALSE)
  }

  # Remove the data and the comma that follows it from the first function.
  # This is done by position: removing by pattern would treat the data text
  # as a regular expression and could hit an earlier occurrence of the same
  # text, such as the function name in `count(count, x)`.
  pipeline[[1]] <- paste0(
    stringr::str_sub(first_step, 1L, data_start - 1L),
    stringr::str_sub(first_step, data_end + 2L)
  )

  # Make the data the first element of the pipeline
  append(first_function_data, pipeline)
}