feat: heatmap with `workpatterns_rank()`

This commit is contained in:
Martin Chan 2022-06-13 14:43:56 +01:00
Родитель 93aa0e1500
Коммит 6b8b850a39
2 изменённых файлов: 178 добавлений и 27 удалений

Просмотреть файл

@ -22,8 +22,17 @@
#' e.g. "`0900"`
#' @param end_hour A character vector specifying starting hours,
#' e.g. `"1700"`
#' @param top number specifying how many top working patterns to display in plot,
#' @param top numeric value specifying how many top working patterns to display in plot,
#' e.g. `"10"`
#'
#' @param mode string specifying aggregation method for plot. Valid
#' options include:
#' - `"binary"`: convert hourly activity into binary blocks. In the plot, each
#' block would display as solid.
#' - `"prop"`: calculate proportion of signals in each hour over total signals
#' across 24 hours, then average across all work weeks. In the plot, each
#' block would display as a heatmap.
#'
#' @param return String specifying what to return. This must be one of the
#' following strings:
#' - `"plot"`
@ -41,6 +50,7 @@
#' @importFrom data.table ":=" "%like%" "%between%"
#'
#' @examples
#' # Plot by default
#' workpatterns_rank(
#' data = em_data,
#' signals = c(
@ -51,6 +61,13 @@
#' )
#' )
#'
#' # Plot with binary mode
#' workpatterns_rank(
#' data = em_data,
#' mode = "binary"
#' )
#'
#'
#' @family Visualization
#' @family Working Patterns
#'
@ -60,6 +77,7 @@ workpatterns_rank <- function(data,
start_hour = "0900",
end_hour = "1700",
top = 10,
mode = "prop",
return = "plot"){
# Make sure data.table knows we know we're using it
@ -119,35 +137,70 @@ workpatterns_rank <- function(data,
## This is run on `signals_df`
num_cols <- names(which(sapply(signals_df, is.numeric))) # Get numeric columns
## Save original `signals_df` before manipulating ------------------------
## Rename `Signals_sent` columns to prevent conflict
signals_df_o <- signals_df %>%
purrr::set_names(
nm = gsub(x = names(.),
replacement = "_ori_",
pattern = "_sent_")
) %>%
cbind(select(signals_df, num_cols)) %>% # duplicate signals
# Convert `Signals_sent_` prefixed to binary. `Signals_ori_` are intact
# Create binary variable 0 or 1
.[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
# Use `mutate()` method
.[, `:=`(WeekCount = .N,
PersonCount = dplyr::n_distinct(PersonId),
Id = .GRP), # group id assignment
by = num_cols]
if(mode == "binary"){
## Summarized table performed on `signals_df` ----------------------------
## Section ignoring `signals_df_o`
signals_df <-
signals_df %>%
data.table::as.data.table() %>%
.[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
.[, list(WeekCount = .N, PersonCount = dplyr::n_distinct(PersonId)), by = input_var]
myTable_return <- data.table::setorder(signals_df, -PersonCount)
} else if(mode == "prop"){
## Save original `signals_df` before manipulating ------------------------
## Rename `Signals_sent` columns to prevent conflict
signals_df_o <- signals_df %>%
purrr::set_names(
nm = gsub(x = names(.),
replacement = "_ori_",
pattern = "_sent_")
) %>%
cbind(select(signals_df, num_cols)) %>% # duplicate signals
# Convert `Signals_sent_` prefixed to binary. `Signals_ori_` are intact
# Create binary variable 0 or 1
.[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
# Use `mutate()` method
.[, `:=`(WeekCount = .N,
PersonCount = dplyr::n_distinct(PersonId),
Id = .GRP), # group id assignment
by = num_cols]
# Wide table showing proportion of signals by hour
# Ranked descending by `WeekCount`
wp_prop_tb <-
signals_df_o %>%
arrange(desc(WeekCount)) %>%
dplyr::select(Id, dplyr::starts_with("Signals_ori_"), WeekCount) %>%
purrr::set_names(nm = gsub(
pattern = "Signals_ori_",
replacement = "",
x = names(.)
)) %>%
purrr::set_names(nm = gsub(
pattern = "_.+",
replacement = "",
x = names(.)
)) %>%
# Need aggregation
.[, Signals_Total := rowSums(.SD), .SDcols = hours_col] %>%
.[, c(hours_col) := .SD / Signals_Total, .SDcols = hours_col] %>%
.[, Signals_Total := NULL] %>% # Remove unneeded column
.[, lapply(.SD, mean, na.rm = TRUE), .SDcols = hours_col, by = list(Id, WeekCount)]
## Summarized table performed on `signals_df`
signals_df <-
signals_df %>%
data.table::as.data.table() %>%
.[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
.[, list(WeekCount = .N, PersonCount = dplyr::n_distinct(PersonId)), by = input_var]
} else {
myTable_return <- data.table::setorder(signals_df, -PersonCount)
stop("invalid value to `mode`.")
if(return == "plot"){
}
if(return == "plot" & mode == "binary"){
## Plot return
sig_label_ <- paste0(sig_label, "_")
@ -239,6 +292,85 @@ workpatterns_rank <- function(data,
"Top", top, "patterns represent", coverage, "of workweeks.", extract_date_range(data, return = "text"))
)
} else if(return == "plot" & mode == "prop"){
## Table for annotation
myTable_legends <-
wp_prop_tb %>%
arrange(desc(WeekCount)) %>%
mutate(patternRank= 1:nrow(.)) %>%
dplyr::select(patternRank, WeekCount) %>%
dplyr::mutate(WeekPercentage = WeekCount / sum(WeekCount, na.rm = TRUE),
WeekCount = paste0(scales::percent(WeekPercentage, accuracy = 0.1))) %>%
utils::head(top)
## Coverage
coverage <-
myTable_legends %>%
summarize(total = sum(WeekPercentage)) %>%
pull(1) %>%
scales::percent(accuracy = 0.1)
## Run plot
wp_prop_tb %>%
utils::head(top) %>%
dplyr::mutate(Id = 1:nrow(.)) %>% # Overwrite Id with row number
tidyr::gather(Hours, Freq, -Id, -WeekCount) %>%
ggplot2::ggplot(ggplot2::aes(x = Hours, y = Id, fill = Freq)) +
ggplot2::geom_tile(height = .5) +
ggplot2::ylab(paste("Top", top, "activity patterns")) +
ggplot2::scale_y_reverse(expand = c(0, 0), breaks = seq(1, top)) +
theme_wpa_basic() +
ggplot2::scale_x_discrete(position = "top")+
ggplot2::theme(
axis.title.x = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank()
) +
scale_fill_continuous(
guide = "legend",
low = "white",
high = "#1d627e",
breaks = 0:1,
name = "",
labels = c("", paste("Observed activity"))
) +
ggplot2::annotate(
"text",
y = myTable_legends$patternRank,
x = 26.5,
label = myTable_legends$WeekCount,
size = 3
) +
ggplot2::annotate("rect",
xmin = 25,
xmax = 28,
ymin = 0.5,
ymax = top + 0.5,
alpha = .2) +
ggplot2::annotate("rect",
xmin = 0.5,
xmax = 17 + 0.5,
ymin = 0.5,
ymax = top + 0.5,
alpha = .1,
fill = "gray50") +
ggplot2::annotate("rect",
xmin = 9 + 0.5,
xmax = 24.5,
ymin = 0.5,
ymax = top + 0.5,
alpha = .1,
fill = "gray50") +
labs(
title = "Patterns of digital activity",
subtitle = paste("Hourly activity based on", subtitle_signal ,"sent over a week"),
caption = paste(
"Top", top, "patterns represent", coverage, "of workweeks.", extract_date_range(data, return = "text"))
)
} else if(return == "table"){
dplyr::as_tibble(myTable_return)

Просмотреть файл

@ -10,6 +10,7 @@ workpatterns_rank(
start_hour = "0900",
end_hour = "1700",
top = 10,
mode = "prop",
return = "plot"
)
}
@ -32,9 +33,19 @@ e.g. "\verb{0900"}}
\item{end_hour}{A character vector specifying starting hours,
e.g. \code{"1700"}}
\item{top}{number specifying how many top working patterns to display in plot,
\item{top}{numeric value specifying how many top working patterns to display in plot,
e.g. \code{"10"}}
\item{mode}{string specifying aggregation method for plot. Valid
options include:
\itemize{
\item \code{"binary"}: convert hourly activity into binary blocks. In the plot, each
block would display as solid.
\item \code{"prop"}: calculate proportion of signals in each hour over total signals
across 24 hours, then average across all work weeks. In the plot, each
block would display as a heatmap.
}}
\item{return}{String specifying what to return. This must be one of the
following strings:
\itemize{
@ -59,6 +70,7 @@ table of working patterns, ranked from the most common to the
least.
}
\examples{
# Plot by default
workpatterns_rank(
data = em_data,
signals = c(
@ -69,6 +81,13 @@ workpatterns_rank(
)
)
# Plot with binary mode
workpatterns_rank(
data = em_data,
mode = "binary"
)
}
\seealso{
Other Visualization: