feat: heatmap with `workpatterns_rank()`

2022-06-13 14:43:56 +01:00 · 2022-06-13 14:43:56 +01:00 · 6b8b850a39
--- a/R/workpatterns_rank.R
+++ b/R/workpatterns_rank.R
@ -22,8 +22,17 @@
 #' e.g. "`0900"`
 #' @param end_hour A character vector specifying starting hours,
 #' e.g. `"1700"`
-#' @param top number specifying how many top working patterns to display in plot,
+#' @param top numeric value specifying how many top working patterns to display in plot,
 #' e.g. `"10"`
+#'
+#' @param mode string specifying aggregation method for plot. Valid
+#' options include:
+#'   - `"binary"`: convert hourly activity into binary blocks. In the plot, each
+#'   block would display as solid.
+#'   - `"prop"`: calculate proportion of signals in each hour over total signals
+#'   across 24 hours, then average across all work weeks. In the plot, each
+#'   block would display as a heatmap.
+#'
 #' @param return String specifying what to return. This must be one of the
 #'   following strings:
 #'   - `"plot"`
@ -41,6 +50,7 @@
 #' @importFrom data.table ":=" "%like%" "%between%"
 #'
 #' @examples
+#' # Plot by default
 #' workpatterns_rank(
 #'   data = em_data,
 #'   signals = c(
@ -51,6 +61,13 @@
 #'   )
 #'   )
 #'
+#' # Plot with binary mode
+#' workpatterns_rank(
+#'   data = em_data,
+#'   mode = "binary"
+#' )
+#'
+#'
 #' @family Visualization
 #' @family Working Patterns
 #'
@ -60,6 +77,7 @@ workpatterns_rank <- function(data,
                              start_hour = "0900",
                              end_hour = "1700",
                              top = 10,
+                              mode = "prop",
                              return = "plot"){

  # Make sure data.table knows we know we're using it
@ -119,35 +137,70 @@ workpatterns_rank <- function(data,
  ## This is run on `signals_df`
  num_cols <- names(which(sapply(signals_df, is.numeric))) # Get numeric columns

-  ## Save original `signals_df` before manipulating ------------------------
-  ## Rename `Signals_sent` columns to prevent conflict
-  signals_df_o <- signals_df %>%
-    purrr::set_names(
-      nm = gsub(x = names(.),
-                replacement = "_ori_",
-                pattern = "_sent_")
-    ) %>%
-    cbind(select(signals_df, num_cols)) %>% # duplicate signals
-    # Convert `Signals_sent_` prefixed to binary. `Signals_ori_` are intact
-    # Create binary variable 0 or 1
-    .[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
-    # Use `mutate()` method
-    .[, `:=`(WeekCount = .N,
-             PersonCount = dplyr::n_distinct(PersonId),
-             Id = .GRP), # group id assignment
-      by = num_cols]
+  if(mode == "binary"){
+
+    ## Summarized table performed on `signals_df` ----------------------------
+    ## Section ignoring `signals_df_o`
+
+    signals_df <-
+      signals_df %>%
+      data.table::as.data.table() %>%
+      .[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
+      .[, list(WeekCount = .N, PersonCount = dplyr::n_distinct(PersonId)), by = input_var]
+
+    myTable_return <- data.table::setorder(signals_df, -PersonCount)
+
+  } else if(mode == "prop"){
+
+    ## Save original `signals_df` before manipulating ------------------------
+    ## Rename `Signals_sent` columns to prevent conflict
+    signals_df_o <- signals_df %>%
+      purrr::set_names(
+        nm = gsub(x = names(.),
+                  replacement = "_ori_",
+                  pattern = "_sent_")
+      ) %>%
+      cbind(select(signals_df, num_cols)) %>% # duplicate signals
+      # Convert `Signals_sent_` prefixed to binary. `Signals_ori_` are intact
+      # Create binary variable 0 or 1
+      .[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
+      # Use `mutate()` method
+      .[, `:=`(WeekCount = .N,
+               PersonCount = dplyr::n_distinct(PersonId),
+               Id = .GRP), # group id assignment
+        by = num_cols]
+
+    # Wide table showing proportion of signals by hour
+    # Ranked descending by `WeekCount`
+    wp_prop_tb <-
+      signals_df_o %>%
+      arrange(desc(WeekCount)) %>%
+      dplyr::select(Id, dplyr::starts_with("Signals_ori_"), WeekCount)  %>%
+      purrr::set_names(nm = gsub(
+        pattern = "Signals_ori_",
+        replacement = "",
+        x = names(.)
+      )) %>%
+      purrr::set_names(nm = gsub(
+        pattern = "_.+",
+        replacement = "",
+        x = names(.)
+      )) %>%
+      # Need aggregation
+      .[, Signals_Total := rowSums(.SD), .SDcols = hours_col] %>%
+      .[, c(hours_col) := .SD / Signals_Total, .SDcols = hours_col] %>%
+      .[, Signals_Total := NULL] %>% # Remove unneeded column
+      .[, lapply(.SD, mean, na.rm = TRUE), .SDcols = hours_col, by = list(Id, WeekCount)]


-  ## Summarized table performed on `signals_df`
-  signals_df <-
-    signals_df %>%
-    data.table::as.data.table() %>%
-    .[, (num_cols) := lapply(.SD, function(x) ifelse(x > 0, 1, 0)), .SDcols = num_cols] %>%
-    .[, list(WeekCount = .N, PersonCount = dplyr::n_distinct(PersonId)), by = input_var]
+  } else {

-  myTable_return <- data.table::setorder(signals_df, -PersonCount)
+    stop("invalid value to `mode`.")

-  if(return == "plot"){
+  }
+
+
+  if(return == "plot" & mode == "binary"){

    ## Plot return
    sig_label_ <- paste0(sig_label, "_")
@ -239,6 +292,85 @@ workpatterns_rank <- function(data,
      "Top", top, "patterns represent", coverage, "of workweeks.", extract_date_range(data, return = "text"))
    )

+  } else if(return == "plot" & mode == "prop"){
+
+
+    ## Table for annotation
+    myTable_legends <-
+      wp_prop_tb %>%
+      arrange(desc(WeekCount)) %>%
+      mutate(patternRank= 1:nrow(.)) %>%
+      dplyr::select(patternRank, WeekCount) %>%
+      dplyr::mutate(WeekPercentage = WeekCount / sum(WeekCount, na.rm = TRUE),
+                    WeekCount = paste0(scales::percent(WeekPercentage, accuracy = 0.1))) %>%
+      utils::head(top)
+
+    ## Coverage
+    coverage <-
+      myTable_legends %>%
+      summarize(total = sum(WeekPercentage)) %>%
+      pull(1) %>%
+      scales::percent(accuracy = 0.1)
+
+
+    ## Run plot
+    wp_prop_tb %>%
+      utils::head(top)  %>%
+      dplyr::mutate(Id = 1:nrow(.)) %>% # Overwrite Id with row number
+      tidyr::gather(Hours, Freq, -Id, -WeekCount) %>%
+      ggplot2::ggplot(ggplot2::aes(x = Hours, y = Id, fill = Freq)) +
+      ggplot2::geom_tile(height = .5) +
+      ggplot2::ylab(paste("Top", top, "activity patterns")) +
+      ggplot2::scale_y_reverse(expand = c(0, 0), breaks = seq(1, top)) +
+      theme_wpa_basic() +
+      ggplot2::scale_x_discrete(position = "top")+
+      ggplot2::theme(
+        axis.title.x = element_blank(),
+        axis.line = element_blank(),
+        axis.ticks = element_blank()
+      ) +
+      scale_fill_continuous(
+        guide = "legend",
+        low = "white",
+        high = "#1d627e",
+        breaks = 0:1,
+        name = "",
+        labels = c("", paste("Observed activity"))
+      ) +
+      ggplot2::annotate(
+        "text",
+        y = myTable_legends$patternRank,
+        x = 26.5,
+        label = myTable_legends$WeekCount,
+        size = 3
+      ) +
+      ggplot2::annotate("rect",
+                        xmin = 25,
+                        xmax = 28,
+                        ymin = 0.5,
+                        ymax = top + 0.5,
+                        alpha = .2) +
+      ggplot2::annotate("rect",
+                        xmin = 0.5,
+                        xmax = 17 + 0.5,
+                        ymin = 0.5,
+                        ymax = top + 0.5,
+                        alpha = .1,
+                        fill = "gray50") +
+      ggplot2::annotate("rect",
+                        xmin = 9 + 0.5,
+                        xmax = 24.5,
+                        ymin = 0.5,
+                        ymax = top + 0.5,
+                        alpha = .1,
+                        fill = "gray50") +
+      labs(
+        title = "Patterns of digital activity",
+        subtitle = paste("Hourly activity based on", subtitle_signal ,"sent over a week"),
+        caption = paste(
+          "Top", top, "patterns represent", coverage, "of workweeks.", extract_date_range(data, return = "text"))
+      )
+
  } else if(return == "table"){

    dplyr::as_tibble(myTable_return)
--- a/man/workpatterns_rank.Rd
+++ b/man/workpatterns_rank.Rd
@ -10,6 +10,7 @@ workpatterns_rank(
  start_hour = "0900",
  end_hour = "1700",
  top = 10,
+  mode = "prop",
  return = "plot"
 )
 }
@ -32,9 +33,19 @@ e.g. "\verb{0900"}}
 \item{end_hour}{A character vector specifying starting hours,
 e.g. \code{"1700"}}

-\item{top}{number specifying how many top working patterns to display in plot,
+\item{top}{numeric value specifying how many top working patterns to display in plot,
 e.g. \code{"10"}}

+\item{mode}{string specifying aggregation method for plot. Valid
+options include:
+\itemize{
+\item \code{"binary"}: convert hourly activity into binary blocks. In the plot, each
+block would display as solid.
+\item \code{"prop"}: calculate proportion of signals in each hour over total signals
+across 24 hours, then average across all work weeks. In the plot, each
+block would display as a heatmap.
+}}
+
 \item{return}{String specifying what to return. This must be one of the
 following strings:
 \itemize{
@ -59,6 +70,7 @@ table of working patterns, ranked from the most common to the
 least.
 }
 \examples{
+# Plot by default
 workpatterns_rank(
  data = em_data,
  signals = c(
@ -69,6 +81,13 @@ workpatterns_rank(
  )
  )

+# Plot with binary mode
+workpatterns_rank(
+  data = em_data,
+  mode = "binary"
+)
+
+
 }
 \seealso{
 Other Visualization: