Merge branch 'main' into feature/network-community-detection

This commit is contained in:
Martin Chan 2020-12-18 18:28:15 +00:00
Родитель e582cc4684 95c621af1e
Коммит 2f14a4931e
16 изменённых файлов: 249 добавлений и 125 удалений

Просмотреть файл

@ -54,11 +54,8 @@ Imports:
tidytext,
ggraph,
igraph,
widyr,
proxy,
rlang,
GGally,
network,
ggwordcloud,
methods,
data.table

Просмотреть файл

@ -118,6 +118,7 @@ export(one2one_rank)
export(one2one_sum)
export(one2one_summary)
export(one2one_trend)
export(pairwise_count)
export(period_change)
export(personas_hclust)
export(read_preamble)
@ -153,10 +154,10 @@ export(workpatterns_rank)
export(wrap)
import(DT)
import(Information)
import(data.table)
import(dplyr)
import(ggplot2)
import(ggraph)
import(network)
import(reshape2)
import(scales)
import(tidyr)
@ -196,4 +197,3 @@ importFrom(tidyselect,all_of)
importFrom(tidytext,unnest_tokens)
importFrom(utils,write.csv)
importFrom(utils,write.table)
importFrom(widyr,pairwise_count)

Просмотреть файл

@ -35,7 +35,6 @@
#' # Return a plot
#' identify_holidayweeks(sq_data, return = "plot")
#'
#' @return
#'
#' @export
identify_holidayweeks <- function(data, sd = 1, return = "message"){

Просмотреть файл

@ -15,21 +15,21 @@
#' @param metric String containing the variable name for metric.
#' @param exc_threshold Exclusion threshold to apply.
#' @param subtitle String to override default plot subtitle.
#' @param ... Additional arguments to pass to `GGally::ggnet2()`
#' @param return Character vector specifying what to return, defaults to "plot".
#' Valid inputs are "plot" and "table".
#'
#' Valid inputs include:
#' - "plot": return a network plot.
#' - "table": return a raw data table used to plot the network.
#' - "network": return an **igraph** object
#'
#' @import ggplot2
#' @import dplyr
#'
#' @examples
#' \dontrun{
#' ## Return a network plot
#' g2g_data %>%
#' network_g2g(time_investor = "TimeInvestors_Organization",
#' collaborator = "Collaborators_Organization",
#' metric = "Collaboration_hours")
#' }
#'
#' @export
network_g2g <- function(data,
@ -38,8 +38,7 @@ network_g2g <- function(data,
metric,
exc_threshold = 0.1,
subtitle = "Collaboration Across Organizations",
return = "plot",
...){
return = "plot"){
plot_data <-
data %>%
@ -61,7 +60,7 @@ network_g2g <- function(data,
plot_data
} else if(return == "plot"){
} else if(return %in% c("plot", "network")){
## Network object
mynet_em <-
@ -69,27 +68,32 @@ network_g2g <- function(data,
filter(metric_prop > exc_threshold) %>%
mutate_at(vars(TimeInvestorOrg, CollaboratorOrg), ~sub(pattern = " ", replacement = "\n", x = .)) %>%
mutate(metric_prop = metric_prop * 10) %>%
network::network(matrix.type = "edgelist",
ignore.eval = FALSE,
names.eval = "weights")
igraph::graph_from_data_frame(directed = FALSE)
## Plot object
mynet_em %>%
GGally::ggnet2(size = 12,
color = "lightblue",
label = TRUE,
label.size = 4,
label.color = "black",
edge.size = "weights",
edge.alpha = .5,
...) +
ggtitle("Group to Group Collaboration",
subtitle = subtitle) +
xlab(label = "") +
ylab(label = "") +
theme_wpa_basic() +
labs(caption = paste("Displays only collaboration above ", exc_threshold * 100, "% of node's total collaboration", sep = "")) +
theme(axis.line = element_blank())
if(return == "network"){
mynet_em # Return igraph object
} else {
## Plot object
mynet_em %>%
ggraph::ggraph(layout = "fr") +
ggraph::geom_edge_link(aes(edge_width = metric_prop * 1), edge_alpha = 0.5, edge_colour = "grey") +
ggraph::geom_node_point(size = 20, colour = "lightblue") +
ggraph::geom_node_text(aes(label = name), size = 3, repel = FALSE) +
ggplot2::theme(panel.background = ggplot2::element_rect(fill = 'white'), legend.position = "none") +
theme_wpa_basic() +
labs(title = "Group to Group Collaboration",
subtitle = subtitle,
x = "",
y = "",
caption = paste("Displays only collaboration above ", exc_threshold * 100, "% of node's total collaboration", sep = "")) +
theme(axis.line = element_blank(),
axis.text = element_blank(),
legend.position = "none")
}
} else {

Просмотреть файл

@ -12,25 +12,40 @@
#' @param data Data frame containing a person-to-person query.
#' @param hrvar String containing the label for the HR attribute.
#' @param return Character vector specifying what to return, defaults to "pdf".
#' Other valid inputs are "plot", "network" and "table". "network" returns the `network`
#' object used to generate the network plot. The "pdf" option is highly recommended over the "plot"
#' option as the PDF export format has significantly higher performance. The "plot" option is highly
#' computationally intensive and is not generally recommended.
#' Valid inputs are:
#' - "pdf": saves the network plot as a PDF in the specified path. See `path`. This is the recommended
#' output format as large networks can be slow in other formats.
#' - "plot": returns a ggplot object of the network plot. It is not recommended that you run this without
#' assigning the output to an object as plotting to the R console can be slow for large networks.
#' - "table": returns the edgelist data frame used in the network.
#' - "network": returns the igraph object used to create the network plot.
#' @param path File path for saving the PDF output. Defaults to "network_p2p".
#' @param bg_fill String to specify background fill colour.
#' @param font_col String to specify font and link colour.
#' @param legend_pos String to specify position of legend. Defaults to "bottom". See `ggplot2::theme()`.
#' @param palette Function for generating a colour palette with a single argument `n`. Uses `rainbow()` by default.
#' @param ... Additional arguments to pass to `GGally::ggnet2()`.
#' For instance, you may specify the argument `mode` to change the node placement algorithm.
#' @param palette Function for generating a colour palette with a single argument `n`. Uses "rainbow" by default.
#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "fr" for the force-directed
#' algorithm of Fruchterman and Reingold. See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a
#' full list of options.
#'
#' @details
#' For specifying the node placement algorithm, please see the `gplot.layout` documentation for the **sna**
#' package, which provides a full list of the valid functions that can be passed into the `mode` argument.
#' @examples
#' ## Simulate simple P2P network
#' sim_net <-
#' data.frame(TieOrigin_PersonId = sample(seq(1, 100), size = 100, replace = TRUE),
#' TieDestination_PersonId = sample(seq(1, 100), size = 100, replace = TRUE)) %>%
#' dplyr::mutate(TieOrigin_Organization = ifelse(TieOrigin_PersonId >= 50, "A", "B"),
#' TieDestination_Organization = ifelse(TieDestination_PersonId >= 50, "A", "B"),
#' StrongTieScore = rep(1, 100))
#'
#' ## Run plot
#' ## ONLY return 'plot' instead of 'pdf' when data size is small
#' network_p2p(data = sim_net,
#' hrvar = "Organization",
#' return = "plot")
#'
#' @import ggplot2
#' @import dplyr
#' @import network
#' @importFrom grDevices rainbow
#'
#' @export
@ -42,44 +57,52 @@ network_p2p <- function(data,
font_col = "#FFFFFF",
legend_pos = "bottom",
palette = "rainbow",
...){
data <-
data %>%
filter(StrongTieType > 0)
node_alpha = 0.7,
algorithm = "fr"){
## No filtering
tieorigin_var <- paste0("TieOrigin_", hrvar)
tiedestin_var <- paste0("TieDestination_", hrvar)
## Extract edge list
strong_edgelist <-
## Set edges df
edges <-
data %>%
select(TieOrigin_PersonId, TieDestination_PersonId)
select(from = "TieOrigin_PersonId",
to = "TieDestination_PersonId",
weight = "StrongTieScore") %>%
select(-weight) # Overwrite - no info on edge
## Create basic network object
mynet <-
network::network(strong_edgelist, matrix.type="edgelist")
## Vertices data frame to provide meta-data
vert_ft <-
rbind(
# TieOrigin
edges %>%
left_join(select(data, TieOrigin_PersonId, tieorigin_var),
by = c("from" = "TieOrigin_PersonId")) %>%
select(node = "from", !!sym(hrvar) := tieorigin_var),
## Extract list of nodes from network object
myedges <- data.frame(PersonId = network.vertex.names(mynet))
# TieDestination
edges %>%
left_join(select(data, TieDestination_PersonId, tiedestin_var),
by = c("to" = "TieDestination_PersonId")) %>%
select(node = "to", !!sym(hrvar) := tiedestin_var)
)
## Extract attributes
totalAttributes <-
bind_rows(select(data, PersonId = "TieOrigin_PersonId", !!sym(hrvar) := tieorigin_var),
select(data, PersonId = "TieOrigin_PersonId", !!sym(hrvar) := tiedestin_var)) %>%
unique()
## Merge list of nodes and attributes
myedges <- merge(myedges, totalAttributes, by = "PersonId", all.x = TRUE)
## Add attributes to network object
mynet %v% hrvar <- myedges[[hrvar]]
## Create igraph object
g <-
igraph::graph_from_data_frame(edges,
directed = FALSE, # Directed, but FALSE for visualization
vertices = unique(vert_ft)) %>% # remove duplicates
igraph::simplify()
## Palette
## Create tibble
pal <-
tibble(!!sym(hrvar) := g %>%
igraph::get.vertex.attribute(hrvar) %>%
unique())
pal <- tibble(!!sym(hrvar) := unique(myedges[[hrvar]]))
## Apply palette function
col_pal <- do.call(what = palette, args = list(nrow(pal)))
## named character vector
@ -90,24 +113,21 @@ network_p2p <- function(data,
if(return == "table"){
data
edges
} else if(return == "network"){
mynet
g
} else if(return %in% c("plot", "pdf")){
outputPlot <-
mynet %>%
GGally::ggnet2(size = 1,
color = hrvar,
label = FALSE,
edge.size = .01,
edge.alpha = .15,
node.alpha = .8,
palette = pal,
...) +
g %>%
ggraph::ggraph(layout = "igraph", algorithm = algorithm) +
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
ggraph::geom_node_point(aes(colour = !!sym(hrvar)), alpha = node_alpha) +
scale_colour_discrete(type = pal) +
theme_void() +
theme(legend.position = legend_pos,
legend.background = element_rect(fill = bg_fill),
plot.background = element_rect(fill = bg_fill),

48
R/pairwise_count.R Normal file
Просмотреть файл

@ -0,0 +1,48 @@
#' @title Perform a pairwise count of words by id
#'
#' @description This is a **data.table** implementation that mimics the output of
#' `widyr::pairwise_count()` to reduce package dependency. This is used internally
#' within `tm_cooc()`.
#'
#' @param data Data frame output from `tm_clean()`.
#' @param id String to represent the id variable. Defaults to "word".
#' @param word String to represent the word variable. Defaults to "word".
#'
#' @import data.table
#'
#' @export
pairwise_count <- function(data,
id = "line",
word = "word"){
data <-
data %>%
dplyr::rename(word := !!sym(word),
id := !!sym(id))
DT <- data.table::as.data.table(data)
# convert to character
DT[, word := as.character(word)]
# subset those with >1 per id
DT2 <- DT[, N := .N, by = id][N>1]
# create all combinations of 2
# return as a data.table with these as columns `V1` and `V2`
# then count the numbers in each id
out_data <-
DT2[, rbindlist(utils::combn(word,2,
FUN = function(x) as.data.table(as.list(x)),
simplify = FALSE)), by = id] %>%
.[, .N, by = list(V1,V2)]
# format and sort
out_data %>%
dplyr::as_tibble() %>%
dplyr::rename(item1 = "V1",
item2 = "V2",
n = "N") %>%
dplyr::arrange(desc(n))
}

Просмотреть файл

@ -83,18 +83,18 @@ theme_wpa_basic <- function(font_size = 12){
text_small_dark <- element_text(size = font_size - 2, colour = text_colour, face = "plain")
text_small_light <- element_text(size = font_size - 2, colour = "#FFFFFF", face = "plain")
text_normal <- element_text(size = font_size + 2, colour = text_colour, face = "plain")
text_italic <- element_text(size = font_size + 2, colour = text_colour, face = "italic")
text_normal <- element_text(size = font_size + 0, colour = text_colour, face = "plain")
text_italic <- element_text(size = font_size + 0, colour = text_colour, face = "italic")
text_bold <- element_text(size = font_size + 2, colour = text_colour, face = "bold")
text_title <- element_text(size = font_size + 5, colour = text_colour, face = "bold")
text_bold <- element_text(size = font_size + 0, colour = text_colour, face = "bold")
text_title <- element_text(size = font_size + 2, colour = text_colour, face = "bold")
theme_minimal() +
theme(plot.background = element_blank(),
# plot.background = element_rect(fill = bg_colour),
text = text_normal,
plot.title = text_title,
plot.subtitle = text_italic,
plot.subtitle = text_normal,
axis.title = text_normal,
axis.text = text_small_dark,

Просмотреть файл

@ -26,15 +26,16 @@
#' @import ggplot2
#' @import ggraph
#' @importFrom igraph graph_from_data_frame
#' @importFrom widyr pairwise_count
#' @importFrom tidytext unnest_tokens
#'
#' @family Text-mining
#'
#' @examples
#' \dontrun{
#' tm_cooc(mt_data,lmult = 0.01)
#' }
#' # Demo using a subset of `mt_data`
#' mt_data %>%
#' dplyr::slice(1:20) %>%
#' tm_cooc(lmult = 0.01)
#'
#' @export
tm_cooc <- function(data,
stopwords = NULL,
@ -50,10 +51,7 @@ tm_cooc <- function(data,
# Calculate frequency of pairs
title_word_pairs <-
text_df %>%
widyr::pairwise_count(word,
line,
sort = TRUE,
upper = FALSE)
pairwise_count(id = "line", word = "word")
# Graph networks
set.seed(seed)

Просмотреть файл

@ -20,6 +20,9 @@
#' @param hrvar HR Variable by which to split metrics, defaults to "Organization"
#' but accepts any character vector, e.g. "Organization"
#' @param path Pass the file path and the desired file name, _excluding the file extension_.
#' @param hrvar_threshold Numeric value determining the maximum number of unique values
#' to be allowed to qualify as a HR variable. This is passed directly to the `threshold`
#' argument within `hrvar_count_all()`.
#' @param timestamp Logical vector specifying whether to include a timestamp in the file name.
#' Defaults to TRUE.
#'
@ -41,6 +44,7 @@ validation_report <- function(data,
meeting_data = NULL,
hrvar = "Organization",
path = "validation report",
hrvar_threshold = 150,
timestamp = TRUE){
## Create timestamped path (if applicable)
@ -114,7 +118,7 @@ validation_report <- function(data,
read_preamble("organizational_data_quality.md"), #13, Header - 2. Organizational Data Quality
read_preamble("attributes_available.md"),#14
data %>% hrvar_count_all(return = "table"),
data %>% hrvar_count_all(return = "table", threshold = hrvar_threshold),
read_preamble("groups_under_privacy_threshold_1.md"), #16, Header - 2.2 Groups under Privacy Threshold
paste(">", data %>% identify_privacythreshold(return="text")),

Просмотреть файл

@ -20,9 +20,6 @@ Valid options are:
\item when 'data_dirty' is passed, a dataset with only outlier weeks is returned as a dataframe.
\item when 'plot' is passed, a pot with holiday weeks highlighted is returned as a dataframe.
}}
}
\value{
}
\description{
This function scans a standard query output for weeks where collaboration hours is far outside the mean.
@ -30,6 +27,15 @@ Returns a list of weeks that appear to be holiday weeks and optionally an edited
By default, missing values are excluded.
As best practice, run this function prior to any analysis to remove atypical collaboration weeks from your dataset.
}
\examples{
# Return a message by default
identify_holidayweeks(sq_data)
# Return a plot
identify_holidayweeks(sq_data, return = "plot")
}
\seealso{
Other Data Validation:

Просмотреть файл

@ -12,8 +12,7 @@ network_g2g(
metric,
exc_threshold = 0.1,
subtitle = "Collaboration Across Organizations",
return = "plot",
...
return = "plot"
)
g2g_network(
@ -23,8 +22,7 @@ g2g_network(
metric,
exc_threshold = 0.1,
subtitle = "Collaboration Across Organizations",
return = "plot",
...
return = "plot"
)
}
\arguments{
@ -41,20 +39,22 @@ g2g_network(
\item{subtitle}{String to override default plot subtitle.}
\item{return}{Character vector specifying what to return, defaults to "plot".
Valid inputs are "plot" and "table".}
\item{...}{Additional arguments to pass to \code{GGally::ggnet2()}}
Valid inputs include:
\itemize{
\item "plot": return a network plot.
\item "table": return a raw data table used to plot the network.
\item "network": return an \strong{igraph} object
}}
}
\description{
Pass a data frame containing a group-to-group query and return a network plot.
Automatically handles "Collaborators_within_group" and "Other_collaborators" within query data.
}
\examples{
\dontrun{
## Return a network plot
g2g_data \%>\%
network_g2g(time_investor = "TimeInvestors_Organization",
collaborator = "Collaborators_Organization",
metric = "Collaboration_hours")
}
}

Просмотреть файл

@ -13,7 +13,8 @@ network_p2p(
font_col = "#FFFFFF",
legend_pos = "bottom",
palette = "rainbow",
...
node_alpha = 0.7,
algorithm = "fr"
)
}
\arguments{
@ -22,10 +23,15 @@ network_p2p(
\item{hrvar}{String containing the label for the HR attribute.}
\item{return}{Character vector specifying what to return, defaults to "pdf".
Other valid inputs are "plot", "network" and "table". "network" returns the \code{network}
object used to generate the network plot. The "pdf" option is highly recommended over the "plot"
option as the PDF export format has significantly higher performance. The "plot" option is highly
computationally intensive and is not generally recommended.}
Valid inputs are:
\itemize{
\item "pdf": saves the network plot as a PDF in the specified path. See \code{path}. This is the recommended
output format as large networks can be slow in other formats.
\item "plot": returns a ggplot object of the network plot. It is not recommended that you run this without
assigning the output to an object as plotting to the R console can be slow for large networks.
\item "table": returns the edgelist data frame used in the network.
\item "network": returns the igraph object used to create the network plot.
}}
\item{path}{File path for saving the PDF output. Defaults to "network_p2p".}
@ -35,16 +41,31 @@ computationally intensive and is not generally recommended.}
\item{legend_pos}{String to specify position of legend. Defaults to "bottom". See \code{ggplot2::theme()}.}
\item{palette}{Function for generating a colour palette with a single argument \code{n}. Uses \code{rainbow()} by default.}
\item{palette}{Function for generating a colour palette with a single argument \code{n}. Uses "rainbow" by default.}
\item{...}{Additional arguments to pass to \code{GGally::ggnet2()}.
For instance, you may specify the argument \code{mode} to change the node placement algorithm.}
\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "fr" for the force-directed
algorithm of Fruchterman and Reingold. See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a
full list of options.}
}
\description{
Pass a data frame containing a person-to-person query and save a network
plot as a PDF file.
}
\details{
For specifying the node placement algorithm, please see the \code{gplot.layout} documentation for the \strong{sna}
package, which provides a full list of the valid functions that can be passed into the \code{mode} argument.
\examples{
## Simulate simple P2P network
sim_net <-
data.frame(TieOrigin_PersonId = sample(seq(1, 100), size = 100, replace = TRUE),
TieDestination_PersonId = sample(seq(1, 100), size = 100, replace = TRUE)) \%>\%
dplyr::mutate(TieOrigin_Organization = ifelse(TieOrigin_PersonId >= 50, "A", "B"),
TieDestination_Organization = ifelse(TieDestination_PersonId >= 50, "A", "B"),
StrongTieScore = rep(1, 100))
## Run plot
## ONLY return 'plot' instead of 'pdf' when data size is small
network_p2p(data = sim_net,
hrvar = "Organization",
return = "plot")
}

20
man/pairwise_count.Rd Normal file
Просмотреть файл

@ -0,0 +1,20 @@
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/pairwise_count.R
\name{pairwise_count}
\alias{pairwise_count}
\title{Perform a pairwise count of words by id}
\usage{
pairwise_count(data, id = "line", word = "word")
}
\arguments{
\item{data}{Data frame output from \code{tm_clean()}.}
\item{id}{String to represent the id variable. Defaults to "word".}
\item{word}{String to represent the word variable. Defaults to "word".}
}
\description{
This is a \strong{data.table} implementation that mimics the output of
\code{widyr::pairwise_count()} to reduce package dependency. This is used internally
within \code{tm_cooc()}.
}

Просмотреть файл

@ -29,9 +29,11 @@ There is an option to remove stopwords by passing a data frame into the \code{st
argument.
}
\examples{
\dontrun{
tm_cooc(mt_data,lmult = 0.01)
}
# Demo using a subset of `mt_data`
mt_data \%>\%
dplyr::slice(1:20) \%>\%
tm_cooc(lmult = 0.01)
}
\seealso{
Other Text-mining:

Просмотреть файл

@ -9,6 +9,7 @@ validation_report(
meeting_data = NULL,
hrvar = "Organization",
path = "validation report",
hrvar_threshold = 150,
timestamp = TRUE
)
}
@ -22,6 +23,10 @@ but accepts any character vector, e.g. "Organization"}
\item{path}{Pass the file path and the desired file name, \emph{excluding the file extension}.}
\item{hrvar_threshold}{Numeric value determining the maximum number of unique values
to be allowed to qualify as a HR variable. This is passed directly to the \code{threshold}
argument within \code{hrvar_count_all()}.}
\item{timestamp}{Logical vector specifying whether to include a timestamp in the file name.
Defaults to TRUE.}
}

Просмотреть файл

@ -5,7 +5,7 @@ var config = {
lineage: true
},
coreData: {
appId: "JS:JSLLTest"
appId: "wpaGitHubSite"
}
};
awa.init(config);
awa.init(config);