зеркало из https://github.com/microsoft/wpa.git
Merge pull request #32 from microsoft/feature/network-community-detection
Feature: add Community Detection capability for network plots
This commit is contained in:
Коммит
2118a61a1a
|
@ -63,5 +63,6 @@ RoxygenNote: 7.1.1
|
||||||
Roxygen: list(markdown = TRUE)
|
Roxygen: list(markdown = TRUE)
|
||||||
VignetteBuilder: knitr
|
VignetteBuilder: knitr
|
||||||
Suggests:
|
Suggests:
|
||||||
extrafont
|
extrafont,
|
||||||
|
leiden
|
||||||
Language: en-US
|
Language: en-US
|
||||||
|
|
|
@ -49,6 +49,7 @@ export(create_line)
|
||||||
export(create_line_asis)
|
export(create_line_asis)
|
||||||
export(create_period_scatter)
|
export(create_period_scatter)
|
||||||
export(create_rank)
|
export(create_rank)
|
||||||
|
export(create_sankey)
|
||||||
export(create_scatter)
|
export(create_scatter)
|
||||||
export(create_stacked)
|
export(create_stacked)
|
||||||
export(create_trend)
|
export(create_trend)
|
||||||
|
@ -105,7 +106,10 @@ export(meetingtype_sum)
|
||||||
export(meetingtype_summary)
|
export(meetingtype_summary)
|
||||||
export(mgrcoatt_dist)
|
export(mgrcoatt_dist)
|
||||||
export(mgrrel_matrix)
|
export(mgrrel_matrix)
|
||||||
|
export(network_describe)
|
||||||
export(network_g2g)
|
export(network_g2g)
|
||||||
|
export(network_leiden)
|
||||||
|
export(network_louvain)
|
||||||
export(network_p2p)
|
export(network_p2p)
|
||||||
export(one2one_dist)
|
export(one2one_dist)
|
||||||
export(one2one_fizz)
|
export(one2one_fizz)
|
||||||
|
@ -150,16 +154,18 @@ export(workpatterns_rank)
|
||||||
export(wrap)
|
export(wrap)
|
||||||
import(DT)
|
import(DT)
|
||||||
import(Information)
|
import(Information)
|
||||||
import(data.table)
|
|
||||||
import(dplyr)
|
import(dplyr)
|
||||||
import(ggplot2)
|
import(ggplot2)
|
||||||
import(ggraph)
|
import(ggraph)
|
||||||
import(reshape2)
|
import(reshape2)
|
||||||
import(scales)
|
import(scales)
|
||||||
|
import(tidyr)
|
||||||
import(tidyselect)
|
import(tidyselect)
|
||||||
importFrom(data.table,"%between%")
|
importFrom(data.table,"%between%")
|
||||||
importFrom(data.table,"%like%")
|
importFrom(data.table,"%like%")
|
||||||
importFrom(data.table,":=")
|
importFrom(data.table,":=")
|
||||||
|
importFrom(data.table,as.data.table)
|
||||||
|
importFrom(data.table,rbindlist)
|
||||||
importFrom(dplyr,`%>%`)
|
importFrom(dplyr,`%>%`)
|
||||||
importFrom(dplyr,mutate_if)
|
importFrom(dplyr,mutate_if)
|
||||||
importFrom(grDevices,rainbow)
|
importFrom(grDevices,rainbow)
|
||||||
|
|
|
@ -0,0 +1,72 @@
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#' @title Create a sankey chart from a two-column count table
|
||||||
|
#'
|
||||||
|
#' @description
|
||||||
|
#' Create a networkD3 style sankey chart based on a long count table
|
||||||
|
#' with two variables. The input data should have three columns, where
|
||||||
|
#' each row is a unique group:
|
||||||
|
#' 1. Variable 1
|
||||||
|
#' 2. Variable 2
|
||||||
|
#' 3. Count
|
||||||
|
#'
|
||||||
|
#' @param data Data frame of the long count table.
|
||||||
|
#' @param var1 String containing the name of the variable to be shown on the left.
|
||||||
|
#' @param var2 String containing the name of the variable to be shown on the right.
|
||||||
|
#' @param count String containing the name of the count variable.
|
||||||
|
#'
|
||||||
|
#' @import dplyr
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' \donttest{
|
||||||
|
#' sq_data %>%
|
||||||
|
#' dplyr::count(Organization, FunctionType) %>%
|
||||||
|
#' create_sankey(var1 = "Organization", var2 = "FunctionType")
|
||||||
|
#' }
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
create_sankey <- function(data, var1, var2, count = "n"){
|
||||||
|
|
||||||
|
## Rename
|
||||||
|
data$pre_group <- data[[var1]]
|
||||||
|
data$group <- data[[var2]]
|
||||||
|
|
||||||
|
## Set up `nodes`
|
||||||
|
group_source <- unique(data$pre_group)
|
||||||
|
group_target <- paste0(unique(data$group), " ")
|
||||||
|
|
||||||
|
groups <- c(group_source, group_target)
|
||||||
|
|
||||||
|
nodes_source <- tibble(name = group_source)
|
||||||
|
nodes_target <- tibble(name = group_target)
|
||||||
|
nodes <- rbind(nodes_source, nodes_target) %>% mutate(node = 0:(nrow(.) - 1))
|
||||||
|
|
||||||
|
## Set up `links`
|
||||||
|
links <-
|
||||||
|
data %>%
|
||||||
|
mutate(group = paste0(group, " ")) %>%
|
||||||
|
select(source = "pre_group",
|
||||||
|
target = "group",
|
||||||
|
value = count)
|
||||||
|
|
||||||
|
nodes_source <- nodes_source %>% select(name) # Make `nodes` a single column data frame
|
||||||
|
nodes_target <- nodes_target %>% select(name) # Make `nodes` a single column data frame
|
||||||
|
|
||||||
|
links <-
|
||||||
|
links %>%
|
||||||
|
left_join(nodes %>% rename(IDsource = "node"), by = c("source" = "name")) %>%
|
||||||
|
left_join(nodes %>% rename(IDtarget = "node"), by = c("target" = "name"))
|
||||||
|
|
||||||
|
|
||||||
|
networkD3::sankeyNetwork(Links = as.data.frame(links),
|
||||||
|
Nodes = as.data.frame(nodes),
|
||||||
|
Source = 'IDsource', # Change reference to IDsource
|
||||||
|
Target = 'IDtarget', # Change reference to IDtarget
|
||||||
|
Value = 'value',
|
||||||
|
NodeID = 'name',
|
||||||
|
units="count",
|
||||||
|
sinksRight = FALSE)
|
||||||
|
}
|
|
@ -0,0 +1,125 @@
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#' @title Uncover HR attributes which best represent a population for a Person to Person query
|
||||||
|
#'
|
||||||
|
#' @author Tannaz Sattari Tabrizi <Tannaz.Sattari@microsoft.com>
|
||||||
|
#'
|
||||||
|
#' @description
|
||||||
|
#' Returns a data frame that gives a percentage of the group combinations that best represent
|
||||||
|
#' the population provided. Uses a person to person query.
|
||||||
|
#'
|
||||||
|
#' @param data Data frame for a person to person query.
|
||||||
|
#' @param hrvar Character vector of length 3 containing the HR attributes to be used.
|
||||||
|
#'
|
||||||
|
#' @import dplyr
|
||||||
|
#' @import tidyr
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
network_describe <- function(data, hrvar = c("Organization", "LevelDesignation", "FunctionType")){
|
||||||
|
|
||||||
|
if(length(hrvar) != 3){
|
||||||
|
|
||||||
|
stop("Please provide a character vector of length 3 for `hrvar`")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
## De-duplicated data containing only TieOrigins
|
||||||
|
filtered_Data <- unique(select(data, starts_with("TieOrigin_")))
|
||||||
|
|
||||||
|
## Select features
|
||||||
|
features <- select(filtered_Data, paste0("TieOrigin_", hrvar))
|
||||||
|
|
||||||
|
## Feature set: 1
|
||||||
|
max_percentages_1f <-
|
||||||
|
features %>%
|
||||||
|
colnames() %>%
|
||||||
|
purrr::map(function(c){
|
||||||
|
|
||||||
|
agg <-
|
||||||
|
features %>%
|
||||||
|
group_by_at(.vars = vars(c)) %>%
|
||||||
|
summarise(count = n(), .groups = "drop") %>%
|
||||||
|
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||||
|
|
||||||
|
agg %>%
|
||||||
|
arrange(desc(percentage)) %>%
|
||||||
|
slice(1) %>% # Extract first row
|
||||||
|
mutate(feature_1 = c,
|
||||||
|
feature_1_value = !!sym(c)) %>%
|
||||||
|
select(feature_1, feature_1_value, Percentage = "percentage")
|
||||||
|
}) %>%
|
||||||
|
bind_rows()
|
||||||
|
|
||||||
|
## Feature set: 2
|
||||||
|
max_percentages_2f <-
|
||||||
|
list(c1 = colnames(features),
|
||||||
|
c2 = colnames(features)) %>%
|
||||||
|
expand.grid(stringsAsFactors = FALSE) %>%
|
||||||
|
filter(c1 != c2) %>%
|
||||||
|
purrr::pmap(function(c1, c2){
|
||||||
|
agg <-
|
||||||
|
features %>%
|
||||||
|
group_by_at(.vars=vars(c1, c2)) %>%
|
||||||
|
summarise(count = n(), .groups = "drop") %>%
|
||||||
|
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||||
|
|
||||||
|
agg %>%
|
||||||
|
arrange(desc(percentage)) %>%
|
||||||
|
slice(1) %>% # Extract first row
|
||||||
|
mutate(feature_1 = c1,
|
||||||
|
feature_1_value = !!sym(as.character(c1)),
|
||||||
|
feature_2 = c2,
|
||||||
|
feature_2_value = !!sym(as.character(c2))) %>%
|
||||||
|
select(feature_1,
|
||||||
|
feature_1_value,
|
||||||
|
feature_2,
|
||||||
|
feature_2_value,
|
||||||
|
Percentage = "percentage")
|
||||||
|
}) %>%
|
||||||
|
bind_rows()
|
||||||
|
|
||||||
|
|
||||||
|
## Feature set: 3
|
||||||
|
max_percentages_3f <-
|
||||||
|
list(c1 = colnames(features),
|
||||||
|
c2 = colnames(features),
|
||||||
|
c3 = colnames(features)) %>%
|
||||||
|
expand.grid(stringsAsFactors = FALSE) %>%
|
||||||
|
filter(c1 != c2,
|
||||||
|
c2 != c3,
|
||||||
|
c3 != c1) %>%
|
||||||
|
purrr::pmap(function(c1, c2, c3){
|
||||||
|
agg <-
|
||||||
|
features %>%
|
||||||
|
group_by_at(.vars=vars(c1, c2, c3)) %>%
|
||||||
|
summarise(count = n(), .groups = "drop") %>%
|
||||||
|
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||||
|
|
||||||
|
agg %>%
|
||||||
|
arrange(desc(percentage)) %>%
|
||||||
|
slice(1) %>% # Extract first row
|
||||||
|
mutate(feature_1 = c1,
|
||||||
|
feature_1_value = !!sym(c1),
|
||||||
|
feature_2 = c2,
|
||||||
|
feature_2_value = !!sym(c2),
|
||||||
|
feature_3 = c3,
|
||||||
|
feature_3_value = !!sym(c3)) %>%
|
||||||
|
select(feature_1,
|
||||||
|
feature_1_value,
|
||||||
|
feature_2,
|
||||||
|
feature_2_value,
|
||||||
|
feature_3,
|
||||||
|
feature_3_value,
|
||||||
|
Percentage = "percentage")
|
||||||
|
}) %>%
|
||||||
|
bind_rows()
|
||||||
|
|
||||||
|
list(max_percentages_1f,
|
||||||
|
max_percentages_2f,
|
||||||
|
max_percentages_3f) %>%
|
||||||
|
bind_rows() %>%
|
||||||
|
select(starts_with("feature"), Percentage)
|
||||||
|
}
|
|
@ -0,0 +1,225 @@
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#' @title Implement the Leiden community detection on a Person to Person network query
|
||||||
|
#'
|
||||||
|
#' @description
|
||||||
|
#' Take a P2P network query and implement the Leiden community detection method. To run
|
||||||
|
#' this function, you will require all the pre-requisites of the **leiden** package installed,
|
||||||
|
#' which includes Python and **reticulate**.
|
||||||
|
#'
|
||||||
|
#' @param data Data frame containing a Person to Person query.
|
||||||
|
#' @param hrvar String containing the HR attribute to be matched in the dataset.
|
||||||
|
#' @param bg_fill String to specify background fill colour.
|
||||||
|
#' @param font_col String to specify font and link colour.
|
||||||
|
#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
|
||||||
|
#' @param path File path for saving the PDF output. Defaults to "network_p2p_leiden".
|
||||||
|
#' Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||||
|
#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||||
|
#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
|
||||||
|
#' if returning anything other than "plot-leiden" or "plot-hrvar".
|
||||||
|
#'
|
||||||
|
#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||||
|
#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||||
|
#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
|
||||||
|
#'
|
||||||
|
#' @param res Resolution parameter to be passed to `leiden::leiden()`. Defaults to 0.5.
|
||||||
|
#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
|
||||||
|
#' "describe" output. See `network_describe()`.
|
||||||
|
#' @param return String specifying what output to return. Valid return options include:
|
||||||
|
#' - 'plot-leiden': return a network plot coloured by leiden communities.
|
||||||
|
#' - 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||||
|
#' - 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||||
|
#' - 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||||
|
#' - 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||||
|
#' - 'describe': return a list of data frames which describe each of the identified communities.
|
||||||
|
#' - 'network': return igraph object.
|
||||||
|
#'
|
||||||
|
#' @import dplyr
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
network_leiden <- function(data,
|
||||||
|
hrvar,
|
||||||
|
bg_fill = "#000000",
|
||||||
|
font_col = "#FFFFFF",
|
||||||
|
algorithm = "mds",
|
||||||
|
path = "network_p2p_leiden",
|
||||||
|
node_alpha = 0.8,
|
||||||
|
res = 0.5,
|
||||||
|
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||||
|
return){
|
||||||
|
|
||||||
|
## Set variables
|
||||||
|
TO_hrvar <- paste0("TieOrigin_", hrvar)
|
||||||
|
TD_hrvar <- paste0("TieDestination_", hrvar)
|
||||||
|
|
||||||
|
## Set edges df
|
||||||
|
edges <-
|
||||||
|
data %>%
|
||||||
|
select(from = "TieOrigin_PersonId",
|
||||||
|
to = "TieDestination_PersonId",
|
||||||
|
weight = "StrongTieScore")
|
||||||
|
|
||||||
|
## Vertices data frame to provide meta-data
|
||||||
|
vert_ft <-
|
||||||
|
rbind(
|
||||||
|
# TieOrigin
|
||||||
|
edges %>%
|
||||||
|
select(from) %>% # Single column
|
||||||
|
unique() %>% # Remove duplications
|
||||||
|
left_join(select(data, TieOrigin_PersonId, TO_hrvar),
|
||||||
|
by = c("from" = "TieOrigin_PersonId")) %>%
|
||||||
|
select(node = "from", !!sym(hrvar) := TO_hrvar),
|
||||||
|
|
||||||
|
# TieDestination
|
||||||
|
edges %>%
|
||||||
|
select(to) %>% # Single column
|
||||||
|
unique() %>% # Remove duplications
|
||||||
|
left_join(select(data, TieDestination_PersonId, TD_hrvar),
|
||||||
|
by = c("to" = "TieDestination_PersonId")) %>%
|
||||||
|
select(node = "to", !!sym(hrvar) := TD_hrvar)
|
||||||
|
)
|
||||||
|
|
||||||
|
## Create igraph object
|
||||||
|
g_raw <-
|
||||||
|
igraph::graph_from_data_frame(edges,
|
||||||
|
directed = TRUE, # Directed, but FALSE for visualization
|
||||||
|
vertices = unique(vert_ft)) # remove duplicates
|
||||||
|
|
||||||
|
## Return a numeric vector of partitions / clusters / modules
|
||||||
|
## Set a low resolution parameter to have fewer groups
|
||||||
|
ld <- leiden::leiden(g_raw, resolution_parameter = res) # create partitions
|
||||||
|
|
||||||
|
## Add cluster
|
||||||
|
g <-
|
||||||
|
g_raw %>%
|
||||||
|
# Add leiden partitions to graph object
|
||||||
|
igraph::set_vertex_attr("cluster", value = as.character(ld)) %>%
|
||||||
|
igraph::simplify()
|
||||||
|
|
||||||
|
## Create vertex table
|
||||||
|
vertex_tb <-
|
||||||
|
g %>%
|
||||||
|
igraph::get.vertex.attribute() %>%
|
||||||
|
as_tibble()
|
||||||
|
|
||||||
|
g_layout <-
|
||||||
|
g %>%
|
||||||
|
ggraph::ggraph(layout = "igraph", algorithm = algorithm)
|
||||||
|
|
||||||
|
## Return
|
||||||
|
if(return == "plot-leiden"){
|
||||||
|
|
||||||
|
plot_output <-
|
||||||
|
g_layout +
|
||||||
|
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||||
|
ggraph::geom_node_point(aes(colour = cluster),
|
||||||
|
alpha = node_alpha,
|
||||||
|
pch = 16) +
|
||||||
|
theme_void() +
|
||||||
|
theme(legend.position = "bottom",
|
||||||
|
legend.background = element_rect(fill = bg_fill),
|
||||||
|
plot.background = element_rect(fill = bg_fill),
|
||||||
|
text = element_text(colour = font_col),
|
||||||
|
axis.line = element_blank()) +
|
||||||
|
labs(title = "Person to person collaboration with Community Detection",
|
||||||
|
subtitle = "Based on Leiden algorithm and Strong Tie Score",
|
||||||
|
y = "",
|
||||||
|
x = "")
|
||||||
|
|
||||||
|
# Default PDF output unless NULL supplied to path
|
||||||
|
if(is.null(path)){
|
||||||
|
|
||||||
|
plot_output
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||||
|
plot = plot_output,
|
||||||
|
width = 16,
|
||||||
|
height = 9)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(return == "plot-hrvar"){
|
||||||
|
|
||||||
|
plot_output <-
|
||||||
|
g_layout +
|
||||||
|
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||||
|
ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
|
||||||
|
alpha = node_alpha,
|
||||||
|
pch = 16) +
|
||||||
|
theme_void() +
|
||||||
|
theme(legend.position = "bottom",
|
||||||
|
legend.background = element_rect(fill = bg_fill),
|
||||||
|
plot.background = element_rect(fill = bg_fill),
|
||||||
|
text = element_text(colour = font_col),
|
||||||
|
axis.line = element_blank()) +
|
||||||
|
labs(title = "Person to person collaboration",
|
||||||
|
subtitle = paste0("Showing ", hrvar),
|
||||||
|
y = "",
|
||||||
|
x = "")
|
||||||
|
|
||||||
|
# Default PDF output unless NULL supplied to path
|
||||||
|
if(is.null(path)){
|
||||||
|
|
||||||
|
plot_output
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||||
|
plot = plot_output,
|
||||||
|
width = 16,
|
||||||
|
height = 9)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(return == "table"){
|
||||||
|
|
||||||
|
vertex_tb %>%
|
||||||
|
count(!!sym(hrvar), cluster)
|
||||||
|
|
||||||
|
} else if(return == "data"){
|
||||||
|
|
||||||
|
vertex_tb
|
||||||
|
|
||||||
|
} else if(return == "network"){
|
||||||
|
|
||||||
|
g
|
||||||
|
|
||||||
|
|
||||||
|
} else if(return == "plot-sankey"){
|
||||||
|
|
||||||
|
create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
|
||||||
|
var1 = hrvar,
|
||||||
|
var2 = "cluster",
|
||||||
|
count = "n")
|
||||||
|
|
||||||
|
} else if(return == "describe"){
|
||||||
|
|
||||||
|
describe_tb <-
|
||||||
|
vertex_tb %>%
|
||||||
|
left_join(select(data, starts_with("TieOrigin_")),
|
||||||
|
by = c("name" = "TieOrigin_PersonId"))
|
||||||
|
|
||||||
|
desc_str <-
|
||||||
|
describe_tb %>%
|
||||||
|
pull(cluster) %>%
|
||||||
|
unique()
|
||||||
|
|
||||||
|
desc_str %>%
|
||||||
|
purrr::map(function(x){
|
||||||
|
describe_tb %>%
|
||||||
|
filter(cluster == x) %>%
|
||||||
|
network_describe(hrvar = desc_hrvar)
|
||||||
|
}) %>%
|
||||||
|
setNames(nm = desc_str)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
stop("Please enter a valid input for `return`.")
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,222 @@
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||||
|
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||||
|
# --------------------------------------------------------------------------------------------
|
||||||
|
|
||||||
|
#' @title Implement the Louvain community detection on a Person to Person network query
|
||||||
|
#'
|
||||||
|
#' @description
|
||||||
|
#' Take a P2P network query and implement the Louvain community detection method. The
|
||||||
|
#' **igraph** implementation of the Louvain method is used.
|
||||||
|
#'
|
||||||
|
#' @param data Data frame containing a Person to Person query.
|
||||||
|
#' @param hrvar String containing the HR attribute to be matched in the dataset.
|
||||||
|
#' @param bg_fill String to specify background fill colour.
|
||||||
|
#' @param font_col String to specify font and link colour.
|
||||||
|
#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
|
||||||
|
#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||||
|
#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||||
|
#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
|
||||||
|
#' @param path File path for saving the PDF output. Defaults to "network_p2p_louvain".
|
||||||
|
#' Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||||
|
#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||||
|
#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
|
||||||
|
#' if returning anything other than "plot-louvain" or "plot-hrvar".
|
||||||
|
#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
|
||||||
|
#' "describe" output. See `network_describe()`.
|
||||||
|
#'
|
||||||
|
#' @param return String specifying what output to return.Valid return options include:
|
||||||
|
#' - 'plot-louvain': return a network plot coloured by louvain communities.
|
||||||
|
#' - 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||||
|
#' - 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||||
|
#' - 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||||
|
#' - 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||||
|
#' - 'describe': returns a list of data frames which describe each of the identified communities.
|
||||||
|
#' - 'network': return igraph object.
|
||||||
|
#'
|
||||||
|
#' @import ggraph
|
||||||
|
#' @import dplyr
|
||||||
|
#'
|
||||||
|
#' @export
|
||||||
|
network_louvain <- function(data,
|
||||||
|
hrvar,
|
||||||
|
bg_fill = "#000000",
|
||||||
|
font_col = "#FFFFFF",
|
||||||
|
node_alpha = 0.8,
|
||||||
|
algorithm = "mds",
|
||||||
|
path = "network_p2p_louvain",
|
||||||
|
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||||
|
return){
|
||||||
|
|
||||||
|
## Set variables
|
||||||
|
TO_hrvar <- paste0("TieOrigin_", hrvar)
|
||||||
|
TD_hrvar <- paste0("TieDestination_", hrvar)
|
||||||
|
|
||||||
|
## Set edges df
|
||||||
|
edges <-
|
||||||
|
data %>%
|
||||||
|
select(from = "TieOrigin_PersonId",
|
||||||
|
to = "TieDestination_PersonId",
|
||||||
|
weight = "StrongTieScore")
|
||||||
|
|
||||||
|
## Vertices data frame to provide meta-data
|
||||||
|
vert_ft <-
|
||||||
|
rbind(
|
||||||
|
# TieOrigin
|
||||||
|
edges %>%
|
||||||
|
select(from) %>% # Single column
|
||||||
|
unique() %>% # Remove duplications
|
||||||
|
left_join(select(data, TieOrigin_PersonId, TO_hrvar),
|
||||||
|
by = c("from" = "TieOrigin_PersonId")) %>%
|
||||||
|
select(node = "from", !!sym(hrvar) := TO_hrvar),
|
||||||
|
|
||||||
|
# TieDestination
|
||||||
|
edges %>%
|
||||||
|
select(to) %>% # Single column
|
||||||
|
unique() %>% # Remove duplications
|
||||||
|
left_join(select(data, TieDestination_PersonId, TD_hrvar),
|
||||||
|
by = c("to" = "TieDestination_PersonId")) %>%
|
||||||
|
select(node = "to", !!sym(hrvar) := TD_hrvar)
|
||||||
|
)
|
||||||
|
|
||||||
|
## Create igraph object
|
||||||
|
g_raw <-
|
||||||
|
igraph::graph_from_data_frame(edges,
|
||||||
|
directed = FALSE, # Set to undirected for clustering
|
||||||
|
vertices = unique(vert_ft)) # remove duplicates
|
||||||
|
|
||||||
|
## Return a numeric vector of partitions / clusters / modules
|
||||||
|
## Set a low resolution parameter to have fewer groups
|
||||||
|
lc <- igraph::cluster_louvain(g_raw)
|
||||||
|
|
||||||
|
## Add cluster
|
||||||
|
g <-
|
||||||
|
g_raw %>%
|
||||||
|
# Add louvain partitions to graph object
|
||||||
|
igraph::set_vertex_attr("cluster", value = as.character(igraph::membership(lc))) %>% # Return membership - diff from Leiden
|
||||||
|
igraph::simplify()
|
||||||
|
|
||||||
|
## Create vertex table
|
||||||
|
vertex_tb <-
|
||||||
|
g %>%
|
||||||
|
igraph::get.vertex.attribute() %>%
|
||||||
|
as_tibble()
|
||||||
|
|
||||||
|
g_layout <-
|
||||||
|
g %>%
|
||||||
|
ggraph::ggraph(layout = "igraph", algorithm = algorithm)
|
||||||
|
|
||||||
|
## Return
|
||||||
|
if(return == "plot-louvain"){
|
||||||
|
|
||||||
|
plot_output <-
|
||||||
|
g_layout +
|
||||||
|
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||||
|
ggraph::geom_node_point(aes(colour = cluster),
|
||||||
|
alpha = node_alpha,
|
||||||
|
pch = 16) +
|
||||||
|
theme_void() +
|
||||||
|
theme(legend.position = "bottom",
|
||||||
|
legend.background = element_rect(fill = bg_fill),
|
||||||
|
plot.background = element_rect(fill = bg_fill),
|
||||||
|
text = element_text(colour = font_col),
|
||||||
|
axis.line = element_blank()) +
|
||||||
|
labs(title = "Person to person collaboration with Community Detection",
|
||||||
|
subtitle = "Based on Louvain algorithm and Strong Tie Score",
|
||||||
|
y = "",
|
||||||
|
x = "")
|
||||||
|
|
||||||
|
# Default PDF output unless NULL supplied to path
|
||||||
|
if(is.null(path)){
|
||||||
|
|
||||||
|
plot_output
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||||
|
plot = plot_output,
|
||||||
|
width = 16,
|
||||||
|
height = 9)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(return == "plot-hrvar"){
|
||||||
|
|
||||||
|
plot_output <-
|
||||||
|
g_layout +
|
||||||
|
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||||
|
ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
|
||||||
|
alpha = node_alpha,
|
||||||
|
pch = 16) +
|
||||||
|
theme_void() +
|
||||||
|
theme(legend.position = "bottom",
|
||||||
|
legend.background = element_rect(fill = bg_fill),
|
||||||
|
plot.background = element_rect(fill = bg_fill),
|
||||||
|
text = element_text(colour = font_col),
|
||||||
|
axis.line = element_blank()) +
|
||||||
|
labs(title = "Person to person collaboration",
|
||||||
|
subtitle = paste0("Showing ", hrvar),
|
||||||
|
y = "",
|
||||||
|
x = "")
|
||||||
|
|
||||||
|
# Default PDF output unless NULL supplied to path
|
||||||
|
if(is.null(path)){
|
||||||
|
|
||||||
|
plot_output
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||||
|
plot = plot_output,
|
||||||
|
width = 16,
|
||||||
|
height = 9)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
} else if(return == "table"){
|
||||||
|
|
||||||
|
vertex_tb %>%
|
||||||
|
count(!!sym(hrvar), cluster)
|
||||||
|
|
||||||
|
} else if(return == "data"){
|
||||||
|
|
||||||
|
vertex_tb
|
||||||
|
|
||||||
|
} else if(return == "network"){
|
||||||
|
|
||||||
|
g
|
||||||
|
|
||||||
|
|
||||||
|
} else if(return == "plot-sankey"){
|
||||||
|
|
||||||
|
create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
|
||||||
|
var1 = hrvar,
|
||||||
|
var2 = "cluster",
|
||||||
|
count = "n")
|
||||||
|
|
||||||
|
} else if(return == "describe"){
|
||||||
|
|
||||||
|
describe_tb <-
|
||||||
|
vertex_tb %>%
|
||||||
|
left_join(select(data, starts_with("TieOrigin_")),
|
||||||
|
by = c("name" = "TieOrigin_PersonId"))
|
||||||
|
|
||||||
|
desc_str <-
|
||||||
|
describe_tb %>%
|
||||||
|
pull(cluster) %>%
|
||||||
|
unique()
|
||||||
|
|
||||||
|
desc_str %>%
|
||||||
|
purrr::map(function(x){
|
||||||
|
describe_tb %>%
|
||||||
|
filter(cluster == x) %>%
|
||||||
|
network_describe(hrvar = desc_hrvar)
|
||||||
|
}) %>%
|
||||||
|
setNames(nm = desc_str)
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
stop("Please enter a valid input for `return`.")
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
|
@ -5,16 +5,25 @@
|
||||||
#' within `tm_cooc()`.
|
#' within `tm_cooc()`.
|
||||||
#'
|
#'
|
||||||
#' @param data Data frame output from `tm_clean()`.
|
#' @param data Data frame output from `tm_clean()`.
|
||||||
#' @param id String to represent the id variable. Defaults to "word".
|
#' @param id String to represent the id variable. Defaults to "line".
|
||||||
#' @param word String to represent the word variable. Defaults to "word".
|
#' @param word String to represent the word variable. Defaults to "word".
|
||||||
#'
|
#'
|
||||||
#' @import data.table
|
#' @importFrom data.table ":=" "%like%" "%between%" rbindlist as.data.table
|
||||||
|
#'
|
||||||
|
#' @examples
|
||||||
|
#' td <- data.frame(line = c(1, 1, 2, 2),
|
||||||
|
#' word = c("work", "meeting", "catch", "up"))
|
||||||
|
#'
|
||||||
|
#' pairwise_count(td, id = "line", word = "word")
|
||||||
#'
|
#'
|
||||||
#' @export
|
#' @export
|
||||||
pairwise_count <- function(data,
|
pairwise_count <- function(data,
|
||||||
id = "line",
|
id = "line",
|
||||||
word = "word"){
|
word = "word"){
|
||||||
|
|
||||||
|
# Make sure data.table knows we know we're using it
|
||||||
|
.datatable.aware = TRUE
|
||||||
|
|
||||||
data <-
|
data <-
|
||||||
data %>%
|
data %>%
|
||||||
dplyr::rename(word := !!sym(word),
|
dplyr::rename(word := !!sym(word),
|
||||||
|
|
|
@ -0,0 +1,35 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/create_sankey.R
|
||||||
|
\name{create_sankey}
|
||||||
|
\alias{create_sankey}
|
||||||
|
\title{Create a sankey chart from a two-column count table}
|
||||||
|
\usage{
|
||||||
|
create_sankey(data, var1, var2, count = "n")
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{Data frame of the long count table.}
|
||||||
|
|
||||||
|
\item{var1}{String containing the name of the variable to be shown on the left.}
|
||||||
|
|
||||||
|
\item{var2}{String containing the name of the variable to be shown on the right.}
|
||||||
|
|
||||||
|
\item{count}{String containing the name of the count variable.}
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Create a networkD3 style sankey chart based on a long count table
|
||||||
|
with two variables. The input data should have three columns, where
|
||||||
|
each row is a unique group:
|
||||||
|
\enumerate{
|
||||||
|
\item Variable 1
|
||||||
|
\item Variable 2
|
||||||
|
\item Count
|
||||||
|
}
|
||||||
|
}
|
||||||
|
\examples{
|
||||||
|
\donttest{
|
||||||
|
sq_data \%>\%
|
||||||
|
dplyr::count(Organization, FunctionType) \%>\%
|
||||||
|
create_sankey(var1 = "Organization", var2 = "FunctionType")
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
|
@ -0,0 +1,23 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/network_describe.R
|
||||||
|
\name{network_describe}
|
||||||
|
\alias{network_describe}
|
||||||
|
\title{Uncover HR attributes which best represent a population for a Person to Person query}
|
||||||
|
\usage{
|
||||||
|
network_describe(
|
||||||
|
data,
|
||||||
|
hrvar = c("Organization", "LevelDesignation", "FunctionType")
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{Data frame for a person to person query.}
|
||||||
|
|
||||||
|
\item{hrvar}{Character vector of length 3 containing the HR attributes to be used.}
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Returns a data frame that gives a percentage of the group combinations that best represent
|
||||||
|
the population provided. Uses a person to person query.
|
||||||
|
}
|
||||||
|
\author{
|
||||||
|
Tannaz Sattari Tabrizi \href{mailto:Tannaz.Sattari@microsoft.com}{Tannaz.Sattari@microsoft.com}
|
||||||
|
}
|
|
@ -0,0 +1,61 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/network_leiden.R
|
||||||
|
\name{network_leiden}
|
||||||
|
\alias{network_leiden}
|
||||||
|
\title{Implement the Leiden community detection on a Person to Person network query}
|
||||||
|
\usage{
|
||||||
|
network_leiden(
|
||||||
|
data,
|
||||||
|
hrvar,
|
||||||
|
bg_fill = "#000000",
|
||||||
|
font_col = "#FFFFFF",
|
||||||
|
algorithm = "mds",
|
||||||
|
path = "network_p2p_leiden",
|
||||||
|
node_alpha = 0.8,
|
||||||
|
res = 0.5,
|
||||||
|
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||||
|
return
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{Data frame containing a Person to Person query.}
|
||||||
|
|
||||||
|
\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
|
||||||
|
|
||||||
|
\item{bg_fill}{String to specify background fill colour.}
|
||||||
|
|
||||||
|
\item{font_col}{String to specify font and link colour.}
|
||||||
|
|
||||||
|
\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||||
|
a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||||
|
See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
|
||||||
|
|
||||||
|
\item{path}{File path for saving the PDF output. Defaults to "network_p2p_leiden".
|
||||||
|
Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||||
|
saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||||
|
a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
|
||||||
|
if returning anything other than "plot-leiden" or "plot-hrvar".}
|
||||||
|
|
||||||
|
\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
|
||||||
|
|
||||||
|
\item{res}{Resolution parameter to be passed to \code{leiden::leiden()}. Defaults to 0.5.}
|
||||||
|
|
||||||
|
\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
|
||||||
|
"describe" output. See \code{network_describe()}.}
|
||||||
|
|
||||||
|
\item{return}{String specifying what output to return. Valid return options include:
|
||||||
|
\itemize{
|
||||||
|
\item 'plot-leiden': return a network plot coloured by leiden communities.
|
||||||
|
\item 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||||
|
\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||||
|
\item 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||||
|
\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||||
|
\item 'describe': return a list of data frames which describe each of the identified communities.
|
||||||
|
\item 'network': return igraph object.
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Take a P2P network query and implement the Leiden community detection method. To run
|
||||||
|
this function, you will require all the pre-requisites of the \strong{leiden} package installed,
|
||||||
|
which includes Python and \strong{reticulate}.
|
||||||
|
}
|
|
@ -0,0 +1,57 @@
|
||||||
|
% Generated by roxygen2: do not edit by hand
|
||||||
|
% Please edit documentation in R/network_louvain.R
|
||||||
|
\name{network_louvain}
|
||||||
|
\alias{network_louvain}
|
||||||
|
\title{Implement the Louvain community detection on a Person to Person network query}
|
||||||
|
\usage{
|
||||||
|
network_louvain(
|
||||||
|
data,
|
||||||
|
hrvar,
|
||||||
|
bg_fill = "#000000",
|
||||||
|
font_col = "#FFFFFF",
|
||||||
|
node_alpha = 0.8,
|
||||||
|
algorithm = "mds",
|
||||||
|
path = "network_p2p_louvain",
|
||||||
|
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||||
|
return
|
||||||
|
)
|
||||||
|
}
|
||||||
|
\arguments{
|
||||||
|
\item{data}{Data frame containing a Person to Person query.}
|
||||||
|
|
||||||
|
\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
|
||||||
|
|
||||||
|
\item{bg_fill}{String to specify background fill colour.}
|
||||||
|
|
||||||
|
\item{font_col}{String to specify font and link colour.}
|
||||||
|
|
||||||
|
\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
|
||||||
|
|
||||||
|
\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||||
|
a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||||
|
See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
|
||||||
|
|
||||||
|
\item{path}{File path for saving the PDF output. Defaults to "network_p2p_louvain".
|
||||||
|
Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||||
|
saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||||
|
a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
|
||||||
|
if returning anything other than "plot-louvain" or "plot-hrvar".}
|
||||||
|
|
||||||
|
\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
|
||||||
|
"describe" output. See \code{network_describe()}.}
|
||||||
|
|
||||||
|
\item{return}{String specifying what output to return.Valid return options include:
|
||||||
|
\itemize{
|
||||||
|
\item 'plot-louvain': return a network plot coloured by louvain communities.
|
||||||
|
\item 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||||
|
\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||||
|
\item 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||||
|
\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||||
|
\item 'describe': returns a list of data frames which describe each of the identified communities.
|
||||||
|
\item 'network': return igraph object.
|
||||||
|
}}
|
||||||
|
}
|
||||||
|
\description{
|
||||||
|
Take a P2P network query and implement the Louvain community detection method. The
|
||||||
|
\strong{igraph} implementation of the Louvain method is used.
|
||||||
|
}
|
|
@ -9,7 +9,7 @@ pairwise_count(data, id = "line", word = "word")
|
||||||
\arguments{
|
\arguments{
|
||||||
\item{data}{Data frame output from \code{tm_clean()}.}
|
\item{data}{Data frame output from \code{tm_clean()}.}
|
||||||
|
|
||||||
\item{id}{String to represent the id variable. Defaults to "word".}
|
\item{id}{String to represent the id variable. Defaults to "line".}
|
||||||
|
|
||||||
\item{word}{String to represent the word variable. Defaults to "word".}
|
\item{word}{String to represent the word variable. Defaults to "word".}
|
||||||
}
|
}
|
||||||
|
@ -18,3 +18,10 @@ This is a \strong{data.table} implementation that mimics the output of
|
||||||
\code{widyr::pairwise_count()} to reduce package dependency. This is used internally
|
\code{widyr::pairwise_count()} to reduce package dependency. This is used internally
|
||||||
within \code{tm_cooc()}.
|
within \code{tm_cooc()}.
|
||||||
}
|
}
|
||||||
|
\examples{
|
||||||
|
td <- data.frame(line = c(1, 1, 2, 2),
|
||||||
|
word = c("work", "meeting", "catch", "up"))
|
||||||
|
|
||||||
|
pairwise_count(td, id = "line", word = "word")
|
||||||
|
|
||||||
|
}
|
||||||
|
|
Загрузка…
Ссылка в новой задаче