зеркало из https://github.com/microsoft/wpa.git
Merge pull request #32 from microsoft/feature/network-community-detection
Feature: add Community Detection capability for network plots
This commit is contained in:
Коммит
2118a61a1a
|
@ -63,5 +63,6 @@ RoxygenNote: 7.1.1
|
|||
Roxygen: list(markdown = TRUE)
|
||||
VignetteBuilder: knitr
|
||||
Suggests:
|
||||
extrafont
|
||||
extrafont,
|
||||
leiden
|
||||
Language: en-US
|
||||
|
|
|
@ -49,6 +49,7 @@ export(create_line)
|
|||
export(create_line_asis)
|
||||
export(create_period_scatter)
|
||||
export(create_rank)
|
||||
export(create_sankey)
|
||||
export(create_scatter)
|
||||
export(create_stacked)
|
||||
export(create_trend)
|
||||
|
@ -105,7 +106,10 @@ export(meetingtype_sum)
|
|||
export(meetingtype_summary)
|
||||
export(mgrcoatt_dist)
|
||||
export(mgrrel_matrix)
|
||||
export(network_describe)
|
||||
export(network_g2g)
|
||||
export(network_leiden)
|
||||
export(network_louvain)
|
||||
export(network_p2p)
|
||||
export(one2one_dist)
|
||||
export(one2one_fizz)
|
||||
|
@ -150,16 +154,18 @@ export(workpatterns_rank)
|
|||
export(wrap)
|
||||
import(DT)
|
||||
import(Information)
|
||||
import(data.table)
|
||||
import(dplyr)
|
||||
import(ggplot2)
|
||||
import(ggraph)
|
||||
import(reshape2)
|
||||
import(scales)
|
||||
import(tidyr)
|
||||
import(tidyselect)
|
||||
importFrom(data.table,"%between%")
|
||||
importFrom(data.table,"%like%")
|
||||
importFrom(data.table,":=")
|
||||
importFrom(data.table,as.data.table)
|
||||
importFrom(data.table,rbindlist)
|
||||
importFrom(dplyr,`%>%`)
|
||||
importFrom(dplyr,mutate_if)
|
||||
importFrom(grDevices,rainbow)
|
||||
|
|
|
@ -0,0 +1,72 @@
|
|||
# --------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||
# --------------------------------------------------------------------------------------------
|
||||
|
||||
#' @title Create a sankey chart from a two-column count table
|
||||
#'
|
||||
#' @description
|
||||
#' Create a networkD3 style sankey chart based on a long count table
|
||||
#' with two variables. The input data should have three columns, where
|
||||
#' each row is a unique group:
|
||||
#' 1. Variable 1
|
||||
#' 2. Variable 2
|
||||
#' 3. Count
|
||||
#'
|
||||
#' @param data Data frame of the long count table.
|
||||
#' @param var1 String containing the name of the variable to be shown on the left.
|
||||
#' @param var2 String containing the name of the variable to be shown on the right.
|
||||
#' @param count String containing the name of the count variable.
|
||||
#'
|
||||
#' @import dplyr
|
||||
#'
|
||||
#' @examples
|
||||
#' \donttest{
|
||||
#' sq_data %>%
|
||||
#' dplyr::count(Organization, FunctionType) %>%
|
||||
#' create_sankey(var1 = "Organization", var2 = "FunctionType")
|
||||
#' }
|
||||
#'
|
||||
#' @export
|
||||
create_sankey <- function(data, var1, var2, count = "n"){
|
||||
|
||||
## Rename
|
||||
data$pre_group <- data[[var1]]
|
||||
data$group <- data[[var2]]
|
||||
|
||||
## Set up `nodes`
|
||||
group_source <- unique(data$pre_group)
|
||||
group_target <- paste0(unique(data$group), " ")
|
||||
|
||||
groups <- c(group_source, group_target)
|
||||
|
||||
nodes_source <- tibble(name = group_source)
|
||||
nodes_target <- tibble(name = group_target)
|
||||
nodes <- rbind(nodes_source, nodes_target) %>% mutate(node = 0:(nrow(.) - 1))
|
||||
|
||||
## Set up `links`
|
||||
links <-
|
||||
data %>%
|
||||
mutate(group = paste0(group, " ")) %>%
|
||||
select(source = "pre_group",
|
||||
target = "group",
|
||||
value = count)
|
||||
|
||||
nodes_source <- nodes_source %>% select(name) # Make `nodes` a single column data frame
|
||||
nodes_target <- nodes_target %>% select(name) # Make `nodes` a single column data frame
|
||||
|
||||
links <-
|
||||
links %>%
|
||||
left_join(nodes %>% rename(IDsource = "node"), by = c("source" = "name")) %>%
|
||||
left_join(nodes %>% rename(IDtarget = "node"), by = c("target" = "name"))
|
||||
|
||||
|
||||
networkD3::sankeyNetwork(Links = as.data.frame(links),
|
||||
Nodes = as.data.frame(nodes),
|
||||
Source = 'IDsource', # Change reference to IDsource
|
||||
Target = 'IDtarget', # Change reference to IDtarget
|
||||
Value = 'value',
|
||||
NodeID = 'name',
|
||||
units="count",
|
||||
sinksRight = FALSE)
|
||||
}
|
|
@ -0,0 +1,125 @@
|
|||
# --------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||
# --------------------------------------------------------------------------------------------
|
||||
|
||||
#' @title Uncover HR attributes which best represent a population for a Person to Person query
|
||||
#'
|
||||
#' @author Tannaz Sattari Tabrizi <Tannaz.Sattari@microsoft.com>
|
||||
#'
|
||||
#' @description
|
||||
#' Returns a data frame that gives a percentage of the group combinations that best represent
|
||||
#' the population provided. Uses a person to person query.
|
||||
#'
|
||||
#' @param data Data frame for a person to person query.
|
||||
#' @param hrvar Character vector of length 3 containing the HR attributes to be used.
|
||||
#'
|
||||
#' @import dplyr
|
||||
#' @import tidyr
|
||||
#'
|
||||
#' @export
|
||||
network_describe <- function(data, hrvar = c("Organization", "LevelDesignation", "FunctionType")){
|
||||
|
||||
if(length(hrvar) != 3){
|
||||
|
||||
stop("Please provide a character vector of length 3 for `hrvar`")
|
||||
|
||||
}
|
||||
|
||||
## De-duplicated data containing only TieOrigins
|
||||
filtered_Data <- unique(select(data, starts_with("TieOrigin_")))
|
||||
|
||||
## Select features
|
||||
features <- select(filtered_Data, paste0("TieOrigin_", hrvar))
|
||||
|
||||
## Feature set: 1
|
||||
max_percentages_1f <-
|
||||
features %>%
|
||||
colnames() %>%
|
||||
purrr::map(function(c){
|
||||
|
||||
agg <-
|
||||
features %>%
|
||||
group_by_at(.vars = vars(c)) %>%
|
||||
summarise(count = n(), .groups = "drop") %>%
|
||||
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||
|
||||
agg %>%
|
||||
arrange(desc(percentage)) %>%
|
||||
slice(1) %>% # Extract first row
|
||||
mutate(feature_1 = c,
|
||||
feature_1_value = !!sym(c)) %>%
|
||||
select(feature_1, feature_1_value, Percentage = "percentage")
|
||||
}) %>%
|
||||
bind_rows()
|
||||
|
||||
## Feature set: 2
|
||||
max_percentages_2f <-
|
||||
list(c1 = colnames(features),
|
||||
c2 = colnames(features)) %>%
|
||||
expand.grid(stringsAsFactors = FALSE) %>%
|
||||
filter(c1 != c2) %>%
|
||||
purrr::pmap(function(c1, c2){
|
||||
agg <-
|
||||
features %>%
|
||||
group_by_at(.vars=vars(c1, c2)) %>%
|
||||
summarise(count = n(), .groups = "drop") %>%
|
||||
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||
|
||||
agg %>%
|
||||
arrange(desc(percentage)) %>%
|
||||
slice(1) %>% # Extract first row
|
||||
mutate(feature_1 = c1,
|
||||
feature_1_value = !!sym(as.character(c1)),
|
||||
feature_2 = c2,
|
||||
feature_2_value = !!sym(as.character(c2))) %>%
|
||||
select(feature_1,
|
||||
feature_1_value,
|
||||
feature_2,
|
||||
feature_2_value,
|
||||
Percentage = "percentage")
|
||||
}) %>%
|
||||
bind_rows()
|
||||
|
||||
|
||||
## Feature set: 3
|
||||
max_percentages_3f <-
|
||||
list(c1 = colnames(features),
|
||||
c2 = colnames(features),
|
||||
c3 = colnames(features)) %>%
|
||||
expand.grid(stringsAsFactors = FALSE) %>%
|
||||
filter(c1 != c2,
|
||||
c2 != c3,
|
||||
c3 != c1) %>%
|
||||
purrr::pmap(function(c1, c2, c3){
|
||||
agg <-
|
||||
features %>%
|
||||
group_by_at(.vars=vars(c1, c2, c3)) %>%
|
||||
summarise(count = n(), .groups = "drop") %>%
|
||||
mutate(percentage = count / sum(count, na.rm = TRUE))
|
||||
|
||||
agg %>%
|
||||
arrange(desc(percentage)) %>%
|
||||
slice(1) %>% # Extract first row
|
||||
mutate(feature_1 = c1,
|
||||
feature_1_value = !!sym(c1),
|
||||
feature_2 = c2,
|
||||
feature_2_value = !!sym(c2),
|
||||
feature_3 = c3,
|
||||
feature_3_value = !!sym(c3)) %>%
|
||||
select(feature_1,
|
||||
feature_1_value,
|
||||
feature_2,
|
||||
feature_2_value,
|
||||
feature_3,
|
||||
feature_3_value,
|
||||
Percentage = "percentage")
|
||||
}) %>%
|
||||
bind_rows()
|
||||
|
||||
list(max_percentages_1f,
|
||||
max_percentages_2f,
|
||||
max_percentages_3f) %>%
|
||||
bind_rows() %>%
|
||||
select(starts_with("feature"), Percentage)
|
||||
}
|
|
@ -0,0 +1,225 @@
|
|||
# --------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||
# --------------------------------------------------------------------------------------------
|
||||
|
||||
#' @title Implement the Leiden community detection on a Person to Person network query
|
||||
#'
|
||||
#' @description
|
||||
#' Take a P2P network query and implement the Leiden community detection method. To run
|
||||
#' this function, you will require all the pre-requisites of the **leiden** package installed,
|
||||
#' which includes Python and **reticulate**.
|
||||
#'
|
||||
#' @param data Data frame containing a Person to Person query.
|
||||
#' @param hrvar String containing the HR attribute to be matched in the dataset.
|
||||
#' @param bg_fill String to specify background fill colour.
|
||||
#' @param font_col String to specify font and link colour.
|
||||
#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
|
||||
#' @param path File path for saving the PDF output. Defaults to "network_p2p_leiden".
|
||||
#' Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||
#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||
#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
|
||||
#' if returning anything other than "plot-leiden" or "plot-hrvar".
|
||||
#'
|
||||
#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||
#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||
#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
|
||||
#'
|
||||
#' @param res Resolution parameter to be passed to `leiden::leiden()`. Defaults to 0.5.
|
||||
#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
|
||||
#' "describe" output. See `network_describe()`.
|
||||
#' @param return String specifying what output to return. Valid return options include:
|
||||
#' - 'plot-leiden': return a network plot coloured by leiden communities.
|
||||
#' - 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||
#' - 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||
#' - 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||
#' - 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||
#' - 'describe': return a list of data frames which describe each of the identified communities.
|
||||
#' - 'network': return igraph object.
|
||||
#'
|
||||
#' @import dplyr
|
||||
#'
|
||||
#' @export
|
||||
network_leiden <- function(data,
|
||||
hrvar,
|
||||
bg_fill = "#000000",
|
||||
font_col = "#FFFFFF",
|
||||
algorithm = "mds",
|
||||
path = "network_p2p_leiden",
|
||||
node_alpha = 0.8,
|
||||
res = 0.5,
|
||||
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||
return){
|
||||
|
||||
## Set variables
|
||||
TO_hrvar <- paste0("TieOrigin_", hrvar)
|
||||
TD_hrvar <- paste0("TieDestination_", hrvar)
|
||||
|
||||
## Set edges df
|
||||
edges <-
|
||||
data %>%
|
||||
select(from = "TieOrigin_PersonId",
|
||||
to = "TieDestination_PersonId",
|
||||
weight = "StrongTieScore")
|
||||
|
||||
## Vertices data frame to provide meta-data
|
||||
vert_ft <-
|
||||
rbind(
|
||||
# TieOrigin
|
||||
edges %>%
|
||||
select(from) %>% # Single column
|
||||
unique() %>% # Remove duplications
|
||||
left_join(select(data, TieOrigin_PersonId, TO_hrvar),
|
||||
by = c("from" = "TieOrigin_PersonId")) %>%
|
||||
select(node = "from", !!sym(hrvar) := TO_hrvar),
|
||||
|
||||
# TieDestination
|
||||
edges %>%
|
||||
select(to) %>% # Single column
|
||||
unique() %>% # Remove duplications
|
||||
left_join(select(data, TieDestination_PersonId, TD_hrvar),
|
||||
by = c("to" = "TieDestination_PersonId")) %>%
|
||||
select(node = "to", !!sym(hrvar) := TD_hrvar)
|
||||
)
|
||||
|
||||
## Create igraph object
|
||||
g_raw <-
|
||||
igraph::graph_from_data_frame(edges,
|
||||
directed = TRUE, # Directed, but FALSE for visualization
|
||||
vertices = unique(vert_ft)) # remove duplicates
|
||||
|
||||
## Return a numeric vector of partitions / clusters / modules
|
||||
## Set a low resolution parameter to have fewer groups
|
||||
ld <- leiden::leiden(g_raw, resolution_parameter = res) # create partitions
|
||||
|
||||
## Add cluster
|
||||
g <-
|
||||
g_raw %>%
|
||||
# Add leiden partitions to graph object
|
||||
igraph::set_vertex_attr("cluster", value = as.character(ld)) %>%
|
||||
igraph::simplify()
|
||||
|
||||
## Create vertex table
|
||||
vertex_tb <-
|
||||
g %>%
|
||||
igraph::get.vertex.attribute() %>%
|
||||
as_tibble()
|
||||
|
||||
g_layout <-
|
||||
g %>%
|
||||
ggraph::ggraph(layout = "igraph", algorithm = algorithm)
|
||||
|
||||
## Return
|
||||
if(return == "plot-leiden"){
|
||||
|
||||
plot_output <-
|
||||
g_layout +
|
||||
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||
ggraph::geom_node_point(aes(colour = cluster),
|
||||
alpha = node_alpha,
|
||||
pch = 16) +
|
||||
theme_void() +
|
||||
theme(legend.position = "bottom",
|
||||
legend.background = element_rect(fill = bg_fill),
|
||||
plot.background = element_rect(fill = bg_fill),
|
||||
text = element_text(colour = font_col),
|
||||
axis.line = element_blank()) +
|
||||
labs(title = "Person to person collaboration with Community Detection",
|
||||
subtitle = "Based on Leiden algorithm and Strong Tie Score",
|
||||
y = "",
|
||||
x = "")
|
||||
|
||||
# Default PDF output unless NULL supplied to path
|
||||
if(is.null(path)){
|
||||
|
||||
plot_output
|
||||
|
||||
} else {
|
||||
|
||||
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||
plot = plot_output,
|
||||
width = 16,
|
||||
height = 9)
|
||||
|
||||
}
|
||||
|
||||
} else if(return == "plot-hrvar"){
|
||||
|
||||
plot_output <-
|
||||
g_layout +
|
||||
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||
ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
|
||||
alpha = node_alpha,
|
||||
pch = 16) +
|
||||
theme_void() +
|
||||
theme(legend.position = "bottom",
|
||||
legend.background = element_rect(fill = bg_fill),
|
||||
plot.background = element_rect(fill = bg_fill),
|
||||
text = element_text(colour = font_col),
|
||||
axis.line = element_blank()) +
|
||||
labs(title = "Person to person collaboration",
|
||||
subtitle = paste0("Showing ", hrvar),
|
||||
y = "",
|
||||
x = "")
|
||||
|
||||
# Default PDF output unless NULL supplied to path
|
||||
if(is.null(path)){
|
||||
|
||||
plot_output
|
||||
|
||||
} else {
|
||||
|
||||
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||
plot = plot_output,
|
||||
width = 16,
|
||||
height = 9)
|
||||
|
||||
}
|
||||
|
||||
} else if(return == "table"){
|
||||
|
||||
vertex_tb %>%
|
||||
count(!!sym(hrvar), cluster)
|
||||
|
||||
} else if(return == "data"){
|
||||
|
||||
vertex_tb
|
||||
|
||||
} else if(return == "network"){
|
||||
|
||||
g
|
||||
|
||||
|
||||
} else if(return == "plot-sankey"){
|
||||
|
||||
create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
|
||||
var1 = hrvar,
|
||||
var2 = "cluster",
|
||||
count = "n")
|
||||
|
||||
} else if(return == "describe"){
|
||||
|
||||
describe_tb <-
|
||||
vertex_tb %>%
|
||||
left_join(select(data, starts_with("TieOrigin_")),
|
||||
by = c("name" = "TieOrigin_PersonId"))
|
||||
|
||||
desc_str <-
|
||||
describe_tb %>%
|
||||
pull(cluster) %>%
|
||||
unique()
|
||||
|
||||
desc_str %>%
|
||||
purrr::map(function(x){
|
||||
describe_tb %>%
|
||||
filter(cluster == x) %>%
|
||||
network_describe(hrvar = desc_hrvar)
|
||||
}) %>%
|
||||
setNames(nm = desc_str)
|
||||
|
||||
} else {
|
||||
|
||||
stop("Please enter a valid input for `return`.")
|
||||
|
||||
}
|
||||
}
|
|
@ -0,0 +1,222 @@
|
|||
# --------------------------------------------------------------------------------------------
|
||||
# Copyright (c) Microsoft Corporation. All rights reserved.
|
||||
# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
|
||||
# --------------------------------------------------------------------------------------------
|
||||
|
||||
#' @title Implement the Louvain community detection on a Person to Person network query
|
||||
#'
|
||||
#' @description
|
||||
#' Take a P2P network query and implement the Louvain community detection method. The
|
||||
#' **igraph** implementation of the Louvain method is used.
|
||||
#'
|
||||
#' @param data Data frame containing a Person to Person query.
|
||||
#' @param hrvar String containing the HR attribute to be matched in the dataset.
|
||||
#' @param bg_fill String to specify background fill colour.
|
||||
#' @param font_col String to specify font and link colour.
|
||||
#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
|
||||
#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||
#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||
#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
|
||||
#' @param path File path for saving the PDF output. Defaults to "network_p2p_louvain".
|
||||
#' Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||
#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||
#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
|
||||
#' if returning anything other than "plot-louvain" or "plot-hrvar".
|
||||
#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
|
||||
#' "describe" output. See `network_describe()`.
|
||||
#'
|
||||
#' @param return String specifying what output to return.Valid return options include:
|
||||
#' - 'plot-louvain': return a network plot coloured by louvain communities.
|
||||
#' - 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||
#' - 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||
#' - 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||
#' - 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||
#' - 'describe': returns a list of data frames which describe each of the identified communities.
|
||||
#' - 'network': return igraph object.
|
||||
#'
|
||||
#' @import ggraph
|
||||
#' @import dplyr
|
||||
#'
|
||||
#' @export
|
||||
network_louvain <- function(data,
|
||||
hrvar,
|
||||
bg_fill = "#000000",
|
||||
font_col = "#FFFFFF",
|
||||
node_alpha = 0.8,
|
||||
algorithm = "mds",
|
||||
path = "network_p2p_louvain",
|
||||
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||
return){
|
||||
|
||||
## Set variables
|
||||
TO_hrvar <- paste0("TieOrigin_", hrvar)
|
||||
TD_hrvar <- paste0("TieDestination_", hrvar)
|
||||
|
||||
## Set edges df
|
||||
edges <-
|
||||
data %>%
|
||||
select(from = "TieOrigin_PersonId",
|
||||
to = "TieDestination_PersonId",
|
||||
weight = "StrongTieScore")
|
||||
|
||||
## Vertices data frame to provide meta-data
|
||||
vert_ft <-
|
||||
rbind(
|
||||
# TieOrigin
|
||||
edges %>%
|
||||
select(from) %>% # Single column
|
||||
unique() %>% # Remove duplications
|
||||
left_join(select(data, TieOrigin_PersonId, TO_hrvar),
|
||||
by = c("from" = "TieOrigin_PersonId")) %>%
|
||||
select(node = "from", !!sym(hrvar) := TO_hrvar),
|
||||
|
||||
# TieDestination
|
||||
edges %>%
|
||||
select(to) %>% # Single column
|
||||
unique() %>% # Remove duplications
|
||||
left_join(select(data, TieDestination_PersonId, TD_hrvar),
|
||||
by = c("to" = "TieDestination_PersonId")) %>%
|
||||
select(node = "to", !!sym(hrvar) := TD_hrvar)
|
||||
)
|
||||
|
||||
## Create igraph object
|
||||
g_raw <-
|
||||
igraph::graph_from_data_frame(edges,
|
||||
directed = FALSE, # Set to undirected for clustering
|
||||
vertices = unique(vert_ft)) # remove duplicates
|
||||
|
||||
## Return a numeric vector of partitions / clusters / modules
|
||||
## Set a low resolution parameter to have fewer groups
|
||||
lc <- igraph::cluster_louvain(g_raw)
|
||||
|
||||
## Add cluster
|
||||
g <-
|
||||
g_raw %>%
|
||||
# Add louvain partitions to graph object
|
||||
igraph::set_vertex_attr("cluster", value = as.character(igraph::membership(lc))) %>% # Return membership - diff from Leiden
|
||||
igraph::simplify()
|
||||
|
||||
## Create vertex table
|
||||
vertex_tb <-
|
||||
g %>%
|
||||
igraph::get.vertex.attribute() %>%
|
||||
as_tibble()
|
||||
|
||||
g_layout <-
|
||||
g %>%
|
||||
ggraph::ggraph(layout = "igraph", algorithm = algorithm)
|
||||
|
||||
## Return
|
||||
if(return == "plot-louvain"){
|
||||
|
||||
plot_output <-
|
||||
g_layout +
|
||||
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||
ggraph::geom_node_point(aes(colour = cluster),
|
||||
alpha = node_alpha,
|
||||
pch = 16) +
|
||||
theme_void() +
|
||||
theme(legend.position = "bottom",
|
||||
legend.background = element_rect(fill = bg_fill),
|
||||
plot.background = element_rect(fill = bg_fill),
|
||||
text = element_text(colour = font_col),
|
||||
axis.line = element_blank()) +
|
||||
labs(title = "Person to person collaboration with Community Detection",
|
||||
subtitle = "Based on Louvain algorithm and Strong Tie Score",
|
||||
y = "",
|
||||
x = "")
|
||||
|
||||
# Default PDF output unless NULL supplied to path
|
||||
if(is.null(path)){
|
||||
|
||||
plot_output
|
||||
|
||||
} else {
|
||||
|
||||
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||
plot = plot_output,
|
||||
width = 16,
|
||||
height = 9)
|
||||
|
||||
}
|
||||
|
||||
} else if(return == "plot-hrvar"){
|
||||
|
||||
plot_output <-
|
||||
g_layout +
|
||||
ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
|
||||
ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
|
||||
alpha = node_alpha,
|
||||
pch = 16) +
|
||||
theme_void() +
|
||||
theme(legend.position = "bottom",
|
||||
legend.background = element_rect(fill = bg_fill),
|
||||
plot.background = element_rect(fill = bg_fill),
|
||||
text = element_text(colour = font_col),
|
||||
axis.line = element_blank()) +
|
||||
labs(title = "Person to person collaboration",
|
||||
subtitle = paste0("Showing ", hrvar),
|
||||
y = "",
|
||||
x = "")
|
||||
|
||||
# Default PDF output unless NULL supplied to path
|
||||
if(is.null(path)){
|
||||
|
||||
plot_output
|
||||
|
||||
} else {
|
||||
|
||||
ggsave(paste0(path, tstamp(), ".pdf"),
|
||||
plot = plot_output,
|
||||
width = 16,
|
||||
height = 9)
|
||||
|
||||
}
|
||||
|
||||
} else if(return == "table"){
|
||||
|
||||
vertex_tb %>%
|
||||
count(!!sym(hrvar), cluster)
|
||||
|
||||
} else if(return == "data"){
|
||||
|
||||
vertex_tb
|
||||
|
||||
} else if(return == "network"){
|
||||
|
||||
g
|
||||
|
||||
|
||||
} else if(return == "plot-sankey"){
|
||||
|
||||
create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
|
||||
var1 = hrvar,
|
||||
var2 = "cluster",
|
||||
count = "n")
|
||||
|
||||
} else if(return == "describe"){
|
||||
|
||||
describe_tb <-
|
||||
vertex_tb %>%
|
||||
left_join(select(data, starts_with("TieOrigin_")),
|
||||
by = c("name" = "TieOrigin_PersonId"))
|
||||
|
||||
desc_str <-
|
||||
describe_tb %>%
|
||||
pull(cluster) %>%
|
||||
unique()
|
||||
|
||||
desc_str %>%
|
||||
purrr::map(function(x){
|
||||
describe_tb %>%
|
||||
filter(cluster == x) %>%
|
||||
network_describe(hrvar = desc_hrvar)
|
||||
}) %>%
|
||||
setNames(nm = desc_str)
|
||||
|
||||
} else {
|
||||
|
||||
stop("Please enter a valid input for `return`.")
|
||||
|
||||
}
|
||||
}
|
|
@ -5,16 +5,25 @@
|
|||
#' within `tm_cooc()`.
|
||||
#'
|
||||
#' @param data Data frame output from `tm_clean()`.
|
||||
#' @param id String to represent the id variable. Defaults to "word".
|
||||
#' @param id String to represent the id variable. Defaults to "line".
|
||||
#' @param word String to represent the word variable. Defaults to "word".
|
||||
#'
|
||||
#' @import data.table
|
||||
#' @importFrom data.table ":=" "%like%" "%between%" rbindlist as.data.table
|
||||
#'
|
||||
#' @examples
|
||||
#' td <- data.frame(line = c(1, 1, 2, 2),
|
||||
#' word = c("work", "meeting", "catch", "up"))
|
||||
#'
|
||||
#' pairwise_count(td, id = "line", word = "word")
|
||||
#'
|
||||
#' @export
|
||||
pairwise_count <- function(data,
|
||||
id = "line",
|
||||
word = "word"){
|
||||
|
||||
# Make sure data.table knows we know we're using it
|
||||
.datatable.aware = TRUE
|
||||
|
||||
data <-
|
||||
data %>%
|
||||
dplyr::rename(word := !!sym(word),
|
||||
|
|
|
@ -0,0 +1,35 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/create_sankey.R
|
||||
\name{create_sankey}
|
||||
\alias{create_sankey}
|
||||
\title{Create a sankey chart from a two-column count table}
|
||||
\usage{
|
||||
create_sankey(data, var1, var2, count = "n")
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{Data frame of the long count table.}
|
||||
|
||||
\item{var1}{String containing the name of the variable to be shown on the left.}
|
||||
|
||||
\item{var2}{String containing the name of the variable to be shown on the right.}
|
||||
|
||||
\item{count}{String containing the name of the count variable.}
|
||||
}
|
||||
\description{
|
||||
Create a networkD3 style sankey chart based on a long count table
|
||||
with two variables. The input data should have three columns, where
|
||||
each row is a unique group:
|
||||
\enumerate{
|
||||
\item Variable 1
|
||||
\item Variable 2
|
||||
\item Count
|
||||
}
|
||||
}
|
||||
\examples{
|
||||
\donttest{
|
||||
sq_data \%>\%
|
||||
dplyr::count(Organization, FunctionType) \%>\%
|
||||
create_sankey(var1 = "Organization", var2 = "FunctionType")
|
||||
}
|
||||
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/network_describe.R
|
||||
\name{network_describe}
|
||||
\alias{network_describe}
|
||||
\title{Uncover HR attributes which best represent a population for a Person to Person query}
|
||||
\usage{
|
||||
network_describe(
|
||||
data,
|
||||
hrvar = c("Organization", "LevelDesignation", "FunctionType")
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{Data frame for a person to person query.}
|
||||
|
||||
\item{hrvar}{Character vector of length 3 containing the HR attributes to be used.}
|
||||
}
|
||||
\description{
|
||||
Returns a data frame that gives a percentage of the group combinations that best represent
|
||||
the population provided. Uses a person to person query.
|
||||
}
|
||||
\author{
|
||||
Tannaz Sattari Tabrizi \href{mailto:Tannaz.Sattari@microsoft.com}{Tannaz.Sattari@microsoft.com}
|
||||
}
|
|
@ -0,0 +1,61 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/network_leiden.R
|
||||
\name{network_leiden}
|
||||
\alias{network_leiden}
|
||||
\title{Implement the Leiden community detection on a Person to Person network query}
|
||||
\usage{
|
||||
network_leiden(
|
||||
data,
|
||||
hrvar,
|
||||
bg_fill = "#000000",
|
||||
font_col = "#FFFFFF",
|
||||
algorithm = "mds",
|
||||
path = "network_p2p_leiden",
|
||||
node_alpha = 0.8,
|
||||
res = 0.5,
|
||||
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||
return
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{Data frame containing a Person to Person query.}
|
||||
|
||||
\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
|
||||
|
||||
\item{bg_fill}{String to specify background fill colour.}
|
||||
|
||||
\item{font_col}{String to specify font and link colour.}
|
||||
|
||||
\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||
a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||
See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
|
||||
|
||||
\item{path}{File path for saving the PDF output. Defaults to "network_p2p_leiden".
|
||||
Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||
saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||
a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
|
||||
if returning anything other than "plot-leiden" or "plot-hrvar".}
|
||||
|
||||
\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
|
||||
|
||||
\item{res}{Resolution parameter to be passed to \code{leiden::leiden()}. Defaults to 0.5.}
|
||||
|
||||
\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
|
||||
"describe" output. See \code{network_describe()}.}
|
||||
|
||||
\item{return}{String specifying what output to return. Valid return options include:
|
||||
\itemize{
|
||||
\item 'plot-leiden': return a network plot coloured by leiden communities.
|
||||
\item 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||
\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||
\item 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||
\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||
\item 'describe': return a list of data frames which describe each of the identified communities.
|
||||
\item 'network': return igraph object.
|
||||
}}
|
||||
}
|
||||
\description{
|
||||
Take a P2P network query and implement the Leiden community detection method. To run
|
||||
this function, you will require all the pre-requisites of the \strong{leiden} package installed,
|
||||
which includes Python and \strong{reticulate}.
|
||||
}
|
|
@ -0,0 +1,57 @@
|
|||
% Generated by roxygen2: do not edit by hand
|
||||
% Please edit documentation in R/network_louvain.R
|
||||
\name{network_louvain}
|
||||
\alias{network_louvain}
|
||||
\title{Implement the Louvain community detection on a Person to Person network query}
|
||||
\usage{
|
||||
network_louvain(
|
||||
data,
|
||||
hrvar,
|
||||
bg_fill = "#000000",
|
||||
font_col = "#FFFFFF",
|
||||
node_alpha = 0.8,
|
||||
algorithm = "mds",
|
||||
path = "network_p2p_louvain",
|
||||
desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
|
||||
return
|
||||
)
|
||||
}
|
||||
\arguments{
|
||||
\item{data}{Data frame containing a Person to Person query.}
|
||||
|
||||
\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
|
||||
|
||||
\item{bg_fill}{String to specify background fill colour.}
|
||||
|
||||
\item{font_col}{String to specify font and link colour.}
|
||||
|
||||
\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
|
||||
|
||||
\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
|
||||
a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
|
||||
See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
|
||||
|
||||
\item{path}{File path for saving the PDF output. Defaults to "network_p2p_louvain".
|
||||
Since the network outputs are computationally intensive, the default behaviour is to save time by
|
||||
saving the plot output directly as a PDF in the specified path. To override this behaviour and return
|
||||
a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
|
||||
if returning anything other than "plot-louvain" or "plot-hrvar".}
|
||||
|
||||
\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
|
||||
"describe" output. See \code{network_describe()}.}
|
||||
|
||||
\item{return}{String specifying what output to return.Valid return options include:
|
||||
\itemize{
|
||||
\item 'plot-louvain': return a network plot coloured by louvain communities.
|
||||
\item 'plot-hrvar': return a network plot coloured by HR attribute.
|
||||
\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
|
||||
\item 'table': return a vertex summary table with counts in communities and HR attribute.
|
||||
\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
|
||||
\item 'describe': returns a list of data frames which describe each of the identified communities.
|
||||
\item 'network': return igraph object.
|
||||
}}
|
||||
}
|
||||
\description{
|
||||
Take a P2P network query and implement the Louvain community detection method. The
|
||||
\strong{igraph} implementation of the Louvain method is used.
|
||||
}
|
|
@ -9,7 +9,7 @@ pairwise_count(data, id = "line", word = "word")
|
|||
\arguments{
|
||||
\item{data}{Data frame output from \code{tm_clean()}.}
|
||||
|
||||
\item{id}{String to represent the id variable. Defaults to "word".}
|
||||
\item{id}{String to represent the id variable. Defaults to "line".}
|
||||
|
||||
\item{word}{String to represent the word variable. Defaults to "word".}
|
||||
}
|
||||
|
@ -18,3 +18,10 @@ This is a \strong{data.table} implementation that mimics the output of
|
|||
\code{widyr::pairwise_count()} to reduce package dependency. This is used internally
|
||||
within \code{tm_cooc()}.
|
||||
}
|
||||
\examples{
|
||||
td <- data.frame(line = c(1, 1, 2, 2),
|
||||
word = c("work", "meeting", "catch", "up"))
|
||||
|
||||
pairwise_count(td, id = "line", word = "word")
|
||||
|
||||
}
|
||||
|
|
Загрузка…
Ссылка в новой задаче