Merge pull request #32 from microsoft/feature/network-community-detection

Feature: add Community Detection capability for network plots
2021-01-06 12:13:28 +00:00 · 2021-01-06 12:13:28 +00:00 · 2118a61a1a
--- a/3
+++ b/3
@ -63,5 +63,6 @@ RoxygenNote: 7.1.1
 Roxygen: list(markdown = TRUE)
 VignetteBuilder: knitr
 Suggests: 
-    extrafont
+    extrafont,
+    leiden
 Language: en-US
--- a/8
+++ b/8
@ -49,6 +49,7 @@ export(create_line)
 export(create_line_asis)
 export(create_period_scatter)
 export(create_rank)
+export(create_sankey)
 export(create_scatter)
 export(create_stacked)
 export(create_trend)
@ -105,7 +106,10 @@ export(meetingtype_sum)
 export(meetingtype_summary)
 export(mgrcoatt_dist)
 export(mgrrel_matrix)
+export(network_describe)
 export(network_g2g)
+export(network_leiden)
+export(network_louvain)
 export(network_p2p)
 export(one2one_dist)
 export(one2one_fizz)
@ -150,16 +154,18 @@ export(workpatterns_rank)
 export(wrap)
 import(DT)
 import(Information)
-import(data.table)
 import(dplyr)
 import(ggplot2)
 import(ggraph)
 import(reshape2)
 import(scales)
+import(tidyr)
 import(tidyselect)
 importFrom(data.table,"%between%")
 importFrom(data.table,"%like%")
 importFrom(data.table,":=")
+importFrom(data.table,as.data.table)
+importFrom(data.table,rbindlist)
 importFrom(dplyr,`%>%`)
 importFrom(dplyr,mutate_if)
 importFrom(grDevices,rainbow)
--- a/R/create_sankey.R
+++ b/R/create_sankey.R
@ -0,0 +1,72 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+#' @title Create a sankey chart from a two-column count table
+#'
+#' @description
+#' Create a networkD3 style sankey chart based on a long count table
+#' with two variables. The input data should have three columns, where
+#' each row is a unique group:
+#'   1. Variable 1
+#'   2. Variable 2
+#'   3. Count
+#'
+#' @param data Data frame of the long count table.
+#' @param var1 String containing the name of the variable to be shown on the left.
+#' @param var2 String containing the name of the variable to be shown on the right.
+#' @param count String containing the name of the count variable.
+#'
+#' @import dplyr
+#'
+#' @examples
+#' \donttest{
+#' sq_data %>%
+#'   dplyr::count(Organization, FunctionType) %>%
+#'   create_sankey(var1 = "Organization", var2 = "FunctionType")
+#' }
+#'
+#' @export
+create_sankey <- function(data, var1, var2, count = "n"){
+
+  ## Rename
+  data$pre_group <- data[[var1]]
+  data$group <- data[[var2]]
+
+  ## Set up `nodes`
+  group_source <- unique(data$pre_group)
+  group_target <- paste0(unique(data$group), " ")
+
+  groups <- c(group_source, group_target)
+
+  nodes_source <- tibble(name = group_source)
+  nodes_target <- tibble(name = group_target)
+  nodes <- rbind(nodes_source, nodes_target) %>% mutate(node = 0:(nrow(.) - 1))
+
+  ## Set up `links`
+  links <-
+    data %>%
+    mutate(group = paste0(group, " ")) %>%
+    select(source = "pre_group",
+           target = "group",
+           value = count)
+
+  nodes_source <- nodes_source %>% select(name) # Make `nodes` a single column data frame
+  nodes_target <- nodes_target %>% select(name) # Make `nodes` a single column data frame
+
+  links <-
+    links %>%
+    left_join(nodes %>% rename(IDsource = "node"), by = c("source" = "name")) %>%
+    left_join(nodes %>% rename(IDtarget = "node"), by = c("target" = "name"))
+
+
+  networkD3::sankeyNetwork(Links = as.data.frame(links),
+                           Nodes = as.data.frame(nodes),
+                           Source = 'IDsource', # Change reference to IDsource
+                           Target = 'IDtarget', # Change reference to IDtarget
+                           Value = 'value',
+                           NodeID = 'name',
+                           units="count",
+                           sinksRight = FALSE)
+}
--- a/R/network_describe.R
+++ b/R/network_describe.R
@ -0,0 +1,125 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+#' @title Uncover HR attributes which best represent a population for a Person to Person query
+#'
+#' @author Tannaz Sattari Tabrizi <Tannaz.Sattari@microsoft.com>
+#'
+#' @description
+#' Returns a data frame that gives a percentage of the group combinations that best represent
+#' the population provided. Uses a person to person query.
+#'
+#' @param data Data frame for a person to person query.
+#' @param hrvar Character vector of length 3 containing the HR attributes to be used.
+#'
+#' @import dplyr
+#' @import tidyr
+#'
+#' @export
+network_describe <- function(data, hrvar = c("Organization", "LevelDesignation", "FunctionType")){
+
+  if(length(hrvar) != 3){
+
+    stop("Please provide a character vector of length 3 for `hrvar`")
+
+  }
+
+  ## De-duplicated data containing only TieOrigins
+  filtered_Data <- unique(select(data, starts_with("TieOrigin_")))
+
+  ## Select features
+  features <- select(filtered_Data, paste0("TieOrigin_", hrvar))
+
+  ## Feature set: 1
+  max_percentages_1f <-
+    features %>%
+    colnames() %>%
+    purrr::map(function(c){
+
+      agg <-
+        features %>%
+        group_by_at(.vars = vars(c)) %>%
+        summarise(count = n(), .groups = "drop") %>%
+        mutate(percentage = count / sum(count, na.rm = TRUE))
+
+      agg %>%
+        arrange(desc(percentage)) %>%
+        slice(1) %>% # Extract first row
+        mutate(feature_1 = c,
+               feature_1_value = !!sym(c)) %>%
+        select(feature_1, feature_1_value, Percentage = "percentage")
+    }) %>%
+    bind_rows()
+
+  ## Feature set: 2
+  max_percentages_2f <-
+    list(c1 = colnames(features),
+         c2 = colnames(features)) %>%
+    expand.grid(stringsAsFactors = FALSE) %>%
+    filter(c1 != c2) %>%
+    purrr::pmap(function(c1, c2){
+      agg <-
+        features %>%
+        group_by_at(.vars=vars(c1, c2)) %>%
+        summarise(count = n(), .groups = "drop") %>%
+        mutate(percentage = count / sum(count, na.rm = TRUE))
+
+      agg %>%
+        arrange(desc(percentage)) %>%
+        slice(1) %>% # Extract first row
+        mutate(feature_1 = c1,
+               feature_1_value = !!sym(as.character(c1)),
+               feature_2 = c2,
+               feature_2_value = !!sym(as.character(c2))) %>%
+      select(feature_1,
+             feature_1_value,
+             feature_2,
+             feature_2_value,
+             Percentage = "percentage")
+    }) %>%
+    bind_rows()
+
+
+  ## Feature set: 3
+  max_percentages_3f <-
+    list(c1 = colnames(features),
+         c2 = colnames(features),
+         c3 = colnames(features)) %>%
+    expand.grid(stringsAsFactors = FALSE) %>%
+    filter(c1 != c2,
+           c2 != c3,
+           c3 != c1) %>%
+    purrr::pmap(function(c1, c2, c3){
+      agg <-
+        features %>%
+        group_by_at(.vars=vars(c1, c2, c3)) %>%
+        summarise(count = n(), .groups = "drop") %>%
+        mutate(percentage = count / sum(count, na.rm = TRUE))
+
+      agg %>%
+        arrange(desc(percentage)) %>%
+        slice(1) %>% # Extract first row
+        mutate(feature_1 = c1,
+               feature_1_value = !!sym(c1),
+               feature_2 = c2,
+               feature_2_value = !!sym(c2),
+               feature_3 = c3,
+               feature_3_value = !!sym(c3)) %>%
+        select(feature_1,
+               feature_1_value,
+               feature_2,
+               feature_2_value,
+               feature_3,
+               feature_3_value,
+               Percentage = "percentage")
+    }) %>%
+    bind_rows()
+
+  list(max_percentages_1f,
+       max_percentages_2f,
+       max_percentages_3f) %>%
+    bind_rows() %>%
+    select(starts_with("feature"), Percentage)
+}
--- a/R/network_leiden.R
+++ b/R/network_leiden.R
@ -0,0 +1,225 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+#' @title Implement the Leiden community detection on a Person to Person network query
+#'
+#' @description
+#' Take a P2P network query and implement the Leiden community detection method. To run
+#' this function, you will require all the pre-requisites of the **leiden** package installed,
+#' which includes Python and **reticulate**.
+#'
+#' @param data Data frame containing a Person to Person query.
+#' @param hrvar String containing the HR attribute to be matched in the dataset.
+#' @param bg_fill String to specify background fill colour.
+#' @param font_col String to specify font and link colour.
+#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
+#' @param path File path for saving the PDF output. Defaults to "network_p2p_leiden".
+#' Since the network outputs are computationally intensive, the default behaviour is to save time by
+#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
+#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
+#' if returning anything other than "plot-leiden" or "plot-hrvar".
+#'
+#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
+#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
+#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
+#'
+#' @param res Resolution parameter to be passed to `leiden::leiden()`. Defaults to 0.5.
+#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
+#' "describe" output. See `network_describe()`.
+#' @param return String specifying what output to return. Valid return options include:
+#'   - 'plot-leiden': return a network plot coloured by leiden communities.
+#'   - 'plot-hrvar': return a network plot coloured by HR attribute.
+#'   - 'plot-sankey': return a sankey plot combining communities and HR attribute.
+#'   - 'table': return a vertex summary table with counts in communities and HR attribute.
+#'   - 'data': return a vertex data file that matches vertices with communities and HR attributes.
+#'   - 'describe': return a list of data frames which describe each of the identified communities.
+#'   - 'network': return igraph object.
+#'
+#' @import dplyr
+#'
+#' @export
+network_leiden <- function(data,
+                           hrvar,
+                           bg_fill = "#000000",
+                           font_col = "#FFFFFF",
+                           algorithm = "mds",
+                           path = "network_p2p_leiden",
+                           node_alpha = 0.8,
+                           res = 0.5,
+                           desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
+                           return){
+
+  ## Set variables
+  TO_hrvar <- paste0("TieOrigin_", hrvar)
+  TD_hrvar <- paste0("TieDestination_", hrvar)
+
+  ## Set edges df
+  edges <-
+    data %>%
+    select(from = "TieOrigin_PersonId",
+           to = "TieDestination_PersonId",
+           weight = "StrongTieScore")
+
+  ## Vertices data frame to provide meta-data
+  vert_ft <-
+    rbind(
+      # TieOrigin
+      edges %>%
+        select(from) %>% # Single column
+        unique() %>% # Remove duplications
+        left_join(select(data, TieOrigin_PersonId, TO_hrvar),
+                  by = c("from"  = "TieOrigin_PersonId")) %>%
+        select(node = "from", !!sym(hrvar) := TO_hrvar),
+
+      # TieDestination
+      edges %>%
+        select(to) %>% # Single column
+        unique() %>% # Remove duplications
+        left_join(select(data, TieDestination_PersonId, TD_hrvar),
+                  by = c("to"  = "TieDestination_PersonId")) %>%
+        select(node = "to", !!sym(hrvar) := TD_hrvar)
+    )
+
+  ## Create igraph object
+  g_raw <-
+    igraph::graph_from_data_frame(edges,
+                                  directed = TRUE, # Directed, but FALSE for visualization
+                                  vertices = unique(vert_ft)) # remove duplicates
+
+  ## Return a numeric vector of partitions / clusters / modules
+  ## Set a low resolution parameter to have fewer groups
+  ld <- leiden::leiden(g_raw, resolution_parameter = res) # create partitions
+
+  ## Add cluster
+  g <-
+    g_raw %>%
+    # Add leiden partitions to graph object
+    igraph::set_vertex_attr("cluster", value = as.character(ld)) %>%
+    igraph::simplify()
+
+  ## Create vertex table
+  vertex_tb <-
+    g %>%
+    igraph::get.vertex.attribute() %>%
+    as_tibble()
+
+  g_layout <-
+    g %>%
+    ggraph::ggraph(layout = "igraph", algorithm = algorithm)
+
+  ## Return
+  if(return == "plot-leiden"){
+
+    plot_output <-
+      g_layout +
+      ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
+      ggraph::geom_node_point(aes(colour = cluster),
+                              alpha = node_alpha,
+                              pch = 16) +
+      theme_void() +
+      theme(legend.position = "bottom",
+            legend.background = element_rect(fill = bg_fill),
+            plot.background = element_rect(fill = bg_fill),
+            text = element_text(colour = font_col),
+            axis.line = element_blank()) +
+      labs(title = "Person to person collaboration with Community Detection",
+           subtitle = "Based on Leiden algorithm and Strong Tie Score",
+           y = "",
+           x = "")
+
+    # Default PDF output unless NULL supplied to path
+    if(is.null(path)){
+
+      plot_output
+
+    } else {
+
+     ggsave(paste0(path, tstamp(), ".pdf"),
+            plot = plot_output,
+            width = 16,
+            height = 9)
+
+    }
+
+  } else if(return == "plot-hrvar"){
+
+    plot_output <-
+      g_layout +
+      ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
+      ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
+                              alpha = node_alpha,
+                              pch = 16) +
+      theme_void() +
+      theme(legend.position = "bottom",
+            legend.background = element_rect(fill = bg_fill),
+            plot.background = element_rect(fill = bg_fill),
+            text = element_text(colour = font_col),
+            axis.line = element_blank()) +
+      labs(title = "Person to person collaboration",
+           subtitle = paste0("Showing ", hrvar),
+           y = "",
+           x = "")
+
+    # Default PDF output unless NULL supplied to path
+    if(is.null(path)){
+
+      plot_output
+
+    } else {
+
+      ggsave(paste0(path, tstamp(), ".pdf"),
+             plot = plot_output,
+             width = 16,
+             height = 9)
+
+    }
+
+  } else if(return == "table"){
+
+    vertex_tb %>%
+      count(!!sym(hrvar), cluster)
+
+  } else if(return == "data"){
+
+    vertex_tb
+
+  } else if(return == "network"){
+
+    g
+
+
+  } else if(return == "plot-sankey"){
+
+    create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
+                  var1 = hrvar,
+                  var2 = "cluster",
+                  count = "n")
+
+  } else if(return == "describe"){
+
+    describe_tb <-
+      vertex_tb %>%
+      left_join(select(data, starts_with("TieOrigin_")),
+                by = c("name" = "TieOrigin_PersonId"))
+
+    desc_str <-
+      describe_tb %>%
+      pull(cluster) %>%
+      unique()
+
+    desc_str %>%
+      purrr::map(function(x){
+        describe_tb %>%
+          filter(cluster == x) %>%
+          network_describe(hrvar = desc_hrvar)
+      }) %>%
+      setNames(nm = desc_str)
+
+  } else {
+
+    stop("Please enter a valid input for `return`.")
+
+  }
+}
--- a/R/network_louvain.R
+++ b/R/network_louvain.R
@ -0,0 +1,222 @@
+# --------------------------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License. See LICENSE.txt in the project root for license information.
+# --------------------------------------------------------------------------------------------
+
+#' @title Implement the Louvain community detection on a Person to Person network query
+#'
+#' @description
+#' Take a P2P network query and implement the Louvain community detection method. The
+#' **igraph** implementation of the Louvain method is used.
+#'
+#' @param data Data frame containing a Person to Person query.
+#' @param hrvar String containing the HR attribute to be matched in the dataset.
+#' @param bg_fill String to specify background fill colour.
+#' @param font_col String to specify font and link colour.
+#' @param node_alpha A numeric value between 0 and 1 to specify the transparency of the nodes.
+#' @param algorithm String to specify the node placement algorithm to be used. Defaults to "mds" to perform
+#' a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
+#' See <https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html> for a full list of options.
+#' @param path File path for saving the PDF output. Defaults to "network_p2p_louvain".
+#' Since the network outputs are computationally intensive, the default behaviour is to save time by
+#' saving the plot output directly as a PDF in the specified path. To override this behaviour and return
+#' a plot object instead, you can pass `NULL` to `path`. What is passed to `path` makes no difference
+#' if returning anything other than "plot-louvain" or "plot-hrvar".
+#' @param desc_hrvar Character vector of length 3 containing the HR attributes to use when returning the
+#' "describe" output. See `network_describe()`.
+#'
+#' @param return String specifying what output to return.Valid return options include:
+#'   - 'plot-louvain': return a network plot coloured by louvain communities.
+#'   - 'plot-hrvar': return a network plot coloured by HR attribute.
+#'   - 'plot-sankey': return a sankey plot combining communities and HR attribute.
+#'   - 'table': return a vertex summary table with counts in communities and HR attribute.
+#'   - 'data': return a vertex data file that matches vertices with communities and HR attributes.
+#'   - 'describe': returns a list of data frames which describe each of the identified communities.
+#'   - 'network': return igraph object.
+#'
+#' @import ggraph
+#' @import dplyr
+#'
+#' @export
+network_louvain <- function(data,
+                            hrvar,
+                            bg_fill = "#000000",
+                            font_col = "#FFFFFF",
+                            node_alpha = 0.8,
+                            algorithm = "mds",
+                            path = "network_p2p_louvain",
+                            desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
+                            return){
+
+  ## Set variables
+  TO_hrvar <- paste0("TieOrigin_", hrvar)
+  TD_hrvar <- paste0("TieDestination_", hrvar)
+
+  ## Set edges df
+  edges <-
+    data %>%
+    select(from = "TieOrigin_PersonId",
+           to = "TieDestination_PersonId",
+           weight = "StrongTieScore")
+
+  ## Vertices data frame to provide meta-data
+  vert_ft <-
+    rbind(
+      # TieOrigin
+      edges %>%
+        select(from) %>% # Single column
+        unique() %>% # Remove duplications
+        left_join(select(data, TieOrigin_PersonId, TO_hrvar),
+                  by = c("from"  = "TieOrigin_PersonId")) %>%
+        select(node = "from", !!sym(hrvar) := TO_hrvar),
+
+      # TieDestination
+      edges %>%
+        select(to) %>% # Single column
+        unique() %>% # Remove duplications
+        left_join(select(data, TieDestination_PersonId, TD_hrvar),
+                  by = c("to"  = "TieDestination_PersonId")) %>%
+        select(node = "to", !!sym(hrvar) := TD_hrvar)
+    )
+
+  ## Create igraph object
+  g_raw <-
+    igraph::graph_from_data_frame(edges,
+                                  directed = FALSE, # Set to undirected for clustering
+                                  vertices = unique(vert_ft)) # remove duplicates
+
+  ## Return a numeric vector of partitions / clusters / modules
+  ## Set a low resolution parameter to have fewer groups
+  lc <- igraph::cluster_louvain(g_raw)
+
+  ## Add cluster
+  g <-
+    g_raw %>%
+    # Add louvain partitions to graph object
+    igraph::set_vertex_attr("cluster", value = as.character(igraph::membership(lc))) %>% # Return membership - diff from Leiden
+    igraph::simplify()
+
+  ## Create vertex table
+  vertex_tb <-
+    g %>%
+    igraph::get.vertex.attribute() %>%
+    as_tibble()
+
+  g_layout <-
+    g %>%
+    ggraph::ggraph(layout = "igraph", algorithm = algorithm)
+
+  ## Return
+  if(return == "plot-louvain"){
+
+    plot_output <-
+      g_layout +
+      ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
+      ggraph::geom_node_point(aes(colour = cluster),
+                              alpha = node_alpha,
+                              pch = 16) +
+      theme_void() +
+      theme(legend.position = "bottom",
+            legend.background = element_rect(fill = bg_fill),
+            plot.background = element_rect(fill = bg_fill),
+            text = element_text(colour = font_col),
+            axis.line = element_blank()) +
+      labs(title = "Person to person collaboration with Community Detection",
+           subtitle = "Based on Louvain algorithm and Strong Tie Score",
+           y = "",
+           x = "")
+
+    # Default PDF output unless NULL supplied to path
+    if(is.null(path)){
+
+      plot_output
+
+    } else {
+
+      ggsave(paste0(path, tstamp(), ".pdf"),
+             plot = plot_output,
+             width = 16,
+             height = 9)
+
+    }
+
+  } else if(return == "plot-hrvar"){
+
+    plot_output <-
+      g_layout +
+      ggraph::geom_edge_link(colour = "lightgrey", edge_width = 0.01, alpha = 0.15) +
+      ggraph::geom_node_point(aes(colour = !!sym(hrvar)),
+                              alpha = node_alpha,
+                              pch = 16) +
+      theme_void() +
+      theme(legend.position = "bottom",
+            legend.background = element_rect(fill = bg_fill),
+            plot.background = element_rect(fill = bg_fill),
+            text = element_text(colour = font_col),
+            axis.line = element_blank()) +
+      labs(title = "Person to person collaboration",
+           subtitle = paste0("Showing ", hrvar),
+           y = "",
+           x = "")
+
+    # Default PDF output unless NULL supplied to path
+    if(is.null(path)){
+
+      plot_output
+
+    } else {
+
+      ggsave(paste0(path, tstamp(), ".pdf"),
+             plot = plot_output,
+             width = 16,
+             height = 9)
+
+    }
+
+  } else if(return == "table"){
+
+    vertex_tb %>%
+      count(!!sym(hrvar), cluster)
+
+  } else if(return == "data"){
+
+    vertex_tb
+
+  } else if(return == "network"){
+
+    g
+
+
+  } else if(return == "plot-sankey"){
+
+    create_sankey(data = vertex_tb %>% count(!!sym(hrvar), cluster),
+                  var1 = hrvar,
+                  var2 = "cluster",
+                  count = "n")
+
+  } else if(return == "describe"){
+
+    describe_tb <-
+      vertex_tb %>%
+      left_join(select(data, starts_with("TieOrigin_")),
+                by = c("name" = "TieOrigin_PersonId"))
+
+    desc_str <-
+      describe_tb %>%
+      pull(cluster) %>%
+      unique()
+
+    desc_str %>%
+      purrr::map(function(x){
+        describe_tb %>%
+          filter(cluster == x) %>%
+          network_describe(hrvar = desc_hrvar)
+      }) %>%
+      setNames(nm = desc_str)
+
+  } else {
+
+    stop("Please enter a valid input for `return`.")
+
+  }
+}
--- a/R/pairwise_count.R
+++ b/R/pairwise_count.R
@ -5,16 +5,25 @@
 #' within `tm_cooc()`.
 #'
 #' @param data Data frame output from `tm_clean()`.
-#' @param id String to represent the id variable. Defaults to "word".
+#' @param id String to represent the id variable. Defaults to "line".
 #' @param word String to represent the word variable. Defaults to "word".
 #'
-#' @import data.table
+#' @importFrom data.table ":=" "%like%" "%between%" rbindlist as.data.table
+#'
+#' @examples
+#' td <- data.frame(line = c(1, 1, 2, 2),
+#'                  word = c("work", "meeting", "catch", "up"))
+#'
+#' pairwise_count(td, id = "line", word = "word")
 #'
 #' @export
 pairwise_count <- function(data,
                           id = "line",
                           word = "word"){

+  # Make sure data.table knows we know we're using it
+  .datatable.aware = TRUE
+
  data <-
    data %>%
    dplyr::rename(word := !!sym(word),
--- a/man/create_sankey.Rd
+++ b/man/create_sankey.Rd
@ -0,0 +1,35 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/create_sankey.R
+\name{create_sankey}
+\alias{create_sankey}
+\title{Create a sankey chart from a two-column count table}
+\usage{
+create_sankey(data, var1, var2, count = "n")
+}
+\arguments{
+\item{data}{Data frame of the long count table.}
+
+\item{var1}{String containing the name of the variable to be shown on the left.}
+
+\item{var2}{String containing the name of the variable to be shown on the right.}
+
+\item{count}{String containing the name of the count variable.}
+}
+\description{
+Create a networkD3 style sankey chart based on a long count table
+with two variables. The input data should have three columns, where
+each row is a unique group:
+\enumerate{
+\item Variable 1
+\item Variable 2
+\item Count
+}
+}
+\examples{
+\donttest{
+sq_data \%>\%
+  dplyr::count(Organization, FunctionType) \%>\%
+  create_sankey(var1 = "Organization", var2 = "FunctionType")
+}
+
+}
--- a/man/network_describe.Rd
+++ b/man/network_describe.Rd
@ -0,0 +1,23 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/network_describe.R
+\name{network_describe}
+\alias{network_describe}
+\title{Uncover HR attributes which best represent a population for a Person to Person query}
+\usage{
+network_describe(
+  data,
+  hrvar = c("Organization", "LevelDesignation", "FunctionType")
+)
+}
+\arguments{
+\item{data}{Data frame for a person to person query.}
+
+\item{hrvar}{Character vector of length 3 containing the HR attributes to be used.}
+}
+\description{
+Returns a data frame that gives a percentage of the group combinations that best represent
+the population provided. Uses a person to person query.
+}
+\author{
+Tannaz Sattari Tabrizi \href{mailto:Tannaz.Sattari@microsoft.com}{Tannaz.Sattari@microsoft.com}
+}
--- a/man/network_leiden.Rd
+++ b/man/network_leiden.Rd
@ -0,0 +1,61 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/network_leiden.R
+\name{network_leiden}
+\alias{network_leiden}
+\title{Implement the Leiden community detection on a Person to Person network query}
+\usage{
+network_leiden(
+  data,
+  hrvar,
+  bg_fill = "#000000",
+  font_col = "#FFFFFF",
+  algorithm = "mds",
+  path = "network_p2p_leiden",
+  node_alpha = 0.8,
+  res = 0.5,
+  desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
+  return
+)
+}
+\arguments{
+\item{data}{Data frame containing a Person to Person query.}
+
+\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
+
+\item{bg_fill}{String to specify background fill colour.}
+
+\item{font_col}{String to specify font and link colour.}
+
+\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
+a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
+See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
+
+\item{path}{File path for saving the PDF output. Defaults to "network_p2p_leiden".
+Since the network outputs are computationally intensive, the default behaviour is to save time by
+saving the plot output directly as a PDF in the specified path. To override this behaviour and return
+a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
+if returning anything other than "plot-leiden" or "plot-hrvar".}
+
+\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
+
+\item{res}{Resolution parameter to be passed to \code{leiden::leiden()}. Defaults to 0.5.}
+
+\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
+"describe" output. See \code{network_describe()}.}
+
+\item{return}{String specifying what output to return. Valid return options include:
+\itemize{
+\item 'plot-leiden': return a network plot coloured by leiden communities.
+\item 'plot-hrvar': return a network plot coloured by HR attribute.
+\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
+\item 'table': return a vertex summary table with counts in communities and HR attribute.
+\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
+\item 'describe': return a list of data frames which describe each of the identified communities.
+\item 'network': return igraph object.
+}}
+}
+\description{
+Take a P2P network query and implement the Leiden community detection method. To run
+this function, you will require all the pre-requisites of the \strong{leiden} package installed,
+which includes Python and \strong{reticulate}.
+}
--- a/man/network_louvain.Rd
+++ b/man/network_louvain.Rd
@ -0,0 +1,57 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/network_louvain.R
+\name{network_louvain}
+\alias{network_louvain}
+\title{Implement the Louvain community detection on a Person to Person network query}
+\usage{
+network_louvain(
+  data,
+  hrvar,
+  bg_fill = "#000000",
+  font_col = "#FFFFFF",
+  node_alpha = 0.8,
+  algorithm = "mds",
+  path = "network_p2p_louvain",
+  desc_hrvar = c("Organization", "LevelDesignation", "FunctionType"),
+  return
+)
+}
+\arguments{
+\item{data}{Data frame containing a Person to Person query.}
+
+\item{hrvar}{String containing the HR attribute to be matched in the dataset.}
+
+\item{bg_fill}{String to specify background fill colour.}
+
+\item{font_col}{String to specify font and link colour.}
+
+\item{node_alpha}{A numeric value between 0 and 1 to specify the transparency of the nodes.}
+
+\item{algorithm}{String to specify the node placement algorithm to be used. Defaults to "mds" to perform
+a multidimensional scaling of nodes using a shortest path, which is also a deterministic method.
+See \url{https://rdrr.io/cran/ggraph/man/layout_tbl_graph_igraph.html} for a full list of options.}
+
+\item{path}{File path for saving the PDF output. Defaults to "network_p2p_louvain".
+Since the network outputs are computationally intensive, the default behaviour is to save time by
+saving the plot output directly as a PDF in the specified path. To override this behaviour and return
+a plot object instead, you can pass \code{NULL} to \code{path}. What is passed to \code{path} makes no difference
+if returning anything other than "plot-louvain" or "plot-hrvar".}
+
+\item{desc_hrvar}{Character vector of length 3 containing the HR attributes to use when returning the
+"describe" output. See \code{network_describe()}.}
+
+\item{return}{String specifying what output to return.Valid return options include:
+\itemize{
+\item 'plot-louvain': return a network plot coloured by louvain communities.
+\item 'plot-hrvar': return a network plot coloured by HR attribute.
+\item 'plot-sankey': return a sankey plot combining communities and HR attribute.
+\item 'table': return a vertex summary table with counts in communities and HR attribute.
+\item 'data': return a vertex data file that matches vertices with communities and HR attributes.
+\item 'describe': returns a list of data frames which describe each of the identified communities.
+\item 'network': return igraph object.
+}}
+}
+\description{
+Take a P2P network query and implement the Louvain community detection method. The
+\strong{igraph} implementation of the Louvain method is used.
+}
--- a/man/pairwise_count.Rd
+++ b/man/pairwise_count.Rd
@ -9,7 +9,7 @@ pairwise_count(data, id = "line", word = "word")
 \arguments{
 \item{data}{Data frame output from \code{tm_clean()}.}

-\item{id}{String to represent the id variable. Defaults to "word".}
+\item{id}{String to represent the id variable. Defaults to "line".}

 \item{word}{String to represent the word variable. Defaults to "word".}
 }
@ -18,3 +18,10 @@ This is a \strong{data.table} implementation that mimics the output of
 \code{widyr::pairwise_count()} to reduce package dependency. This is used internally
 within \code{tm_cooc()}.
 }
+\examples{
+td <- data.frame(line = c(1, 1, 2, 2),
+                 word = c("work", "meeting", "catch", "up"))
+
+pairwise_count(td, id = "line", word = "word")
+
+}