Modularity partitions (#41)

* Commit of current status prior to actually updating the tests. The current tests are less tests and more an exploration of why our modularity calculation differed from the python-louvain modularity calculation * Updating tests and documentation * Wanted to make sure we handled disconnected nodes appropriately * Added release notes * Restricting us to the 2.x versions of Sphinx, 3.0 breaks us Co-authored-by: Dwayne Pryce <dwpryce@microsoft.com>
2020-05-06 13:51:54 -07:00 · 2020-05-06 13:51:54 -07:00 · da0440722e
--- a/docs/release_notes.md
+++ b/docs/release_notes.md
@ -1,4 +1,6 @@
 # Release Notes
+## 0.1.3
+- Added `modularity` and `modularity_components` functions, and deprecated `q_score`.
 ## 0.1.2
 - Rename `self_loop_augmentation` to `diagonal_augmentation` and use weighted degree to perform calculation instead of degree only.
 - Fix bug when getting the length of edges when performing graph augmentations.
--- a/setup.py
+++ b/setup.py
@ -52,7 +52,7 @@ setuptools.setup(
            'pytest',
            'flake8',
            'mypy',
-            'sphinx',
+            'sphinx>=2.4.4,<3.0.0',
            'sphinx-rtd-theme',
            'testfixtures',
            'recommonmark'
--- a/tests/partition/test_modularity.py
+++ b/tests/partition/test_modularity.py
@ -4,8 +4,24 @@
 import networkx as nx
 import numpy as np
 from topologic import PartitionedGraph
-from topologic.partition import q_score
+from topologic.partition import modularity, modularity_components, q_score
 import unittest
+from typing import Dict
+import community  # python-louvain module
+
+from tests.utils import data_file
+
+
+def _modularity_graph() -> nx.Graph:
+    """
+    Build the small weighted fixture graph shared by the modularity tests.
+
+    The graph has two connected components: the path a - b - c (weights 4.0 and 3.0) and the single
+    edge e - f (weight 5.0), for a total edge weight of 12.0.
+    """
+    fixture = nx.Graph()
+    fixture.add_weighted_edges_from(
+        [("a", "b", 4.0), ("b", "c", 3.0), ("e", "f", 5.0)]
+    )
+    return fixture
+
+
+# Vertex -> community id for _modularity_graph(): {a, b, c} form community 0 and {e, f} form community 1.
+_PARTITIONS: Dict[str, int] = {'a': 0, 'b': 0, 'c': 0, 'e': 1, 'f': 1}


 class TestModularity(unittest.TestCase):
@ -14,13 +30,56 @@ class TestModularity(unittest.TestCase):
            q_score("foo")

    def test_q_score(self):
+        # q_score() is deprecated in favor of modularity(); it is still exercised here against the shared fixture.
-        graph = nx.Graph()
-        graph.add_edge("a", "b", weight=4.0)
-        graph.add_edge("b", "c", weight=3.0)
-        graph.add_edge("e", "f", weight=5.0)
+        graph = _modularity_graph()

-        partition = {'a': 0, 'b': 0, 'c': 0, 'e': 1, 'f': 1}
+        partition = _PARTITIONS
        part_graph = PartitionedGraph(graph, partition)
-        modularity = q_score(part_graph)
-        self.assertIsInstance(modularity, float)
-        np.testing.assert_almost_equal(0.48611111111111105, modularity)
+        modularity_value = q_score(part_graph)
+        self.assertIsInstance(modularity_value, float)
+        np.testing.assert_almost_equal(0.48611111111111105, modularity_value)
+
+    def test_modularity(self):
+        """modularity() on the shared fixture graph matches the hand-computed value."""
+        # The fixture has a total edge weight of 12.0 and community degree sums of 0: 14.0 and 1: 10.0.
+        # Every edge is internal to its community, so the expected per-community components are:
+        #   community 0: (14.0 / 24.0) - (14.0 / 24.0) ** 2 = 0.5833333333333334 - 0.34027777777777785
+        #                                                   = 0.24305555555555552
+        #   community 1: (10.0 / 24.0) - (10.0 / 24.0) ** 2 = 0.4166666666666667 - 0.17361111111111113
+        #                                                   = 0.24305555555555555
+        expected = 0.48611111111111105
+
+        actual = modularity(_modularity_graph(), _PARTITIONS)
+
+        np.testing.assert_almost_equal(expected, actual)
+
+    def test_modularity_components(self):
+        """
+        The per-community components for the large fixture graph cover every community id, give 0 for a
+        community made of a single disconnected vertex, and sum to roughly python-louvain's modularity.
+        """
+        graph = nx.Graph()
+        with open(data_file("large-graph.csv"), "r") as edge_list_io:
+            for row in edge_list_io:
+                source, target, raw_weight = row.strip().split(",")
+                # an edge listed more than once accumulates its weights
+                already_there = graph.get_edge_data(source, target, {"weight": 0})["weight"]
+                graph.add_edge(source, target, weight=float(raw_weight) + already_there)
+
+        with open(data_file("large-graph-partitions.csv"), "r") as communities_io:
+            partitions = {
+                vertex: int(comm)
+                for vertex, comm in (row.strip().split(",") for row in communities_io)
+            }
+
+        # put a vertex with no edges into a brand new community of its own
+        isolated_community = max(partitions.values()) + 1
+        graph.add_node("disconnected_node")
+        partitions["disconnected_node"] = isolated_community
+
+        components = modularity_components(graph, partitions)
+
+        # reference value from python-louvain
+        reference_modularity = community.modularity(partitions, graph)
+        summed_components = sum(components.values())
+
+        self.assertSetEqual(set(components.keys()), set(partitions.values()))
+        self.assertEqual(0, components[isolated_community])
+
+        # The two values are only compared to 3 decimal places. The original author attributed the gap
+        # (without being certain) to floating point differences: python-louvain sums networkx degrees, which
+        # carry 2 times the edge weight and are halved before summing, and it uses `**` where topologic uses
+        # math.pow.
+        np.testing.assert_almost_equal(reference_modularity, summed_components, decimal=3)
--- a/tests/test_data/large-graph-partitions.csv
+++ b/tests/test_data/large-graph-partitions.csv
--- a/tests/test_data/large-graph.csv
+++ b/tests/test_data/large-graph.csv
--- a/topologic/assertions.py
+++ b/topologic/assertions.py
@ -88,3 +88,14 @@ def validate_minimal_graph(
        raise ValueError("The graph provided has no edges")
    if not nx.is_weighted(graph, weight=weight_attribute):
        raise ValueError("The graph provided is not fully weighted")
+
+
+def assert_is_undirected(graph: nx.Graph):
+    """
+    Verifies that the provided graph is undirected.
+
+    :param graph: Graph to check
+    :raises ValueError: If ``graph.is_directed()`` reports that the graph is directed
+    """
+    if not graph.is_directed():
+        return
+    raise ValueError("graph must be an undirected graph")
--- a/topologic/partition/__init__.py
+++ b/topologic/partition/__init__.py
@ -1,8 +1,14 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

-from .modularity import q_score
+from .modularity import modularity, modularity_components, q_score
 from .louvain_stub import louvain
 from .induce import induce_graph_by_communities

-__all__ = ['induce_graph_by_communities', 'louvain', 'q_score']
+__all__ = [
+    'induce_graph_by_communities',
+    'louvain',
+    'modularity',
+    'modularity_components',
+    'q_score'
+]
--- a/topologic/partition/modularity.py
+++ b/topologic/partition/modularity.py
@ -1,7 +1,12 @@
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT license.

+import math
+import networkx as nx
 import community
+from collections import defaultdict
+from typing import Any, Dict
+import warnings
 from .. import assertions
 from ..partitioned_graph import PartitionedGraph

@ -11,13 +16,15 @@ def q_score(
    weight_column: str = 'weight'
 ) -> float:
    """
+    Deprecated: See modularity() for replacement.
+
    Given a topologic PartitionedGraph, return the q score - or modularity of a graph.

    See also: https://en.wikipedia.org/wiki/Modularity_(networks)

-    :param graph: Partitioned graph contains a dictionary of all the communities in a graph, optimized for
+    :param partitioned_graph: Partitioned graph contains a dictionary of all the communities in a graph, optimized for
        best modularity.  This partition structure is used when computing final q_score / modularity of graph.
-    :type partitioned_graph: topologic.PartitionedGraph
+    :type partitioned_graph: Optional[topologic.PartitionedGraph]
    :param str weight_column: weight column to use in computing modularity.
    :raise UnweightedGraphError: if graph does not contain weight_column in edge attributes
    :raise KeyError: If the partition is not a partition of all graph nodes.  This should not occur if PartitionedGraph
@ -27,6 +34,10 @@ def q_score(
    :return: q_score, or modularity, of this graph using the provided partitioning scheme.
    :rtype: float
    """
+    warnings.warn(
+        "topologic.partition.q_score() has been deprecated in favor of topologic.partition.modularity()",
+        DeprecationWarning
+    )
    if isinstance(partitioned_graph, PartitionedGraph):
        partition = partitioned_graph.community_partitions
        extracted_graph = partitioned_graph.graph
@ -35,3 +46,96 @@ def q_score(
        raise TypeError('Expected type topologic.PartitionedGraph')

    return community.modularity(partition, extracted_graph, weight_column)
+
+
+def modularity(
+    graph: nx.Graph,
+    partitions: Dict[Any, int],
+    weight_attribute: str = "weight",
+    resolution: float = 1.0
+) -> float:
+    """
+    Calculate the modularity of an undirected, weighted graph for a given assignment of vertices to communities.
+
+    The result is the sum of the per-community values returned by ``modularity_components``.
+
+    See also: https://en.wikipedia.org/wiki/Modularity_(networks)
+
+    :param nx.Graph graph: An undirected graph
+    :param Dict[Any, int] partitions: A community partitioning scheme, keyed by vertex, whose values are the
+        community ids. Within topologic, these community ids are required to be ints.
+    :param str weight_attribute: The edge data attribute on the graph that contains a float weight for the edge.
+    :param float resolution: The resolution to use when calculating the modularity.
+    :return: The modularity quality score for the given network and community partition schema.
+    :raise TypeError: If the graph is not a networkx Graph
+    :raise ValueError: If the graph is unweighted
+    :raise ValueError: If the graph is directed
+    """
+    # Validate the input before any calculation is attempted.
+    assertions.assert_is_graph(graph)
+    assertions.assert_is_weighted(graph, weight_attribute)
+    assertions.assert_is_undirected(graph)
+
+    per_community = modularity_components(
+        graph,
+        partitions,
+        weight_attribute,
+        resolution
+    )
+    return sum(per_community.values())
+
+
+def _modularity_component(
+    degree_sum_within_community: float,
+    degree_sum: float,
+    total_network_edge_weight: float,
+    resolution: float
+) -> float:
+    """
+    Calculate the modularity contribution of a single community.
+
+    :param float degree_sum_within_community: Weight accumulated for edges internal to the community; divided by
+        the total network edge weight to form the first term.
+    :param float degree_sum: Degree sum of the community; divided by twice the total network edge weight and
+        squared to form the second term.
+    :param float total_network_edge_weight: Sum of all edge weights in the network.
+    :param float resolution: Multiplier applied to the second term before it is subtracted.
+    :return: The first term minus ``resolution`` times the second term.
+    """
+    internal_term = degree_sum_within_community / total_network_edge_weight
+    doubled_network_weight = 2.0 * total_network_edge_weight
+    squared_degree_share = math.pow(degree_sum / doubled_network_weight, 2.0)
+    return internal_term - resolution * squared_degree_share
+
+
+def modularity_components(
+    graph: nx.Graph,
+    partitions: Dict[Any, int],
+    weight_attribute: str = "weight",
+    resolution: float = 1.0
+) -> Dict[int, float]:
+    """
+    Given an undirected, weighted graph and a community partition dictionary, calculates a modularity quantum for each
+    community ID. The sum of these quanta is the modularity of the graph and partitions provided.
+
+    A community with no incident edges (for example one holding only disconnected vertices) has a quantum of 0.
+
+    :param nx.Graph graph: An undirected graph
+    :param Dict[Any, int] partitions: A dictionary representing a community partitioning scheme with the keys being the
+        vertex and the value being a community id. Within topologic, these community ids are required to be ints.
+    :param str weight_attribute: The edge data attribute on the graph that contains a float weight for the edge.
+    :param float resolution: The resolution to use when calculating the modularity.
+    :return: A dictionary of the community id to the modularity component of that community
+    :rtype: Dict[int, float]
+    :raise TypeError: If the graph is not a networkx Graph
+    :raise ValueError: If the graph is unweighted
+    :raise ValueError: If the graph is directed
+    :raise KeyError: If an endpoint of any edge in the graph has no entry in ``partitions``
+    """
+
+    assertions.assert_is_graph(graph)
+    assertions.assert_is_weighted(graph, weight_attribute)
+    assertions.assert_is_undirected(graph)
+
+    total_edge_weight = 0.0
+
+    communities = set(partitions.values())
+
+    # weight accumulated for edges whose two endpoints are in the same community, keyed by community id
+    degree_sums_within_community: Dict[int, float] = defaultdict(float)
+    # sum of the weighted degrees of the vertices in a community, keyed by community id
+    degree_sums_for_community: Dict[int, float] = defaultdict(float)
+    for vertex, neighbor_vertex, weight in graph.edges(data=weight_attribute):
+        vertex_community = partitions[vertex]
+        neighbor_community = partitions[neighbor_vertex]
+        if vertex_community == neighbor_community:
+            if vertex == neighbor_vertex:
+                # NOTE(review): a self-loop is counted at twice its weight here, while python-louvain's
+                # modularity() appears to count it once - confirm this convention is intended.
+                degree_sums_within_community[vertex_community] += weight * 2.0
+            else:
+                degree_sums_within_community[vertex_community] += weight
+            # both endpoints are in this community, so it receives the degree contribution of both
+            degree_sums_for_community[vertex_community] += weight * 2.0
+        else:
+            # an edge between two communities adds its weight to the degree of one vertex in each of them;
+            # crediting only the first endpoint's community would skew both communities' degree sums
+            degree_sums_for_community[vertex_community] += weight
+            degree_sums_for_community[neighbor_community] += weight
+        total_edge_weight += weight
+
+    return {comm: _modularity_component(
+        degree_sums_within_community[comm],
+        degree_sums_for_community[comm],
+        total_edge_weight,
+        resolution
+    ) for comm in communities}