* Commit of current status prior to actually updating the tests.  The current tests are less tests and more an exploration of why our modularity calculation differed from the python-louvain modularity calculation

* Updating tests and documentation

* Wanted to make sure we handled disconnected nodes appropriately

* Added release notes

* Restricting us to the 2.x versions of Sphinx, 3.0 breaks us

Co-authored-by: Dwayne Pryce <dwpryce@microsoft.com>
This commit is contained in:
Dwayne Pryce 2020-05-06 13:51:54 -07:00 коммит произвёл GitHub
Родитель f1b43627d6
Коммит da0440722e
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: 4AEE18F83AFDEB23
8 изменённых файлов: 159360 добавлений и 14 удалений

Просмотреть файл

@ -1,4 +1,6 @@
# Release Notes
## 0.1.3
- Added `modularity` and `modularity_components` functions, and deprecated `q_score`.
## 0.1.2
- Rename `self_loop_augmentation` to `diagonal_augmentation` and use weighted degree to perform calculation instead of degree only.
- Fix bug when getting the length of edges when performing graph augmentations.

Просмотреть файл

@ -52,7 +52,7 @@ setuptools.setup(
'pytest',
'flake8',
'mypy',
'sphinx',
'sphinx>=2.4.4,<3.0.0',
'sphinx-rtd-theme',
'testfixtures',
'recommonmark'

Просмотреть файл

@ -4,8 +4,24 @@
import networkx as nx
import numpy as np
from topologic import PartitionedGraph
from topologic.partition import q_score
from topologic.partition import modularity, modularity_components, q_score
import unittest
from typing import Dict
import community # python-louvain module
from tests.utils import data_file
def _modularity_graph() -> nx.Graph:
    """
    Build the small weighted, undirected fixture graph shared by the modularity tests.

    The graph has two connected components: a-b-c (weights 4.0 and 3.0) and e-f (weight 5.0).

    :return: A freshly constructed graph; callers may mutate it freely.
    """
    weighted_edges = (
        ("a", "b", 4.0),
        ("b", "c", 3.0),
        ("e", "f", 5.0),
    )
    fixture = nx.Graph()
    for source, target, edge_weight in weighted_edges:
        fixture.add_edge(source, target, weight=edge_weight)
    return fixture
# Vertex -> community id mapping for the graph built by _modularity_graph():
# community 0 holds the a-b-c component and community 1 holds the e-f component.
_PARTITIONS: Dict[str, int] = {'a': 0, 'b': 0, 'c': 0, 'e': 1, 'f': 1}
class TestModularity(unittest.TestCase):
@ -14,13 +30,56 @@ class TestModularity(unittest.TestCase):
q_score("foo")
def test_q_score(self):
graph = nx.Graph()
graph.add_edge("a", "b", weight=4.0)
graph.add_edge("b", "c", weight=3.0)
graph.add_edge("e", "f", weight=5.0)
graph = _modularity_graph()
partition = {'a': 0, 'b': 0, 'c': 0, 'e': 1, 'f': 1}
partition = _PARTITIONS
part_graph = PartitionedGraph(graph, partition)
modularity = q_score(part_graph)
self.assertIsInstance(modularity, float)
np.testing.assert_almost_equal(0.48611111111111105, modularity)
modularity_value = q_score(part_graph)
self.assertIsInstance(modularity_value, float)
np.testing.assert_almost_equal(0.48611111111111105, modularity_value)
def test_modularity(self):
    """
    modularity() on the small fixture graph must match the hand-computed value.
    """
    # Worked example (total edge weight m = 4 + 3 + 5 = 12.0):
    #   community 0: internal edge weight 7, summed degree 14
    #       7 / 12 - (14 / 24) ** 2 = 0.5833333333333334 - 0.34027777777777785 = 0.24305555555555552
    #   community 1: internal edge weight 5, summed degree 10
    #       5 / 12 - (10 / 24) ** 2 = 0.4166666666666667 - 0.17361111111111113 = 0.24305555555555555
    #   total: 0.48611111111111105
    expected = 0.48611111111111105
    actual = modularity(_modularity_graph(), _PARTITIONS)
    np.testing.assert_almost_equal(expected, actual)
def test_modularity_components(self):
    """
    Compare the sum of modularity_components() against python-louvain on a large graph, and check that a
    vertex with no edges, alone in its own community, contributes a component of exactly 0.
    """
    graph = nx.Graph()
    with open(data_file("large-graph.csv"), "r") as edge_list_io:
        for row in edge_list_io:
            source, target, raw_weight = row.strip().split(",")
            # The edge list may repeat a (source, target) pair; repeated weights are accumulated.
            existing = graph.get_edge_data(source, target, {"weight": 0})
            graph.add_edge(source, target, weight=float(raw_weight) + existing["weight"])

    with open(data_file("large-graph-partitions.csv"), "r") as communities_io:
        rows = (row.strip().split(",") for row in communities_io)
        partitions = {vertex: int(comm) for vertex, comm in rows}

    # Put an isolated vertex into a brand new community id (one past the largest id in the file).
    isolated_community = max(partitions.values()) + 1
    graph.add_node("disconnected_node")
    partitions["disconnected_node"] = isolated_community

    components = modularity_components(graph, partitions)
    actual_total = sum(components.values())
    # Reference value from the python-louvain package.
    expected_total = community.modularity(partitions, graph)

    # Every community id, including the isolated one, must be reported.
    self.assertSetEqual(set(components.keys()), set(partitions.values()))
    self.assertEqual(0, components[isolated_community])

    # The two implementations are only compared to 3 decimal places. The original author attributed the
    # difference to python-louvain halving networkx degrees before summing and to math.pow versus `**`;
    # NOTE(review): that explanation has not been verified here.
    np.testing.assert_almost_equal(expected_total, actual_total, decimal=3)

Разница между файлами не показана из-за своего большого размера Загрузить разницу

151981
tests/test_data/large-graph.csv Normal file

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -88,3 +88,14 @@ def validate_minimal_graph(
raise ValueError("The graph provided has no edges")
if not nx.is_weighted(graph, weight=weight_attribute):
raise ValueError("The graph provided is not fully weighted")
def assert_is_undirected(graph: nx.Graph):
    """
    Verify that the provided graph is undirected.

    :param graph: Graph to check; only its ``is_directed()`` method is consulted
    :raises ValueError: If the graph reports itself as directed
    """
    if not graph.is_directed():
        return
    raise ValueError("graph must be an undirected graph")

Просмотреть файл

@ -1,8 +1,14 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from .modularity import q_score
from .modularity import modularity, modularity_components, q_score
from .louvain_stub import louvain
from .induce import induce_graph_by_communities
__all__ = ['induce_graph_by_communities', 'louvain', 'q_score']
__all__ = [
'induce_graph_by_communities',
'louvain',
'modularity',
'modularity_components',
'q_score'
]

Просмотреть файл

@ -1,7 +1,12 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import math
import networkx as nx
import community
from collections import defaultdict
from typing import Any, Dict
import warnings
from .. import assertions
from ..partitioned_graph import PartitionedGraph
@ -11,13 +16,15 @@ def q_score(
weight_column: str = 'weight'
) -> float:
"""
Deprecated: See modularity() for replacement.
Given a topologic PartitionedGraph, return the q score - or modularity of a graph.
See also: https://en.wikipedia.org/wiki/Modularity_(networks)
:param graph: Partitioned graph contains a dictionary of all the communities in a graph, optimized for
:param partitioned_graph: Partitioned graph contains a dictionary of all the communities in a graph, optimized for
best modularity. This partition structure is used when computing final q_score / modularity of graph.
:type partitioned_graph: topologic.PartitionedGraph
:type partitioned_graph: Optional[topologic.PartitionedGraph]
:param str weight_column: weight column to use in computing modularity.
:raise UnweightedGraphError: if graph does not contain weight_column in edge attributes
:raise KeyError: If the partition is not a partition of all graph nodes. This should not occur if PartitionedGraph
@ -27,6 +34,10 @@ def q_score(
:return: q_score, or modularity, of this graph using the provided partitioning scheme.
:rtype: float
"""
warnings.warn(
"topologic.partition.q_score() has been deprecated in favor of topologic.partition.modularity()",
DeprecationWarning
)
if isinstance(partitioned_graph, PartitionedGraph):
partition = partitioned_graph.community_partitions
extracted_graph = partitioned_graph.graph
@ -35,3 +46,96 @@ def q_score(
raise TypeError('Expected type topologic.PartitionedGraph')
return community.modularity(partition, extracted_graph, weight_column)
def modularity(
    graph: nx.Graph,
    partitions: Dict[Any, int],
    weight_attribute: str = "weight",
    resolution: float = 1.0
) -> float:
    """
    Calculate the modularity of an undirected, weighted graph for a given community partitioning.

    The result is the sum of the per-community values returned by modularity_components().

    See also: https://en.wikipedia.org/wiki/Modularity_(networks)

    :param nx.Graph graph: An undirected graph
    :param Dict[Any, int] partitions: A dictionary representing a community partitioning scheme with the keys being the
        vertex and the value being a community id. Within topologic, these community ids are required to be ints.
    :param str weight_attribute: The edge data attribute on the graph that contains a float weight for the edge.
    :param float resolution: The resolution to use when calculating the modularity.
    :return: The modularity quality score for the given network and community partition schema.
    :rtype: float
    :raise TypeError: If the graph is not a networkx Graph
    :raise ValueError: If the graph is unweighted
    :raise ValueError: If the graph is directed
    """
    # Validate up front so that callers get the errors documented above.
    assertions.assert_is_graph(graph)
    assertions.assert_is_weighted(graph, weight_attribute)
    assertions.assert_is_undirected(graph)

    per_community = modularity_components(graph, partitions, weight_attribute, resolution)
    return sum(per_community.values())
def _modularity_component(
    degree_sum_within_community: float,
    degree_sum: float,
    total_network_edge_weight: float,
    resolution: float
) -> float:
    """
    Compute the modularity contribution of a single community.

    :param float degree_sum_within_community: Weight total for edges with both endpoints in the community
    :param float degree_sum: Degree total for the community
    :param float total_network_edge_weight: Sum of all edge weights in the network
    :param float resolution: Multiplier applied to the squared degree term
    :return: ``degree_sum_within_community / total_network_edge_weight`` minus ``resolution`` times the square
        of ``degree_sum / (2.0 * total_network_edge_weight)``
    :rtype: float
    :raise ZeroDivisionError: If total_network_edge_weight is 0
    """
    internal_fraction = degree_sum_within_community / total_network_edge_weight
    degree_fraction = degree_sum / (2.0 * total_network_edge_weight)
    squared_degree_fraction = math.pow(degree_fraction, 2.0)
    return internal_fraction - resolution * squared_degree_fraction
def modularity_components(
    graph: nx.Graph,
    partitions: Dict[Any, int],
    weight_attribute: str = "weight",
    resolution: float = 1.0
) -> Dict[int, float]:
    """
    Given an undirected, weighted graph and a community partition dictionary, calculates a modularity quantum for each
    community ID. The sum of these quanta is the modularity of the graph and partitions provided.

    :param nx.Graph graph: An undirected graph
    :param Dict[Any, int] partitions: A dictionary representing a community partitioning scheme with the keys being the
        vertex and the value being a community id. Within topologic, these community ids are required to be ints.
    :param str weight_attribute: The edge data attribute on the graph that contains a float weight for the edge.
    :param float resolution: The resolution to use when calculating the modularity.
    :return: A dictionary of the community id to the modularity component of that community
    :rtype: Dict[int, float]
    :raise TypeError: If the graph is not a networkx Graph
    :raise ValueError: If the graph is unweighted
    :raise ValueError: If the graph is directed
    :raise KeyError: If an endpoint of any edge in the graph is missing from partitions
    """
    assertions.assert_is_graph(graph)
    assertions.assert_is_weighted(graph, weight_attribute)
    assertions.assert_is_undirected(graph)

    total_edge_weight = 0.0
    # Taken from the partition values, so communities whose vertices have no edges are still reported.
    communities = set(partitions.values())

    degree_sums_within_community: Dict[int, float] = defaultdict(float)
    degree_sums_for_community: Dict[int, float] = defaultdict(float)
    for vertex, neighbor_vertex, weight in graph.edges(data=weight_attribute):
        vertex_community = partitions[vertex]
        neighbor_community = partitions[neighbor_vertex]
        if vertex_community == neighbor_community:
            if vertex == neighbor_vertex:
                # NOTE(review): self-loops are counted at twice their weight here, while python-louvain
                # appears to count a self-loop's weight once - confirm this doubling is intended.
                degree_sums_within_community[vertex_community] += weight * 2.0
            else:
                degree_sums_within_community[vertex_community] += weight
            # Both endpoints are in the same community, so it gains the degree of both.
            degree_sums_for_community[vertex_community] += weight * 2.0
        else:
            # An edge between two communities adds `weight` of degree to each endpoint's community.
            # Crediting both halves to the first endpoint's community would make the result depend on
            # the order in which networkx happens to report the edge's endpoints.
            degree_sums_for_community[vertex_community] += weight
            degree_sums_for_community[neighbor_community] += weight
        total_edge_weight += weight

    return {
        comm: _modularity_component(
            degree_sums_within_community[comm],
            degree_sums_for_community[comm],
            total_edge_weight,
            resolution
        )
        for comm in communities
    }