Moved the ARI scoring code and reworked it. The similarity module now provides the `ari` function, which compares two dictionaries of partition/clustering/community maps and returns an ARI score. The similarity module is a logical place for any future similarity algorithms that we may want to employ.

Turns out that .gitignore won't ignore changes to files that git is already tracking
This commit is contained in:
Dwayne Pryce 2020-02-19 11:58:26 -08:00
Родитель f752ddad2a
Коммит f2597381e3
6 изменённых файлов: 63 добавлений и 154 удалений

2
.gitignore поставляемый
Просмотреть файл

@ -122,5 +122,3 @@ venv.bak/
*.code-workspace
!.github/build
# ignore any changes to topologic/version/version.txt (this is easy to do - we want this file to exist, but be empty and not include any changes)
topologic/version/version.txt

Просмотреть файл

@ -1,79 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import unittest
import networkx as nx
import topologic as tc
import numpy as np
class TestAriScores(unittest.TestCase):
    """Tests for ``tc.calculate_ari`` and ``tc.calculate_ari_scores``."""

    @staticmethod
    def _graph_from(edges, clusters):
        """
        Build a graph by adding every edge first and then setting the cluster attributes on each node.

        :param edges: (source, target) pairs, added in the order given
        :param clusters: Maps a node label to its (reference_cluster, pred_cluster1, pred_cluster2) values
        :return: The populated networkx.Graph
        """
        graph = nx.Graph()
        for source, target in edges:
            graph.add_edge(source, target)
        for label, (reference, first, second) in clusters.items():
            graph.add_node(label, reference_cluster=reference, pred_cluster1=first, pred_cluster2=second)
        return graph

    def test_smoke_test_simple(self):
        """Two plain label lists are scored directly with calculate_ari."""
        reference = [1, 1, 2, 3]
        predicted = [1, 2, 2, 3]

        score: float = tc.calculate_ari(reference, predicted)

        # Check the distances
        self.assertEqual(-0.20, round(score, ndigits=1))

    def test_smoke_test(self):
        """One score is produced per predicted cluster attribute stored on the graph nodes."""
        graph = self._graph_from(
            [("one", "two"), ("two", "three"), ("three", "four"), ("four", "one")],
            {"one": (1, 1, 1), "two": (1, 2, 1), "three": (2, 2, 2), "four": (3, 3, 3)},
        )

        # Check the distance between "pred_cluster1" and "reference_cluster" as well as "pred_cluster2"
        # and "reference_cluster"
        scores: np.ndarray = tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])

        # Check the distances
        self.assertEqual(2, scores.size)
        self.assertEqual(-0.20, round(scores[0], ndigits=1))
        self.assertEqual(1.0, scores[1])

    def test_invalid_reference_cluster_type(self):
        """A reference cluster value that cannot be converted to int raises ValueError."""
        graph = self._graph_from([("one", "two")], {"one": (1, 1, 1), "two": ("one", 1, 1)})

        self.assertRaises(
            ValueError, tc.calculate_ari_scores, graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"]
        )

    def test_invalid_pred_cluster_type(self):
        """A predicted cluster value that cannot be converted to int raises ValueError."""
        graph = self._graph_from([("one", "two")], {"one": (1, 1, 1), "two": (1, 1, "one")})

        self.assertRaises(
            ValueError, tc.calculate_ari_scores, graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"]
        )

    def test_reference_cluster_not_defined(self):
        """Asking for a reference attribute that no node carries raises KeyError."""
        graph = self._graph_from([("one", "two")], {"one": (1, 1, 1), "two": ("one", 1, 1)})

        self.assertRaises(
            KeyError,
            tc.calculate_ari_scores,
            graph,
            "reference_cluster_invalid",
            ["pred_cluster1", "pred_cluster2"],
        )

    def test_pred2_cluster_not_defined(self):
        """Asking for a predicted attribute that no node carries raises KeyError."""
        graph = self._graph_from([("one", "two")], {"one": (1, 1, 1), "two": (1, 1, 1)})

        self.assertRaises(
            KeyError,
            tc.calculate_ari_scores,
            graph,
            "reference_cluster",
            ["pred_cluster1", "pred_cluster2_invalid"],
        )

    def test_graph_not_a_graph(self):
        """Passing something other than a graph raises TypeError."""
        self.assertRaises(
            TypeError,
            tc.calculate_ari_scores,
            "invalid_object",
            "reference_cluster",
            ["pred_cluster1", "pred_cluster2_invalid"],
        )

22
tests/test_similarity.py Normal file
Просмотреть файл

@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import unittest
import topologic as tc
class TestSimilarity(unittest.TestCase):
    """Tests for the functions exposed by ``tc.similarity``."""

    def test_ari(self):
        """Two partitions over the same four vertices that disagree on vertex 1 score about -0.20."""
        primary = {0: 1, 1: 1, 2: 2, 3: 3}
        candidate = {0: 1, 1: 2, 2: 2, 3: 3}

        score: float = tc.similarity.ari(primary, candidate)

        # Check the distances
        self.assertAlmostEqual(-0.20, score, places=2)

    def test_ari_wrong_sizes(self):
        """Partitions whose vertex label sets differ are rejected with ValueError."""
        primary = {"foo": 1, "bar": 2, "baz": 4}
        candidate = {"foo": 1, "baz": 4}

        with self.assertRaises(ValueError):
            tc.similarity.ari(primary, candidate)

Просмотреть файл

@ -21,11 +21,11 @@ from .distance import cosine_distance, euclidean_distance, mahalanobis_distance
from .io.bipartite_graph_consolidator import consolidate_bipartite
from .io.edge_detector import find_edges
from .ari_scores import calculate_ari_scores, calculate_ari
from .io.potential_edge_column_pair import PotentialEdgeColumnPair
from .io.graph_properties import GraphProperties
from .scree_plot import find_elbows
from . import similarity
from . import io
from . import projection
from . import statistics

Просмотреть файл

@ -1,72 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import networkx as nx
from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np
from typing import Any, Iterable
from .assertions import assert_is_graph
def calculate_ari_scores(
    graph: nx.Graph,
    reference_cluster_attribute: Any,
    predicted_clusters: Iterable[str]
) -> np.ndarray:
    """
    Calculates the Adjusted Rand Index of one or more predicted clusterings against a reference clustering,
    where every clustering is stored as a node attribute on the graph.

    The Rand index is a measure of similarity between two clusters. See https://en.wikipedia.org/wiki/Rand_index

    This method assumes that multiple clustering algorithms have been run over the graph and the cluster IDs
    are stored in the graph nodes as values convertible to int.

    :param networkx.Graph graph: NetworkX graph
    :param Any reference_cluster_attribute: Attribute on the node that contains the cluster ID. Value of attribute
        should be an integer
    :param Iterable[str] predicted_clusters: Iterable of node attribute names that contain cluster IDs. Value at each
        attribute should be an integer. If we are only comparing the reference cluster with one predicted cluster then
        this is an iterable with a single value.
    :return: Array of scores. One score will be returned for each predicted cluster in the predicted_clusters
        input iterable. An empty iterable yields an empty array.
    :rtype: numpy.ndarray
    :raises ValueError: When node[reference_cluster_attribute] or node[predicted_clusters[x]] cannot be
        converted to an int.
    :raises KeyError: When reference_cluster_attribute or any attribute in predicted_clusters is not defined
        on any of the graph nodes.
    :raises TypeError: When graph is not a networkx.Graph object
    """
    assert_is_graph(graph)

    def _cluster_ids(attribute: Any) -> list:
        # One cluster ID per node, in graph node order, so that every labelling lines up position by position.
        # Collected in a single pass instead of re-allocating an array for every node.
        return [int(node_attributes[attribute]) for _, node_attributes in graph.nodes(data=True)]

    reference_ids = _cluster_ids(reference_cluster_attribute)

    scores = []
    for cluster_id_attribute in predicted_clusters:
        predicted_ids = _cluster_ids(cluster_id_attribute)
        scores.append(calculate_ari(reference_ids, predicted_ids))

    # Always hand back a float ndarray, including when no predicted clusterings were requested.
    return np.array(scores, dtype=float)
def calculate_ari(
    reference_clusters: Iterable[int],
    predicted_clusters: Iterable[int]
) -> float:
    """
    Calculates the Adjusted Rand Index between two cluster labelings.

    The Rand index is a measure of similarity between two clusters. See https://en.wikipedia.org/wiki/Rand_index

    Both arguments are handed unchanged to ``sklearn.metrics.cluster.adjusted_rand_score``; the reference
    labels are passed first and the predicted labels second.

    :param Iterable[int] reference_clusters: Cluster IDs treated as the reference labeling
    :param Iterable[int] predicted_clusters: Cluster IDs to score against the reference labeling
    :return: The adjusted rand index for the two labelings
    :rtype: float
    """
    score = adjusted_rand_score(reference_clusters, predicted_clusters)
    return score

40
topologic/similarity.py Normal file
Просмотреть файл

@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
from sklearn.metrics.cluster import adjusted_rand_score
from typing import Any, Dict
import numpy as np
__all__ = ["ari"]
def ari(
    primary_partition: Dict[Any, int],
    test_partition: Dict[Any, int],
) -> float:
    """
    Calculate the Adjusted Rand Index of a test partition mapping scored against a primary partition mapping.

    The primary partition is the most accurate representation of truth available; the test partition is the
    mapping being scored against it. See https://en.wikipedia.org/wiki/Rand_index

    :param Dict[Any, int] primary_partition: The most accurate representation of truth for cluster or community
        membership of nodes. The keys are vertex labels and the values are the cluster/community/partition labels.
    :param Dict[Any, int] test_partition: The partition mapping to compare against the primary partition. The keys are
        vertex labels and the values are the cluster/community/partition labels.
    :return: The adjusted rand index for the two mappings
    :rtype: float
    :raises ValueError: If the primary partition and test partition do not have an identical vertex label set.
    """
    vertices = primary_partition.keys()
    if vertices != test_partition.keys():
        raise ValueError("The reference partition provided does not contain the exact same keys as the predicted "
                         "clusters; an ari score cannot be generated automatically.")

    # Walk the primary partition's key order for both arrays so that position i in each refers to the same vertex.
    ordered_vertices = list(vertices)
    labels_true = np.array([primary_partition[vertex] for vertex in ordered_vertices], dtype=int)
    labels_pred = np.array([test_partition[vertex] for vertex in ordered_vertices], dtype=int)

    return adjusted_rand_score(labels_true=labels_true, labels_pred=labels_pred)