Moved the ARI score code and reworked it. The similarity module now provides the `ari` function, which compares two dictionaries mapping vertex labels to partition/cluster/community labels and returns an Adjusted Rand Index (ARI) score. The similarity module is also the logical home for any future similarity algorithms that we may want to add.

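It turns out that .gitignore does not ignore changes to files that git is already tracking, so the ignore rule for topologic/version/version.txt had no effect and has been removed.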
2020-02-19 11:58:26 -08:00 · 2020-02-19 11:58:26 -08:00 · f2597381e3
--- a/.gitignore
+++ b/.gitignore
@ -122,5 +122,3 @@ venv.bak/
 *.code-workspace

 !.github/build
-# ignore any changes to topologic/version/version.txt (this is easy to do - we want this file to exist, but be empty and not include any changes)
-topologic/version/version.txt
--- a/tests/test_ari_scores.py
+++ b/tests/test_ari_scores.py
@ -1,79 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import unittest
-import networkx as nx
-import topologic as tc
-import numpy as np
-
-
-class TestAriScores(unittest.TestCase):
-    def test_smoke_test_simple(self):
-        a = [1, 1, 2, 3]
-        b = [1, 2, 2, 3]
-
-        score: float = tc.calculate_ari(a, b)
-
-        # Check the distances
-        self.assertEqual(-0.20, round(score, ndigits=1))
-
-    def test_smoke_test(self):
-        graph = nx.Graph()
-        graph.add_edge("one", "two")
-        graph.add_edge("two", "three")
-        graph.add_edge("three", "four")
-        graph.add_edge("four", "one")
-
-        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        graph.add_node("two", reference_cluster=1, pred_cluster1=2, pred_cluster2=1)
-        graph.add_node("three", reference_cluster=2, pred_cluster1=2, pred_cluster2=2)
-        graph.add_node("four", reference_cluster=3, pred_cluster1=3, pred_cluster2=3)
-
-        # Check the distance between "pred_cluster1" and "reference_cluster" as well as "pred_cluster2"
-        # and "reference_cluster"
-        result: np.ndarray = tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])
-
-        # Check the distances
-        self.assertEqual(2, result.size)
-        self.assertEqual(-0.20, round(result[0], ndigits=1))
-        self.assertEqual(1.0, result[1])
-
-    def test_invalid_reference_cluster_type(self):
-        graph = nx.Graph()
-        graph.add_edge("one", "two")
-
-        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        graph.add_node("two", reference_cluster="one", pred_cluster1=1, pred_cluster2=1)
-        with self.assertRaises(ValueError):
-            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])
-
-    def test_invalid_pred_cluster_type(self):
-        graph = nx.Graph()
-        graph.add_edge("one", "two")
-
-        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        graph.add_node("two", reference_cluster=1, pred_cluster1=1, pred_cluster2="one")
-        with self.assertRaises(ValueError):
-            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])
-
-    def test_reference_cluster_not_defined(self):
-        graph = nx.Graph()
-        graph.add_edge("one", "two")
-
-        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        graph.add_node("two", reference_cluster="one", pred_cluster1=1, pred_cluster2=1)
-        with self.assertRaises(KeyError):
-            tc.calculate_ari_scores(graph, "reference_cluster_invalid", ["pred_cluster1", "pred_cluster2"])
-
-    def test_pred2_cluster_not_defined(self):
-        graph = nx.Graph()
-        graph.add_edge("one", "two")
-
-        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        graph.add_node("two", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
-        with self.assertRaises(KeyError):
-            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2_invalid"])
-
-    def test_graph_not_a_graph(self):
-        with self.assertRaises(TypeError):
-            tc.calculate_ari_scores("invalid_object", "reference_cluster", ["pred_cluster1", "pred_cluster2_invalid"])
--- a/tests/test_similarity.py
+++ b/tests/test_similarity.py
@ -0,0 +1,22 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+import unittest
+import topologic as tc
+
+
+class TestSimilarity(unittest.TestCase):
+    def test_ari(self):
+        a = {0: 1, 1: 1, 2: 2, 3: 3}
+        b = {0: 1, 1: 2, 2: 2, 3: 3}
+
+        score: float = tc.similarity.ari(a, b)
+
+        # Check the distances
+        self.assertAlmostEqual(-0.20, score, places=2)
+
+    def test_ari_wrong_sizes(self):
+        a = {"foo": 1, "bar": 2, "baz": 4}
+        b = {"foo": 1, "baz": 4}
+
+        self.assertRaises(ValueError, tc.similarity.ari, a, b)
--- a/topologic/init.py
+++ b/topologic/init.py
@ -21,11 +21,11 @@ from .distance import cosine_distance, euclidean_distance, mahalanobis_distance

 from .io.bipartite_graph_consolidator import consolidate_bipartite
 from .io.edge_detector import find_edges
-from .ari_scores import calculate_ari_scores, calculate_ari
 from .io.potential_edge_column_pair import PotentialEdgeColumnPair
 from .io.graph_properties import GraphProperties
 from .scree_plot import find_elbows

+from . import similarity
 from . import io
 from . import projection
 from . import statistics
--- a/topologic/ari_scores.py
+++ b/topologic/ari_scores.py
@ -1,72 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# Licensed under the MIT license.
-
-import networkx as nx
-from sklearn.metrics.cluster import adjusted_rand_score
-import numpy as np
-from typing import Any, Iterable
-from .assertions import assert_is_graph
-
-
-def calculate_ari_scores(
-        graph: nx.Graph,
-        reference_cluster_attribute: Any,
-        predicted_clusters: Iterable[str]
-) -> np.ndarray:
-    """
-    Calculates Adjusted Rand Index for a graph.
-
-    The Rand index is a measure of similarity between two clusters.  See https://en.wikipedia.org/wiki/Rand_index
-
-    This method assumes that multiple clustering algorithms have been run over the graph and the cluster IDs
-    are stored in the graph nodes as integers.
-
-    :param networkx.Graph graph: NetworkX graph
-    :param Any reference_cluster_attribute: Attribute on the node that contains the cluster ID.  Value of attribute
-        should be an integer
-    :param Iterable[str] predicted_clusters: Iterable of node attribute names that contain cluster IDs.  Value at each
-        attribute should be an integer.  If we are only comparing the reference cluster with one predicted cluster then
-        this is an iterable with a single value.
-    :return: Array of scores.  One score will be returned for each predicted cluster in the predicted_clusters
-        input iterable.
-    :raises ValueError: When conversion node[reference_cluster_attribute] or node[predicted_clusters[x]] cannot be
-        converted to an int.
-    :raises KeyError: When reference_cluster_attribute or any attribute in predicted_clusters is not defined
-        on any of the graph nodes.
-    :raises TypeError: When graph is not a networkx.Graph object
-    """
-
-    assert_is_graph(graph)
-
-    clusters_reference: np.ndarray = []
-    for node in graph.nodes():
-        clusters_reference = np.append(clusters_reference, int((graph.nodes()[node][reference_cluster_attribute])))
-
-    ari_scores_list: np.ndarray = []
-    for cluster_id_attribute in predicted_clusters:
-        louvain_clusters: np.ndarray = []
-        for node in graph.nodes():
-            louvain_clusters = np.append(louvain_clusters, int((graph.nodes()[node][cluster_id_attribute])))
-
-        ari_scores_list = np.append(ari_scores_list, calculate_ari(clusters_reference, louvain_clusters))
-    return ari_scores_list
-
-
-def calculate_ari(
-        reference_clusters: Iterable[int],
-        predicted_clusters: Iterable[int]
-) -> float:
-    """
-    Calculates Adjusted Rand Index for two lists.
-
-    The Rand index is a measure of similarity between two clusters.  See https://en.wikipedia.org/wiki/Rand_index
-
-    This method assumes that multiple clustering algorithms have been run over the graph and the cluster IDs
-    are stored in the graph nodes as integers.
-
-    :param Iterable[int] reference_clusters: An Iterable[int] of values
-    :param Iterable[int] predicted_clusters: An Iterable[int] of values
-    :return: The adjusted rand index for the two lists
-    :rtype float:
-    """
-    return adjusted_rand_score(reference_clusters, predicted_clusters)
--- a/topologic/similarity.py
+++ b/topologic/similarity.py
@ -0,0 +1,40 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+
+from sklearn.metrics.cluster import adjusted_rand_score
+from typing import Any, Dict
+import numpy as np
+
+__all__ = ["ari"]
+
+
+def ari(
+    primary_partition: Dict[Any, int],
+    test_partition: Dict[Any, int],
+) -> float:
+    """
+    Given two partition schemas, a primary partition mapping (the most accurate representation of truth) and the test
+    partition mapping (to be scored against that accurate representation of truth), calculate the Adjusted Rand Index.
+
+    See https://en.wikipedia.org/wiki/Rand_index
+
+    :param Dict[Any, int] primary_partition: The most accurate representation of truth for cluster or community
+        membership of nodes. The keys are vertex labels and the values are the cluster/community/partition labels.
+    :param Dict[Any, int] test_partition: The partition mapping to compare against the primary partition. The keys are
+        vertex labels and the values are the cluster/community/partition labels.
+    :return: The adjusted rand index for the two mappings
+    :rtype float:
+    :raises ValueError: If the primary partition and test partition do not have an identical vertex label set.
+    """
+    if primary_partition.keys() != test_partition.keys():
+        raise ValueError("The reference partition provided does not contain the exact same keys as the predicted "
+                         "clusters; an ari score cannot be generated automatically.")
+
+    size = len(primary_partition.keys())
+    primary = np.empty(size, dtype=int)
+    test = np.empty(size, dtype=int)
+    for i, vertex in enumerate(primary_partition.keys()):
+        primary[i] = primary_partition[vertex]
+        test[i] = test_partition[vertex]
+
+    return adjusted_rand_score(labels_true=primary, labels_pred=test)