Mirror of https://github.com/microsoft/topologic.git
Moved the ARI scoring code and reworked it. The similarity module now provides an ari function, which compares two dictionaries of partition/clustering/community maps and returns an ARI score. This similarity module is a logical place for any future similarity algorithms we may want to employ.
Turns out that .gitignore won't ignore changes to files that Git is already tracking.
Parent: f752ddad2a
Commit: f2597381e3
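As a quick illustration of the new entry point described in the commit message, here is a minimal usage sketch (the vertex labels, cluster ids, and printed value are made up for illustration; only the tc.similarity.ari call itself comes from this change):

import topologic as tc

# Two partition maps over the same vertex labels; values are cluster/community ids.
reference = {"a": 1, "b": 1, "c": 2, "d": 3}
predicted = {"a": 1, "b": 2, "c": 2, "d": 3}

# Returns 1.0 for identical partitions and can be negative for discordant ones.
score = tc.similarity.ari(reference, predicted)
print(score)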
@@ -122,5 +122,3 @@ venv.bak/
*.code-workspace

!.github/build
# ignore any changes to topologic/version/version.txt (this is easy to do - we want this file to exist, but be empty and not include any changes)
topologic/version/version.txt
@@ -1,79 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import unittest
import networkx as nx
import topologic as tc
import numpy as np


class TestAriScores(unittest.TestCase):
    def test_smoke_test_simple(self):
        a = [1, 1, 2, 3]
        b = [1, 2, 2, 3]

        score: float = tc.calculate_ari(a, b)

        # Check the distances
        self.assertEqual(-0.20, round(score, ndigits=1))

    def test_smoke_test(self):
        graph = nx.Graph()
        graph.add_edge("one", "two")
        graph.add_edge("two", "three")
        graph.add_edge("three", "four")
        graph.add_edge("four", "one")

        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        graph.add_node("two", reference_cluster=1, pred_cluster1=2, pred_cluster2=1)
        graph.add_node("three", reference_cluster=2, pred_cluster1=2, pred_cluster2=2)
        graph.add_node("four", reference_cluster=3, pred_cluster1=3, pred_cluster2=3)

        # Check the distance between "pred_cluster1" and "reference_cluster" as well as "pred_cluster2"
        # and "reference_cluster"
        result: np.ndarray = tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])

        # Check the distances
        self.assertEqual(2, result.size)
        self.assertEqual(-0.20, round(result[0], ndigits=1))
        self.assertEqual(1.0, result[1])

    def test_invalid_reference_cluster_type(self):
        graph = nx.Graph()
        graph.add_edge("one", "two")

        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        graph.add_node("two", reference_cluster="one", pred_cluster1=1, pred_cluster2=1)
        with self.assertRaises(ValueError):
            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])

    def test_invalid_pred_cluster_type(self):
        graph = nx.Graph()
        graph.add_edge("one", "two")

        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        graph.add_node("two", reference_cluster=1, pred_cluster1=1, pred_cluster2="one")
        with self.assertRaises(ValueError):
            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2"])

    def test_reference_cluster_not_defined(self):
        graph = nx.Graph()
        graph.add_edge("one", "two")

        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        graph.add_node("two", reference_cluster="one", pred_cluster1=1, pred_cluster2=1)
        with self.assertRaises(KeyError):
            tc.calculate_ari_scores(graph, "reference_cluster_invalid", ["pred_cluster1", "pred_cluster2"])

    def test_pred2_cluster_not_defined(self):
        graph = nx.Graph()
        graph.add_edge("one", "two")

        graph.add_node("one", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        graph.add_node("two", reference_cluster=1, pred_cluster1=1, pred_cluster2=1)
        with self.assertRaises(KeyError):
            tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1", "pred_cluster2_invalid"])

    def test_graph_not_a_graph(self):
        with self.assertRaises(TypeError):
            tc.calculate_ari_scores("invalid_object", "reference_cluster", ["pred_cluster1", "pred_cluster2_invalid"])
@@ -0,0 +1,22 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import unittest
import topologic as tc


class TestSimilarity(unittest.TestCase):
    def test_ari(self):
        a = {0: 1, 1: 1, 2: 2, 3: 3}
        b = {0: 1, 1: 2, 2: 2, 3: 3}

        score: float = tc.similarity.ari(a, b)

        # Check the distances
        self.assertAlmostEqual(-0.20, score, places=2)

    def test_ari_wrong_sizes(self):
        a = {"foo": 1, "bar": 2, "baz": 4}
        b = {"foo": 1, "baz": 4}

        self.assertRaises(ValueError, tc.similarity.ari, a, b)
@@ -21,11 +21,11 @@ from .distance import cosine_distance, euclidean_distance, mahalanobis_distance

from .io.bipartite_graph_consolidator import consolidate_bipartite
from .io.edge_detector import find_edges
from .ari_scores import calculate_ari_scores, calculate_ari
from .io.potential_edge_column_pair import PotentialEdgeColumnPair
from .io.graph_properties import GraphProperties
from .scree_plot import find_elbows

from . import similarity
from . import io
from . import projection
from . import statistics
@@ -1,72 +0,0 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import networkx as nx
from sklearn.metrics.cluster import adjusted_rand_score
import numpy as np
from typing import Any, Iterable
from .assertions import assert_is_graph


def calculate_ari_scores(
    graph: nx.Graph,
    reference_cluster_attribute: Any,
    predicted_clusters: Iterable[str]
) -> np.ndarray:
    """
    Calculates the Adjusted Rand Index for a graph.

    The Rand index is a measure of similarity between two clusterings. See https://en.wikipedia.org/wiki/Rand_index

    This method assumes that multiple clustering algorithms have been run over the graph and that the cluster IDs
    are stored on the graph nodes as integers.

    :param networkx.Graph graph: NetworkX graph
    :param Any reference_cluster_attribute: Attribute on the node that contains the cluster ID. The value of the
        attribute should be an integer.
    :param Iterable[str] predicted_clusters: Iterable of node attribute names that contain cluster IDs. The value at
        each attribute should be an integer. If we are only comparing the reference cluster with one predicted
        cluster, this is an iterable with a single value.
    :return: Array of scores. One score is returned for each predicted cluster in the predicted_clusters
        input iterable.
    :raises ValueError: When node[reference_cluster_attribute] or node[predicted_clusters[x]] cannot be
        converted to an int.
    :raises KeyError: When reference_cluster_attribute or any attribute in predicted_clusters is not defined
        on any of the graph nodes.
    :raises TypeError: When graph is not a networkx.Graph object.
    """

    assert_is_graph(graph)

    clusters_reference: np.ndarray = []
    for node in graph.nodes():
        clusters_reference = np.append(clusters_reference, int(graph.nodes()[node][reference_cluster_attribute]))

    ari_scores_list: np.ndarray = []
    for cluster_id_attribute in predicted_clusters:
        louvain_clusters: np.ndarray = []
        for node in graph.nodes():
            louvain_clusters = np.append(louvain_clusters, int(graph.nodes()[node][cluster_id_attribute]))

        ari_scores_list = np.append(ari_scores_list, calculate_ari(clusters_reference, louvain_clusters))
    return ari_scores_list


def calculate_ari(
    reference_clusters: Iterable[int],
    predicted_clusters: Iterable[int]
) -> float:
    """
    Calculates the Adjusted Rand Index for two lists.

    The Rand index is a measure of similarity between two clusterings. See https://en.wikipedia.org/wiki/Rand_index

    :param Iterable[int] reference_clusters: An Iterable[int] of cluster IDs
    :param Iterable[int] predicted_clusters: An Iterable[int] of cluster IDs
    :return: The adjusted rand index for the two lists
    :rtype: float
    """
    return adjusted_rand_score(reference_clusters, predicted_clusters)
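For reviewers updating call sites, here is a hypothetical before/after sketch of the migration from the removed graph-attribute API to the new dictionary API (the graph and attribute names mirror the removed tests; the dict-building comprehensions are illustrative, not part of this change):

import networkx as nx
import topologic as tc

graph = nx.Graph()
graph.add_edge("one", "two")
graph.add_node("one", reference_cluster=1, pred_cluster1=1)
graph.add_node("two", reference_cluster=1, pred_cluster1=2)

# Old API (removed by this commit):
# scores = tc.calculate_ari_scores(graph, "reference_cluster", ["pred_cluster1"])

# New API: build plain dicts of node -> cluster id and compare them directly.
reference = {node: data["reference_cluster"] for node, data in graph.nodes(data=True)}
predicted = {node: data["pred_cluster1"] for node, data in graph.nodes(data=True)}
score = tc.similarity.ari(reference, predicted)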
@@ -0,0 +1,40 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

from sklearn.metrics.cluster import adjusted_rand_score
from typing import Any, Dict
import numpy as np

__all__ = ["ari"]


def ari(
    primary_partition: Dict[Any, int],
    test_partition: Dict[Any, int],
) -> float:
    """
    Given two partition schemas, a primary partition mapping (the most accurate representation of truth) and a test
    partition mapping (to be scored against that representation of truth), calculate the Adjusted Rand Index.

    See https://en.wikipedia.org/wiki/Rand_index

    :param Dict[Any, int] primary_partition: The most accurate representation of truth for cluster or community
        membership of nodes. The keys are vertex labels and the values are the cluster/community/partition labels.
    :param Dict[Any, int] test_partition: The partition mapping to compare against the primary partition. The keys are
        vertex labels and the values are the cluster/community/partition labels.
    :return: The adjusted rand index for the two mappings
    :rtype: float
    :raises ValueError: If the primary partition and test partition do not have an identical vertex label set.
    """
    if primary_partition.keys() != test_partition.keys():
        raise ValueError("The reference partition provided does not contain the exact same keys as the predicted "
                         "clusters; an ari score cannot be generated automatically.")

    size = len(primary_partition.keys())
    primary = np.empty(size, dtype=int)
    test = np.empty(size, dtype=int)
    for i, vertex in enumerate(primary_partition.keys()):
        primary[i] = primary_partition[vertex]
        test[i] = test_partition[vertex]

    return adjusted_rand_score(labels_true=primary, labels_pred=test)
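One design note worth calling out: both label arrays are filled in the iteration order of primary_partition's keys, and each value is looked up by key in its own dict, so a test_partition whose keys were inserted in a different order still scores correctly. A small sketch, with made-up values:

import topologic as tc

primary = {"x": 1, "y": 1, "z": 2}
shuffled = {"z": 2, "x": 1, "y": 1}  # same mapping, different insertion order

# Identical partitions score exactly 1.0 regardless of key order.
assert tc.similarity.ari(primary, shuffled) == 1.0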