-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathclustering_metrics.pyx
executable file
·180 lines (145 loc) · 5.06 KB
/
clustering_metrics.pyx
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
from __future__ import division
import math
import numpy as np
import sys
cimport cython
cimport numpy as np
class _GroupDict(dict):
def __getitem__(self, key):
if key not in self:
self[key] = []
return dict.__getitem__(self, key)
def load_graph(input_file):
graph = {}
with open(input_file, 'r') as fin:
for line in fin:
tokens = [ int(x)-1 for x in line.split() ]
graph[tokens[0]] = tokens[1:]
return graph
def load_community(input_file):
clusters = []
with open(input_file, 'r') as fin:
for line in fin:
clusters.append(int(line.split()[1]))
return np.array(clusters)
def load_community_groups(input_file):
clusters_groups = _GroupDict()
clusters = []
with open(input_file, 'r') as fin:
for line in fin:
node_id, cluster_id = line.split()
clusters_groups[int(cluster_id)].append(int(node_id)-1)
clusters.append(int(cluster_id))
return np.array(clusters), clusters_groups
@cython.boundscheck(False)
@cython.wraparound(False)
def _get_min_a_ci(graph, np.ndarray[np.int_t , ndim=1] clusters, int ci):
cdef int a_ci, a_non_ci, node, neighbor_sum
a_ci = 0
a_non_ci = 0
for node in graph:
neighbors_num = len(graph[node])
if clusters[node] == ci:
a_ci += neighbors_num
else:
a_non_ci += neighbors_num
return min(a_ci, a_non_ci)
@cython.boundscheck(False)
@cython.wraparound(False)
def _get_cut_edges_a(graph, ci_nodes,
np.ndarray[np.int_t , ndim=1] clusters, int ci):
cdef int sum_cut_edge_a, node, neighbor
sum_cut_edge_a = 0
for node in ci_nodes:
for neighbor in graph[node]:
if clusters[neighbor] != ci:
sum_cut_edge_a += 1
return sum_cut_edge_a
@cython.boundscheck(False)
@cython.wraparound(False)
def conductance(graph,
np.ndarray[np.int_t , ndim=1] clusters, clusters_groups):
cdef int ci
cdef double min_a_ci, cut_a_ci
conductances = []
for ci in clusters_groups:
min_a_ci = _get_min_a_ci(graph, clusters, ci)
cut_a_ci = _get_cut_edges_a(graph, clusters_groups[ci], clusters, ci)
if min_a_ci > 0:
conductances.append([len(clusters_groups[ci]),
cut_a_ci/min_a_ci])
# return for each cluster its size and its conductance value
return conductances
#---------- RI/NMI -----------#
@cython.boundscheck(False)
@cython.wraparound(False)
def ri(np.ndarray[np.int_t , ndim=1] real,
np.ndarray[np.int_t , ndim=1] cluster):
cdef int i, j
cdef long tp, fn, fp, tn
cdef int loop = real.shape[0]
tp = 0
fn = 0
fp = 0
tn = 0
for i in range(loop):
for j in range(i+1, loop):
if cluster[i] == cluster[j]:
if real[i] == real[j]:
tp += 1 # true positive
else:
fn += 1 # false negative
else:
if real[i] == real[j]:
fp += 1 # false positive
else:
tn += 1 # true negative
return (tp+tn)/(tp+fn+fp+tn)
@cython.boundscheck(False)
@cython.wraparound(False)
def nmi(np.ndarray[np.int_t, ndim=1] real,
np.ndarray[np.int_t, ndim=1] cluster):
cdef int i, j
cdef int nodes_size = real.shape[0]
cdef int r_max_cl = real.max()+1
cdef int a_max_cl = cluster.max()+1
cdef np.ndarray[np.float64_t, ndim=1] probc
probc = np.zeros(a_max_cl, dtype=np.dtype(float))
cdef np.ndarray[np.float64_t, ndim=1] probcl
probcl = np.zeros(r_max_cl, dtype= np.dtype(float))
cdef int probc_size = probc.shape[0]
cdef int probcl_size = probcl.shape[0]
cdef np.ndarray[np.float64_t, ndim=2] probinter
probinter = np.zeros([probc_size, probcl_size], dtype= np.dtype(float))
cdef int alg_cluster, real_cluster
cdef int node_loop = real.shape[0]
for i in range(node_loop):
alg_cluster = int(cluster[i])
real_cluster = int(real[i])
probc[alg_cluster] += 1
probcl[real_cluster] += 1
probinter[alg_cluster][real_cluster] += 1
for i in range(probc_size):
probc[i] = probc[i] / nodes_size
for i in range(probcl_size):
probcl[i] = probcl[i] / nodes_size
for i in range(probc_size):
for j in range(probcl_size):
probinter[i][j] = probinter[i][j] / nodes_size
cdef double in_value = 0
for i in range(probc_size):
for j in range(probcl_size):
if probinter[i][j] != 0 and probc[i] !=0 and probcl[j] != 0:
in_value += probinter[i][j] * \
math.log10(probinter[i][j] / (probc[i]*probcl[j]))
cdef double hc_value = 0
for i in range(probc_size):
if probc[i] != 0:
hc_value += probc[i] * math.log10(probc[i])
hc_value = -hc_value
cdef double hcl_value = 0
for i in range(probcl_size):
if probcl[i] != 0:
hcl_value += probcl[i] * math.log10(probcl[i])
hcl_value = -hcl_value
return 2 * (in_value / (hc_value+hcl_value))