# clone.py (forked from LBM-EPFL/CLoNe)
import operator

import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.stats import gaussian_kde

class CLoNe(object):
    '''
    CLoNe clustering.

    Parameters
    ----------
    pdc: float, optional, default: 4
        The only input parameter.
        pdc is a percentage of all distances between neighbours.
        It is used to extract the cut-off distance dc for the Gaussian kernel:
        dc is the distance at that percentile of the distances.
        Rule of thumb:
        > If you think there should be more clusters, decrease this value (integer increments are usually enough).
        > If you think there should be fewer clusters, increase this value.
    n_resize: int, optional, default: 4
        Decreases the number of neighbours computed for each point to N / n_resize.
        Only a few neighbours are needed to compute what CLoNe requires.
        While entirely optional, this can be used to decrease computing time and memory use.
        If set too high, some important points may not have the neighbours they need.
    filt: float, optional, default: 0.1
        Points whose local density is below this fraction (10% by default) of the
        local density of their cluster center are classified as outliers.
        Setting it to 0 disables outlier removal (and removal of noise clusters as well).
        Change this value if you think it needs tuning.
    verbose: boolean, optional, default: False
        Enables verbose output of CLoNe.

    Attributes
    ----------
    labels_: array, shape (n_samples,)
        Labels of each point after outlier removal.
    labels_all: array, shape (n_samples,)
        Labels of each point before outlier removal.
    centers: list
        Indices of the cluster centers.
    core_card: array, shape (n_samples,)
        Core cardinality of each data point.
    rho: array, shape (n_samples,)
        Local density of each point, as computed by a Gaussian kernel.

    Examples
    --------
    See the Jupyter notebook "clone_examples" or "run_test.py".
    '''
    def __init__(self, pdc=4, n_resize=4, filt=0.1, verbose=False):
        self.pdc = pdc
        self.n_resize = n_resize
        self.filt = filt
        self.verbose = verbose
    def fit(self, vectors):
        # 1. Build the neighbour distance matrix.
        # Reduce the size of the distance matrix to be computed
        # (reduces memory use and speeds things up).
        nb_el = len(vectors)
        effective_nb_el = int(nb_el / self.n_resize)
        if self.verbose:
            print("> Computing kNN...")

        # Use kNN to get neighbour distances and indices
        vectors = vectors.astype(np.float32)
        nbrs = NearestNeighbors(n_neighbors=effective_nb_el, algorithm="auto", n_jobs=1, metric='l2').fit(vectors)
        knn_dist, knn_idx = nbrs.kneighbors(vectors)
        knn_dist = knn_dist.astype(np.float32)
        knn_idx = knn_idx.astype(np.int32)

        # 2. Get the Gaussian sigma (cut-off distance dc) for the local
        # densities, based on the neighbour matrix
        position = int(round(nb_el ** 2 * self.pdc * 0.01))
        if position >= np.prod(knn_dist.shape):
            position = -1
        dc = np.sort(knn_dist.flatten())[position]
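        # Example: with nb_el = 1000 and pdc = 4, position =
        # round(1000**2 * 0.04) = 40000, i.e. dc is the 40000th smallest of
        # the computed neighbour distances. Only nb_el * effective_nb_el
        # distances exist after the n_resize truncation, hence the clamp
        # above, which falls back to the largest computed distance.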
        # 3. Compute local densities
        inv_dc = 1 / dc
        if self.verbose:
            print("> Computing local densities...")

        # Apply a Gaussian kernel to the kNN distances
        ordered_knn_dens = np.exp(-1 * np.square(knn_dist * inv_dc))

        # Get local densities by summing over rows
        summed_okd = ordered_knn_dens.cumsum(axis=1) - 1
        rho = summed_okd[:, -1].copy()

        # Weight contributions
        summed_okd /= int(round(self.pdc * 0.01 * nb_el))

        # Count contributing neighbours (= putative cluster cores)
        core_card = np.unravel_index(np.argmax(summed_okd > ordered_knn_dens, axis=1), summed_okd.shape)[1]
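        # core_card[x] is the rank of the first neighbour at which the
        # weighted cumulative density overtakes that neighbour's own kernel
        # contribution: np.argmax returns the first True along each row.
        # (np.unravel_index is effectively a no-op here, since argmax with
        # axis=1 already yields column indices smaller than the row width.)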
        # 4. Get the real cluster centers
        if self.verbose:
            print("> Identifying real centers...")
        nneigh = [0] * nb_el  # Nearest neighbour of higher density; used for assignment
        centers = []
        labels = [-1] * nb_el
        nb_clust = 0
        for x in range(0, nb_el):
            cur_idx_list = knn_idx[x]
            is_set = False
            for y in range(1, effective_nb_el):
                # If y's density is higher, record it as the nearest neighbour of higher density
                if rho[cur_idx_list[y]] > rho[x]:
                    is_set = True
                    nneigh[x] = cur_idx_list[y]
                    # If y is not in the core of x, x is a center
                    if y > core_card[x] and core_card[x]:
                        centers.append(x)
                        nb_clust += 1
                    break
            # The point of highest density is a cluster center by definition.
            # If by chance a few points share that same highest density value,
            # this takes care of it (the spiral quartet dataset is one such case).
            if not is_set:
                centers.append(x)
                nb_clust += 1

        if nb_clust == 1:
            print("Unique cluster found with set parameters.")
            self.labels_ = labels
            self.labels_all = labels
            self.centers = centers
            self.core_card = core_card
            self.rho = rho
            return

        # Sort cluster centers by local density
        to_sort = []
        for x in range(nb_clust):
            to_sort.append([rho[centers[x]], centers[x]])
        to_sort = sorted(to_sort, key=operator.itemgetter(0))
        centers = [el[1] for el in to_sort]
        for x in range(nb_clust):
            labels[centers[x]] = x
        idx_rho_sorted = np.argsort(-np.array(rho))
        # 5. Cluster assignment
        # Assign each point to the same cluster as its nearest neighbour of
        # higher density, in order of decreasing density.
        if self.verbose:
            print("> Assigning points to clusters...")
        labels = np.array(labels)
        for x in range(nb_el):
            if labels[idx_rho_sorted[x]] == -1:
                labels[idx_rho_sorted[x]] = labels[nneigh[idx_rho_sorted[x]]]
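        # Processing points by decreasing density guarantees that
        # nneigh[idx_rho_sorted[x]] has already received its label by the
        # time point idx_rho_sorted[x] is reached, so one pass suffices.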
        # 6. Merging
        outliers = np.empty(0, dtype=int)
        core_points = np.empty(0, dtype=int)
        if self.verbose:
            print("> Merging centers...")

        # Get the cluster cores...
        core_dict = {}
        for cl in centers:
            lab = labels[cl]
            core_dict[lab] = []
            for j in knn_idx[cl, :core_card[cl]]:
                core_dict[lab].append(j)

        # ... and the border points between pairs of clusters
        border_points = {}
        for c1_idx in range(1, len(centers) - 1):
            c1 = centers[c1_idx]
            lab1 = labels[c1]
            mask1 = np.where(labels == lab1)[0]
            for p1 in mask1:
                for neigh in range(effective_nb_el):
                    if knn_dist[p1, neigh] > dc:
                        break
                    p2 = knn_idx[p1, neigh]
                    lab2 = labels[p2]
                    if lab2 == -1:
                        continue
                    if lab1 != lab2:
                        # Order labels by density to merge in order + avoid duplicate keys
                        if rho[c1] > rho[centers[lab2]]:
                            key = tuple((lab2, lab1))
                        else:
                            key = tuple((lab1, lab2))
                        if key not in border_points.keys():
                            border_points[key] = []
                        border_points[key].append(p1)
                        border_points[key].append(p2)
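        # A pair of points (p1, p2) closer than dc but carrying different
        # labels marks a boundary region; collecting such pairs per cluster
        # pair lets the merging step below compare the density profile of
        # the boundary against the density profiles of the two cluster cores.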
        # Merge clusters whose Bhattacharyya coefficient exceeds 0.65
        cnt = 0
        for k in sorted(border_points.keys(), reverse=True):
            k0, k1 = k
            if labels[centers[k0]] == labels[centers[k1]]:
                continue
            border_points[k] = np.unique(border_points[k])
            mask0 = np.where(labels == labels[centers[k0]])[0]
            mask1 = np.where(labels == labels[centers[k1]])[0]
            try:
                kde_bord = gaussian_kde(core_card[border_points[k]])
                kde_core0 = gaussian_kde(core_card[core_dict[k0]])
                kde_core1 = gaussian_kde(core_card[core_dict[k1]])
            except Exception:
                continue
            mb = min(min(core_card[border_points[k]]), min(core_card[core_dict[k0]]), min(core_card[core_dict[k1]]))
            mc = max(max(core_card[border_points[k]]), max(core_card[core_dict[k0]]), max(core_card[core_dict[k1]]))
            support = np.linspace(mb, mc, 100)
            c2_0 = kde_core0.evaluate(support)
            c2_1 = kde_core1.evaluate(support)
            b2 = kde_bord.evaluate(support)
            c2_1 /= np.sum(c2_1)
            c2_0 /= np.sum(c2_0)
            b2 /= np.sum(b2)
            bc_0 = np.sum(np.sqrt(np.multiply(b2, c2_0)))
            bc_1 = np.sum(np.sqrt(np.multiply(b2, c2_1)))
            bc_01 = 0.5 * (bc_0 + bc_1)
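            # The Bhattacharyya coefficient BC(p, q) = sum_i sqrt(p_i * q_i)
            # is 1 for identical discrete distributions and 0 for disjoint
            # ones; bc_01 averages the overlap of the border distribution
            # with each core, so a high value means the boundary is about as
            # dense as the cores and the two clusters should be one.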
            if bc_01 > 0.65:
                labels[mask0] = labels[centers[k1]]
        # Clean up the centers after merging
        cur_lab = 0
        centers = []
        for ulab in np.unique(labels):
            if ulab == -1:
                continue
            mask_l = np.where(labels == ulab)[0]
            centers.append(mask_l[np.argmax(rho[mask_l])])
            labels[mask_l] = cur_lab
            cur_lab += 1

        # Get the new core points
        for cl in centers:
            lab = labels[cl]
            for j in knn_idx[cl, :core_card[cl]]:
                core_points = np.append(core_points, j)

        # 7. Removing outliers
        labels_all = labels.copy()
        outliers = np.array([], dtype=int)
        outlier_clusters = []
        if self.filt:
            if self.verbose:
                print("> Removing outliers...")
            # Recompute local densities and estimate noise; densities are
            # recomputed per cluster to avoid bias from nearby high-density
            # clusters, if applicable.
            for cl in centers:
                lab = labels[cl]
                mask = np.where(labels == lab)[0]

                # Isolate the cluster + a cut-off distance based on its neighbourhood
                cl_v = vectors[mask]
                core_dc = knn_dist[cl, core_card[cl]]

                # kNN on the cluster points only (reduced n_neighbors)
                nbrs = NearestNeighbors(n_neighbors=len(cl_v), algorithm="auto", n_jobs=1, metric='l2').fit(cl_v)
                cl_knn_dist, cl_knn_idx = nbrs.kneighbors(cl_v)

                # Gaussian kernel based on the core of the center
                cl_ordered_knn_dens = np.exp(-1 * np.square(cl_knn_dist / core_dc))

                # Get local densities by summing over rows
                cl_summed_okd = cl_ordered_knn_dens.cumsum(axis=1) - 1
                cl_rho = cl_summed_okd[:, -1]

                # Set the new densities and get a noise estimate for this cluster
                rho[mask] = cl_rho
                rho_center = rho[cl]
                rho_thresh = self.filt * rho_center
                noise_mask = np.where(cl_rho < rho_thresh)[0]
                outliers = np.append(outliers, mask[noise_mask])
                labels[mask[noise_mask]] = -1
        # If there is enough noise, check for clusters born from noise
        # fluctuations and remove them (2 points is the lower limit for the
        # KDE to run).
        if len(outliers) > 2:
            # Gather all core points
            for cl in centers:
                lab = labels[cl]
                cnt = 0
                for j in knn_idx[cl, :core_card[cl]]:
                    cnt += 1
                    core_points = np.append(core_points, j)
            kde_noise = gaussian_kde(rho[outliers])
            kde_core = gaussian_kde(rho[core_points])
            mb = np.amin(rho)
            mc = np.amax(rho)
            support = np.linspace(mb, mc, 1000)
            log_pN = np.log(len(outliers) / len(vectors))
            log_pC = np.log(len(core_points) / len(vectors))
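            # What follows is a two-class naive Bayes test in log space: for
            # each cluster, compare log P(noise) + sum(log p(rho_i | noise))
            # against log P(core) + sum(log p(rho_i | core)) over its core
            # densities, and drop the cluster if the noise model wins.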
            for cl in centers:
                lab = labels[cl]
                mask_core = knn_idx[cl, :core_card[cl]]
                rho_core = [rho[i] for i in mask_core if labels[i] == lab]
                core_size = len(rho_core)
                d_core = kde_core.evaluate(rho_core)
                d_noise = kde_noise.evaluate(rho_core)
                if len(d_noise[d_noise > 1e-16]) < core_size:
                    log_pN_pNX = -np.inf
                else:
                    log_pN_pNX = np.sum(np.log(d_noise[d_noise > 1e-16])) + log_pN
                if not len(d_core[d_core > 1e-16]):
                    log_pC_pCX = -np.inf
                else:
                    log_pC_pCX = np.sum(np.log(d_core[d_core > 1e-16])) + log_pC
                if log_pN_pNX > log_pC_pCX:
                    outlier_clusters.append(np.where(labels == lab)[0])
                    labels[labels == lab] = -1
        # Re-label after removing noise clusters
        init_c_idx = 0
        updated_centers = []
        for c in centers:
            lab = labels[c]
            if lab > -1:
                labels_all[labels_all == lab] = init_c_idx
                labels[labels == lab] = init_c_idx
                init_c_idx += 1
                updated_centers.append(c)
        centers = updated_centers

        self.rho = rho
        self.core_card = core_card
        self.centers = centers
        self.labels_ = labels
        self.labels_all = labels_all
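

# Minimal usage sketch (illustrative only: the two synthetic Gaussian blobs
# below are made up for demonstration; see "clone_examples" or "run_test.py"
# for the maintained examples).
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # Two well-separated 2-D blobs of 200 points each
    blob_a = rng.normal(loc=(0.0, 0.0), scale=0.5, size=(200, 2))
    blob_b = rng.normal(loc=(5.0, 5.0), scale=0.5, size=(200, 2))
    data = np.vstack([blob_a, blob_b])

    clone = CLoNe(pdc=4, n_resize=4, filt=0.1, verbose=True)
    clone.fit(data)
    print("Number of clusters:", len(clone.centers))
    print("Cluster labels (-1 = outlier):", np.unique(clone.labels_))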