This repository has been archived by the owner on Feb 23, 2021. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 15
/
cluster.py
181 lines (139 loc) · 6.13 KB
/
cluster.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#import _version
from pandas import DataFrame, Series
import pandas as pd
import numpy as np
import warnings
from numbers import Integral
class KMeansPlusPlus:
def __init__(self, data_frame, k, columns=None, max_iterations=None,
appended_column_name=None):
if not isinstance(data_frame, DataFrame):
raise Exception("data_frame argument is not a pandas DataFrame")
elif data_frame.empty:
raise Exception("The given data frame is empty")
if max_iterations is not None and max_iterations <= 0:
raise Exception("max_iterations must be positive!")
if not isinstance(k, Integral) or k <= 0:
raise Exception("The value of k must be a positive integer")
self.data_frame = data_frame # m x n
self.numRows = data_frame.shape[0] # m
# k x n, the i,j entry being the jth coordinate of center i
self.centers = None
# m x k , the i,j entry represents the distance
# from point i to center j
# (where i and j start at 0)
self.distance_matrix = None
# Series of length m, consisting of integers 0,1,...,k-1
self.clusters = None
# To keep track of clusters in the previous iteration
self.previous_clusters = None
self.max_iterations = max_iterations
self.appended_column_name = appended_column_name
self.k = k
if columns is None:
self.columns = data_frame.columns
else:
for col in columns:
if col not in data_frame.columns:
raise Exception(
"Column '%s' not found in the given DataFrame" % col)
if not self._is_numeric(col):
raise Exception(
"The column '%s' is either not numeric or contains NaN values" % col)
self.columns = columns
def _populate_initial_centers(self):
rows = []
rows.append(self._grab_random_point())
distances = None
while len(rows) < self.k:
if distances is None:
distances = self._distances_from_point(rows[0])
else:
distances = self._distances_from_point_list(rows)
normalized_distances = distances / distances.sum()
normalized_distances.sort_values()
dice_roll = np.random.rand()
min_over_roll = normalized_distances[
normalized_distances.cumsum() >= dice_roll].min()
index = normalized_distances[
normalized_distances == min_over_roll].index[0]
rows.append(self.data_frame[self.columns].iloc[index, :])
self.centers = DataFrame(rows, columns=self.columns)
def _compute_distances(self):
if self.centers is None:
raise Exception(
"Must populate centers before distances can be calculated!")
column_dict = {}
for i in list(range(self.k)):
column_dict[i] = self._distances_from_point(
self.centers.iloc[i, :])
self.distance_matrix = DataFrame(
column_dict, columns=list(range(self.k)))
def _get_clusters(self):
if self.distance_matrix is None:
raise Exception(
"Must compute distances before closest centers can be calculated")
min_distances = self.distance_matrix.min(axis=1)
# We need to make sure the index
min_distances.index = list(range(self.numRows))
cluster_list = [boolean_series.index[j]
for boolean_series in
[
self.distance_matrix.iloc[i,
:] == min_distances.iloc[i]
for i in list(range(self.numRows))
]
for j in list(range(self.k))
if boolean_series[j]
]
self.clusters = Series(cluster_list, index=self.data_frame.index)
def _compute_new_centers(self):
if self.centers is None:
raise Exception("Centers not initialized!")
if self.clusters is None:
raise Exception("Clusters not computed!")
for i in list(range(self.k)):
self.centers.ix[i, :] = self.data_frame[
self.columns].ix[self.clusters == i].mean()
def cluster(self):
self._populate_initial_centers()
self._compute_distances()
self._get_clusters()
counter = 0
while True:
counter += 1
self.previous_clusters = self.clusters.copy()
self._compute_new_centers()
self._compute_distances()
self._get_clusters()
if self.max_iterations is not None and counter >= self.max_iterations:
break
elif all(self.clusters == self.previous_clusters):
break
if self.appended_column_name is not None:
try:
self.data_frame[self.appended_column_name] = self.clusters
except:
warnings.warn(
"Unable to append a column named %s to your data." %
self.appended_column_name)
warnings.warn(
"However, the clusters are available via the cluster attribute")
def _distances_from_point(self, point):
# pandas Series
return np.power(self.data_frame[self.columns] - point, 2).sum(axis=1)
def _distances_from_point_list(self, point_list):
result = None
for point in point_list:
if result is None:
result = self._distances_from_point(point)
else:
result = pd.concat(
[result, self._distances_from_point(point)], axis=1).min(axis=1)
return result
def _grab_random_point(self):
index = np.random.random_integers(0, self.numRows - 1)
# NumPy array
return self.data_frame[self.columns].iloc[index, :].values
def _is_numeric(self, col):
return all(np.isreal(self.data_frame[col])) and not any(np.isnan(self.data_frame[col]))