-
Notifications
You must be signed in to change notification settings - Fork 65
/
optics.cl
executable file
·272 lines (250 loc) · 10.7 KB
/
optics.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
;;; -*- mode: lisp; syntax: common-lisp -*-
;;; Clustering Method : OPTICS
;;; NAGANUMA Shigeta, 2009
;;; Reference:
;;; Mihael Ankerst, Markus M. Breuning, Hans-Peter Kriegel and Jorg Sander.
;;; "OPTICS:Ordering Points To Identify the Clustering Structure."
;;; Institute for Computer Science, University of Munich
(defpackage :optics
(:use :cl
:hjs.learn.read-data
:hjs.util.matrix
:statistics)
(:export :optics
:optics-main
:make-optics-input))
(in-package :optics)
(defclass optics-input ()
((input-data :initform nil :initarg :input-data :accessor input-data)
(distance :initform nil :initarg :distance :accessor distance)
(epsilon :initform nil :initarg :epsilon :accessor epsilon)
(min-pts :initform nil :initarg :min-pts :accessor min-pts)
(r-epsilon :initform nil :initarg :r-epsilon :accessor r-epsilon)
(normalize :initform nil :initarg :normalize :accessor normalize)
(target-cols :initform nil :initarg :target-cols :accessor target-cols)
(point-objs :initform nil :initarg :point-objs :accessor point-objs)
))
(defclass optics-point-object ()
((id :initform nil :initarg :id :accessor id)
(c-id :initform nil :initarg :c-id :accessor c-id)
(processed :initform nil :initarg :processed :accessor processed)
(reachability-d :initform nil :initarg :reachability-d
:accessor reachability-d)
(core-d :initform nil :initarg :core-d :accessor core-d)
(optics-input :initform nil :initarg :optics-input :accessor optics-input)))
(defclass optics-output ()
((ordered-data :initform nil :initarg :ordered-data
:accessor ordered-data)
(cluster-info :initform nil :initarg :cluster-info
:accessor cluster-info)))
(defmethod print-object ((o optics-output) stream)
(with-accessors ((info cluster-info)) o
(print-unreadable-object (o stream :type t :identity nil))
(format stream "~&[ClusterID] SIZE |~{ [~A] ~A~^ |~}~%"
(loop for (c-id . size) in info
append `(,c-id ,size)))))
(defclass order-seeds ()
((seeds :initform nil :initarg :seeds :accessor seeds)))
;;; optics function
;;; optics
;;; -input
;;; input-path 入力ファイルのパス, :csv or :sexp (1行目はカラム名, 以降データ)
;;; epsilon 近傍半径
;;; min-pts 近傍最小データ数
;;; r-epsilon reachability への閾値
;;; target-cols 対象カラム(sequence)
;;; &key
;;; file-type :csv or :sexp
;;; csv-type-spec csvファイルの各列の型 string double-float integer etc.
;;; distance def. of distance :manhattan, :euclid or :cosine
;;; normalize t or nil (now only nil)
;;; -output
;;; object of class :optics-output
(defun optics (input-path epsilon min-pts r-epsilon
target-cols &key (file-type :sexp)
(csv-type-spec '(string double-float double-float))
(distance :manhattan)
(normalize nil)
(external-format :default))
(assert (plusp epsilon))
(assert (plusp min-pts))
(assert (and (plusp r-epsilon) (<= r-epsilon epsilon)))
(assert target-cols)
(let* ((dataset
(read-data-from-file input-path
:type file-type
:csv-type-spec csv-type-spec
:external-format external-format))
(range (map 'list
#'(lambda (col-name)
(dimension-index
(find col-name (dataset-dimensions dataset)
:key #'dimension-name :test #'string-equal)))
target-cols)))
(setq dataset
(pick-and-specialize-data dataset
:range range
:data-types
(make-list (length range)
:initial-element :numeric)))
(%optics dataset epsilon min-pts r-epsilon
:distance distance :normalize normalize)))
(defmethod %optics ((numeric-dataset numeric-dataset) epsilon min-pts r-epsilon
&key (distance :manhattan) normalize)
(when normalize
(setf (dataset-numeric-points numeric-dataset)
(standardize (dataset-numeric-points numeric-dataset))))
(let* ((optics-input (make-optics-input
(dataset-numeric-points numeric-dataset)
epsilon min-pts r-epsilon
:distance distance :normalize normalize))
(optics-output (make-instance 'optics-output)))
(optics-main optics-input optics-output)
optics-output))
(defun optics-main (optics-input optics-output)
(loop for object across (point-objs optics-input)
when (not (processed object))
do (let* ((neighbors (get-neighbors optics-input object))
(order-seeds
(make-instance 'order-seeds :seeds nil)))
(setf (processed object) t)
(add optics-output object)
(when (set-core-d object neighbors)
(update order-seeds neighbors object)
(loop while (seeds order-seeds)
as current-obj = (next order-seeds)
as neighbors = (get-neighbors optics-input current-obj)
do (setf (processed current-obj) t)
(add optics-output current-obj)
(when (set-core-d current-obj neighbors)
(update order-seeds neighbors current-obj))))))
(cluster-numbering optics-output optics-input))
(defun make-optics-input (input-data epsilon min-pts r-epsilon
&key (distance :manhattan)
(normalize nil))
(assert (<= r-epsilon epsilon))
(setq distance
(case distance
(:manhattan #'hjs.util.vector:manhattan-distance)
(:euclid #'hjs.util.vector:euclid-distance)
(:cosine #'hjs.util.vector:cosine-distance)
(t (error "illegal name for distance | ~A" distance))))
(let* ((optics-input (make-instance 'optics-input
:input-data input-data :distance distance
:epsilon epsilon :min-pts min-pts
:r-epsilon r-epsilon :normalize normalize))
(point-objs
(coerce
(loop for point across input-data
for id from 0
collect (make-instance 'optics-point-object
:id id :optics-input optics-input))
'vector)))
(setf (point-objs optics-input) point-objs)
optics-input))
(defmethod point ((obj optics-point-object))
(svref (input-data (optics-input obj)) (id obj)))
(defmethod get-neighbors ((input optics-input) object)
(let* ((d (distance input))
(e (epsilon input))
(line-id (id object))
(target-vals (point object)))
(coerce
(loop for point-obj across (point-objs input)
for i from 0
as obj-vals = (point point-obj)
as distance = (funcall d target-vals obj-vals)
when (and (/= i line-id)
(<= distance e))
collect point-obj)
'vector)))
(defmethod set-core-d ((object optics-point-object)
neighbors)
(let* ((input (optics-input object))
(d (distance input))
(minpts (min-pts input)))
(if (>= (length neighbors) minpts)
(let* ((d-list (sort (map 'list
#'(lambda (obj)
(funcall d
(point object)
(point obj)))
neighbors)
#'<)))
(setf (core-d object)
(car (last (subseq d-list 0 minpts)))))
(setf (core-d object) nil))
(core-d object)))
(defmethod update ((o-seeds order-seeds) neighbors center-obj)
(let* ((d (distance (optics-input center-obj)))
(c-d (core-d center-obj)))
(assert (numberp c-d))
(loop for obj across neighbors
when (not (processed obj))
do (let ((r-dist (max c-d (funcall d
(point obj)
(point center-obj)))))
(cond ((not (reachability-d obj))
(setf (reachability-d obj) r-dist)
(insert o-seeds obj))
((< r-dist (reachability-d obj))
(setf (reachability-d obj) r-dist)
(decrease o-seeds obj)))))))
(defmethod insert ((o-seeds order-seeds) obj)
(if (seeds o-seeds)
(setf (seeds o-seeds)
(merge 'list
(sort (seeds o-seeds) #'< :key #'reachability-d)
`(,obj)
#'< :key #'reachability-d))
(setf (seeds o-seeds) `(,obj))))
(defmethod decrease ((o-seeds order-seeds) obj)
(declare (ignore obj))
(setf (seeds o-seeds)
(sort (seeds o-seeds) #'< :key #'reachability-d)))
(defmethod next ((o-seeds order-seeds))
(let* ((seeds (seeds o-seeds))
(obj (car seeds))
(next-sds (cdr seeds)))
(setf (seeds o-seeds) next-sds)
obj))
(defmethod add ((output optics-output) obj)
(setf (ordered-data output)
(append (ordered-data output) `(,obj))))
(defmethod cluster-numbering ((output optics-output)
input &key (a 1.01))
(let* ((r-e (r-epsilon input))
(a-e (* a (epsilon input))))
(setf (ordered-data output)
(coerce
(cons
#("ID" "reachability" "core distance" "ClusterID")
(loop for obj in (ordered-data output)
as r = (or (reachability-d obj) a-e)
as c = (or (core-d obj) a-e)
as id = (id obj)
with c-id = 0
with cluster-info
collect
(if (> r r-e)
(if (<= c r-e)
(progn
(incf c-id)
(if (assoc c-id cluster-info :test #'=)
(incf (cdr (assoc c-id cluster-info :test #'=)))
(push (cons c-id 1) cluster-info))
`#(,id ,a-e ,c ,c-id))
(progn
(if (assoc -1 cluster-info :test #'=)
(incf (cdr (assoc -1 cluster-info :test #'=)))
(push (cons -1 1) cluster-info))
`#(,id ,a-e ,a-e -1)))
(progn
(if (assoc c-id cluster-info :test #'=)
(incf (cdr (assoc c-id cluster-info :test #'=)))
(push (cons c-id 1) cluster-info))
`#(,id ,r ,c ,c-id)))
finally
(setf (cluster-info output)
(sort cluster-info #'< :key #'car))))
'vector))))