-
Notifications
You must be signed in to change notification settings - Fork 26
/
ndtest.py
160 lines (135 loc) · 5.08 KB
/
ndtest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
from __future__ import division
import numpy as np
from numpy import random
from scipy.spatial.distance import pdist, cdist
from scipy.stats import kstwobign, pearsonr
from scipy.stats import genextreme
__all__ = ['ks2d2s', 'estat', 'estat2d']
def ks2d2s(x1, y1, x2, y2, nboot=None, extra=False):
    '''Two-dimensional Kolmogorov-Smirnov test on two samples.
    Parameters
    ----------
    x1, y1 : ndarray, shape (n1, )
        Data of sample 1.
    x2, y2 : ndarray, shape (n2, )
        Data of sample 2. Size of two samples can be different.
    nboot : None or int
        Number of bootstrap resamples to estimate the p-value. A large number is expected.
        If None, an approximate analytic estimate will be used.
    extra: bool, optional
        If True, KS statistic is also returned. Default is False.
    Returns
    -------
    p : float
        Two-tailed p-value.
    D : float, optional
        KS statistic, returned if keyword `extra` is True.
    Raises
    ------
    ValueError
        If the x and y coordinates of either sample differ in length.
    Notes
    -----
    This is the two-sided K-S test. Small p-values means that the two samples are significantly different.
    Note that the p-value is only an approximation as the analytic distribution is unknown. The approximation
    is accurate enough when N > ~20 and p-value < ~0.20 or so. When p-value > 0.20, the value may not be accurate,
    but it certainly implies that the two samples are not significantly different. (cf. Press 2007)
    References
    ----------
    Peacock, J.A. 1983, Two-Dimensional Goodness-of-Fit Testing in Astronomy, MNRAS, 202, 615-627
    Fasano, G. and Franceschini, A. 1987, A Multidimensional Version of the Kolmogorov-Smirnov Test, MNRAS, 225, 155-170
    Press, W.H. et al. 2007, Numerical Recipes, section 14.8
    '''
    # Explicit validation instead of `assert`, which is silently stripped
    # when Python runs with the -O flag.
    if len(x1) != len(y1) or len(x2) != len(y2):
        raise ValueError('x and y arrays of each sample must have the same length')
    n1, n2 = len(x1), len(x2)
    D = avgmaxdist(x1, y1, x2, y2)
    if nboot is None:
        # Analytic approximation (Press 2007, sec. 14.8): rescale the KS
        # statistic by the average x-y correlation of the two samples, then
        # look up the asymptotic two-sided KS distribution.
        sqen = np.sqrt(n1 * n2 / (n1 + n2))
        r1 = pearsonr(x1, y1)[0]
        r2 = pearsonr(x2, y2)[0]
        r = np.sqrt(1 - 0.5 * (r1**2 + r2**2))
        d = D * sqen / (1 + r * (0.25 - 0.75 / sqen))
        p = kstwobign.sf(d)
    else:
        # Bootstrap: pool both samples, resample n points with replacement
        # as one joint index vector, and split it back into two
        # pseudo-samples of the original sizes.
        n = n1 + n2
        x = np.concatenate([x1, x2])
        y = np.concatenate([y1, y2])
        d = np.empty(nboot, 'f')
        for i in range(nboot):
            idx = random.choice(n, n, replace=True)
            ix1, ix2 = idx[:n1], idx[n1:]
            d[i] = avgmaxdist(x[ix1], y[ix1], x[ix2], y[ix2])
        # Fraction of bootstrap statistics exceeding the observed one.
        p = np.sum(d > D).astype('f') / nboot
    if extra:
        return p, D
    else:
        return p
def avgmaxdist(x1, y1, x2, y2):
    '''Symmetrized KS statistic: average of the two directional statistics,
    anchoring the quadrant counts on sample 1 and on sample 2 in turn.'''
    forward = maxdist(x1, y1, x2, y2)
    backward = maxdist(x2, y2, x1, y1)
    return 0.5 * (forward + backward)
def maxdist(x1, y1, x2, y2):
    '''Maximum difference of the two samples' quadrant fractions, taken over
    every point of sample 1 used as the quadrant origin.'''
    npts = len(x1)
    diffs = np.empty((npts, 4))
    for k, (px, py) in enumerate(zip(x1, y1)):
        quad1 = quadct(px, py, x1, y1)
        quad2 = quadct(px, py, x2, y2)
        diffs[k] = [q1 - q2 for q1, q2 in zip(quad1, quad2)]
    # Re-assign the origin point itself to whichever side maximizes the
    # difference; the discrepancy is significant for N < ~50.
    diffs[:, 0] -= 1 / npts
    return max(-diffs.min(), diffs.max() + 1 / npts)
def quadct(x, y, xx, yy):
    '''Fractions of the points (xx, yy) falling in each of the four
    quadrants around the origin (x, y); ties go to the lower-left side.'''
    total = len(xx)
    left = xx <= x
    below = yy <= y
    a = np.sum(left & below) / total
    b = np.sum(left & ~below) / total
    c = np.sum(~left & below) / total
    # Remaining mass; avoids a fourth boolean reduction.
    return a, b, c, 1 - a - b - c
def estat2d(x1, y1, x2, y2, **kwds):
    '''Energy test on two 2-D samples given as separate coordinate vectors.'''
    sample1 = np.column_stack([x1, y1])
    sample2 = np.column_stack([x2, y2])
    return estat(sample1, sample2, **kwds)
def estat(x, y, nboot=1000, replace=False, method='log', fitting=False):
    '''
    Energy distance statistics test.
    Reference
    ---------
    Aslan, B, Zech, G (2005) Statistical energy as a tool for binning-free
    multivariate goodness-of-fit tests, two-sample comparison and unfolding.
    Nuc Instr and Meth in Phys Res A 537: 626-636
    Szekely, G, Rizzo, M (2014) Energy statistics: A class of statistics
    based on distances. J Stat Planning & Infer 143: 1249-1272
    Brian Lau, multdist, https://github.com/brian-lau/multdist
    '''
    n = len(x)
    total = n + len(y)
    # Pool the samples and standardize each column (zero mean, unit std).
    pooled = np.vstack([x, y])
    pooled = (pooled - pooled.mean(0)) / pooled.std(0)

    if replace:
        def resample(size):
            return random.randint(size, size=size)
    else:
        resample = random.permutation

    en = energy(pooled[:n], pooled[n:], method)
    en_boot = np.zeros(nboot, 'f')
    for i in range(nboot):
        idx = resample(total)
        en_boot[i] = energy(pooled[idx[:n]], pooled[idx[n:]], method)

    if fitting:
        # Approximate the null distribution with a generalized extreme
        # value fit instead of the raw empirical tail.
        param = genextreme.fit(en_boot)
        p = genextreme.sf(en, *param)
        return p, en, param
    p = (en_boot >= en).sum() / nboot
    return p, en, en_boot
def energy(x, y, method='log'):
dx, dy, dxy = pdist(x), pdist(y), cdist(x, y)
n, m = len(x), len(y)
if method == 'log':
dx, dy, dxy = np.log(dx), np.log(dy), np.log(dxy)
elif method == 'gaussian':
raise NotImplementedError
elif method == 'linear':
pass
else:
raise ValueError
z = dxy.sum() / (n * m) - dx.sum() / n**2 - dy.sum() / m**2
# z = ((n*m)/(n+m)) * z # ref. SR
return z