# utils.py (forked from bazingagin/npc_gzip)
from collections.abc import Sequence

import numpy as np
import scipy.stats
import torch


def NCD(c1: float, c2: float, c12: float) -> float:
    """
    Calculates the Normalized Compression Distance (NCD).
    Arguments:
        c1 (float): The compressed length of the first object.
        c2 (float): The compressed length of the second object.
        c12 (float): The compressed length of the concatenation of the first and second objects.
    Returns:
        float: The Normalized Compression Distance between c1 and c2.
    Formula:
        NCD(c1, c2, c12) = (c12 - min(c1, c2)) / max(c1, c2)
    """
    distance = (c12 - min(c1, c2)) / max(c1, c2)
    return distance
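

# Illustrative sketch (not part of the original module): NCD is typically fed
# gzip-compressed lengths, as in the npc_gzip setup. The helper name
# `gzip_ncd` below is hypothetical.
#
#     import gzip
#
#     def gzip_ncd(t1: str, t2: str) -> float:
#         c1 = len(gzip.compress(t1.encode("utf-8")))
#         c2 = len(gzip.compress(t2.encode("utf-8")))
#         c12 = len(gzip.compress((t1 + " " + t2).encode("utf-8")))
#         return NCD(c1, c2, c12)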


def CLM(c1: float, c2: float, c12: float) -> float:
    """
    Calculates the Chen-Li Metric (CLM), a compression-based distance.
    Arguments:
        c1 (float): The compressed length of the first object.
        c2 (float): The compressed length of the second object.
        c12 (float): The compressed length of the concatenation of the first and second objects.
    Returns:
        float: The Chen-Li Metric value between c1 and c2.
    Formula:
        CLM(c1, c2, c12) = 1 - (c1 + c2 - c12) / c12
    """
    dis = 1 - (c1 + c2 - c12) / c12
    return dis


def CDM(c1: float, c2: float, c12: float) -> float:
    """
    Calculates the Compression-based Dissimilarity Measure (CDM).
    Arguments:
        c1 (float): The compressed length of the first object.
        c2 (float): The compressed length of the second object.
        c12 (float): The compressed length of the concatenation of the first and second objects.
    Returns:
        float: The Compression-based Dissimilarity Measure value between c1 and c2.
    Formula:
        CDM(c1, c2, c12) = c12 / (c1 + c2)
    """
    dis = c12 / (c1 + c2)
    return dis
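

# Worked comparison of the three compression distances on the same
# illustrative lengths (c1=10, c2=12, c12=16):
#     NCD(10, 12, 16) == (16 - 10) / 12          == 0.5
#     CLM(10, 12, 16) == 1 - (10 + 12 - 16) / 16 == 0.625
#     CDM(10, 12, 16) == 16 / (10 + 12)          == 16/22 ≈ 0.727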


def MSE(v1: np.ndarray, v2: np.ndarray) -> float:
    """
    Calculates the Mean Squared Error (MSE).
    Arguments:
        v1 (np.ndarray): The first array.
        v2 (np.ndarray): The second array.
    Returns:
        float: The Mean Squared Error value, representing the average squared difference between v1 and v2.
    Formula:
        MSE(v1, v2) = Σ((v1 - v2) ** 2) / len(v1)
    """
    return np.sum((v1 - v2) ** 2) / len(v1)
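

# Example usage (illustrative):
#     >>> MSE(np.array([1.0, 2.0]), np.array([3.0, 2.0]))
#     2.0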


def agg_by_concat_space(t1: str, t2: str) -> str:
    """
    Combines `t1` and `t2` with a space.
    Arguments:
        t1 (str): First item.
        t2 (str): Second item.
    Returns:
        str: `{t1} {t2}`
    """
    return t1 + " " + t2


def agg_by_jag_word(t1: str, t2: str) -> str:
    """
    Interleaves `t1` and `t2` word by word, alternately taking an
    even-indexed word from `t1` and the following odd-indexed word from
    `t2`, then appending the tail of `t1` when it is the longer string.
    Arguments:
        t1 (str): First item.
        t2 (str): Second item.
    Returns:
        str: The interleaved words, joined by spaces.
    """
    t1_list = t1.split(" ")
    t2_list = t2.split(" ")
    combined = []
    # Guard: keeps the tail slice below defined even when the loop never runs.
    i = 0
    minimum_list_size = min(len(t1_list), len(t2_list))
    for i in range(0, minimum_list_size - 1, 2):
        combined.append(t1_list[i])
        combined.append(t2_list[i + 1])
    if len(t1_list) > len(t2_list):
        combined += t1_list[i:]
    return " ".join(combined)


def agg_by_jag_char(t1: str, t2: str) -> str:
    """
    Interleaves `t1` and `t2` character by character, alternately taking an
    even-indexed character from `t1` and the following odd-indexed character
    from `t2`, then appending the tail of `t1` when it is the longer string.
    Arguments:
        t1 (str): First item.
        t2 (str): Second item.
    Returns:
        str: The interleaved characters, joined without a separator.
    """
    t1_list = list(t1)
    t2_list = list(t2)
    combined = []
    # Guard: keeps the tail slice below defined even when the loop never runs.
    i = 0
    minimum_list_size = min(len(t1_list), len(t2_list))
    for i in range(0, minimum_list_size - 1, 2):
        combined.append(t1_list[i])
        combined.append(t2_list[i + 1])
    if len(t1_list) > len(t2_list):
        combined += t1_list[i:]
    return "".join(combined)


def aggregate_strings(stringa: str, stringb: str, by_character: bool = False) -> str:
    """
    Aggregates strings by interleaving their characters, alternately taking
    an even-indexed character from `stringa` and the following odd-indexed
    character from `stringb`, then appending the tail of `stringa` when it
    is the longer string.
    Arguments:
        stringa (str): First item.
        stringb (str): Second item.
        by_character (bool): If True, joins the interleaved characters
            directly; otherwise joins them with spaces.
    Returns:
        str: Combination of stringa and stringb.
    """
    lista = list(stringa)
    listb = list(stringb)
    combined = []
    # Guard: keeps the tail slice below defined even when the loop never runs.
    i = 0
    minimum_list_size = min(len(lista), len(listb))
    for i in range(0, minimum_list_size - 1, 2):
        combined.append(lista[i])
        combined.append(listb[i + 1])
    if len(lista) > len(listb):
        combined += lista[i:]
    if by_character:
        return "".join(combined)
    return " ".join(combined)


def agg_by_avg(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor:
    """
    Calculates the element-wise average of i1 and i2, truncating toward zero.
    Arguments:
        i1 (torch.Tensor): First series of numbers.
        i2 (torch.Tensor): Second series of numbers.
    Returns:
        torch.Tensor: Average of the two series of numbers.
    """
    return torch.div(i1 + i2, 2, rounding_mode="trunc")
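

# Example usage (illustrative): integer tensors are averaged with truncation:
#     >>> agg_by_avg(torch.tensor([1, 5]), torch.tensor([2, 8]))
#     tensor([1, 6])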


def agg_by_min_or_max(
    i1: torch.Tensor,
    i2: torch.Tensor,
    aggregate_by_minimum: bool = False,
) -> torch.Tensor:
    """
    Takes the element-wise minimum or maximum of i1 and i2.
    Arguments:
        i1 (torch.Tensor): First series of numbers.
        i2 (torch.Tensor): Second series of numbers.
        aggregate_by_minimum (bool): True if you want to take the minimum of the two series.
            False if you want to take the maximum instead.
    Returns:
        torch.Tensor: Element-wise minimum or maximum of the two series.
    """
    stacked = torch.stack([i1, i2], dim=0)
    if aggregate_by_minimum:
        return torch.min(stacked, dim=0)[0]
    return torch.max(stacked, dim=0)[0]
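

# Example usage (illustrative):
#     >>> agg_by_min_or_max(torch.tensor([1, 5]), torch.tensor([3, 2]), aggregate_by_minimum=True)
#     tensor([1, 2])
#     >>> agg_by_min_or_max(torch.tensor([1, 5]), torch.tensor([3, 2]))
#     tensor([3, 5])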


def agg_by_stack(i1: torch.Tensor, i2: torch.Tensor) -> torch.Tensor:
    """
    Combines `i1` and `i2` via `torch.stack`.
    Arguments:
        i1 (torch.Tensor): First series of numbers.
        i2 (torch.Tensor): Second series of numbers.
    Returns:
        torch.Tensor: Stack of the two series.
    """
    return torch.stack([i1, i2])


def mean_confidence_interval(data: Sequence, confidence: float = 0.95) -> tuple:
    """
    Computes the mean of `data` and the half-width of its confidence interval
    at level `confidence`, using the Student's t-distribution.
    Arguments:
        data (Sequence): Data to compute a confidence interval over.
        confidence (float): Confidence level for the interval.
    Returns:
        tuple: (mean, margin of error), so the interval is mean ± margin.
    """
    if isinstance(data, np.ndarray):
        array = data
    else:
        array = np.array(data, dtype=np.float32)
    n = array.shape[0]
    mean = np.mean(array)
    standard_error = scipy.stats.sem(array)
    quantile = scipy.stats.t.ppf((1 + confidence) / 2.0, n - 1)
    return mean, standard_error * quantile
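

# Example usage (illustrative): for [1, 2, 3, 4] the mean is 2.5 and the 95%
# margin of error is sem * t ≈ 0.6455 * 3.1824 ≈ 2.05:
#     >>> mean, h = mean_confidence_interval([1.0, 2.0, 3.0, 4.0])
#     >>> (round(float(mean), 2), round(float(h), 2))
#     (2.5, 2.05)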