# baseline_autoreg.py
import pandas as pd
import numpy as np
from pathlib import Path
from statsmodels.tsa.ar_model import AutoReg
from sklearn.metrics import mean_squared_error
from math import sqrt
# Input directories consumed by get_set(): each one is scanned file-by-file
# through get_frames(). dirpath_1 provides the first group of frames and
# dirpath_2 the second; the two groups are merged column-wise afterwards.
# NOTE(review): the leading '...' looks like a redacted/placeholder path
# prefix — confirm the real locations before running.
dirpath_1 = Path('.../top_events/')
dirpath_2 = Path('.../handpicks_events/')
# Aggregate all metrics
def get_frames(dirpath):
    """Yield one cleaned, date-indexed DataFrame per entry in *dirpath*.

    Each file is read as a header-less CSV whose first column holds the row
    labels, then transposed so those labels become the column names. One of
    the labels must be ``'Event'``: that column holds the timestamps and
    becomes the index, renamed to ``'DateTime'``.

    Args:
        dirpath: ``pathlib.Path`` of the directory to scan.

    Yields:
        pandas.DataFrame: one frame per directory entry, in sorted path
        order. String cells have tab characters stripped from both ends;
        non-string cells (e.g. NaN from empty fields) are left untouched.

    Raises:
        KeyError: if a file has no ``'Event'`` row.
    """
    def strip_tabs(cell):
        # Empty CSV fields arrive as float NaN, which has no .strip().
        return cell.strip('\t') if isinstance(cell, str) else cell

    # iterdir() yields entries in arbitrary order; sort so that callers which
    # treat the first frame as the reference get the same one on every run.
    for file in sorted(dirpath.iterdir()):
        df = pd.read_csv(file, index_col=0, header=None).T
        # Column-wise Series.map replaces DataFrame.applymap, which newer
        # pandas releases deprecate.
        df = df.apply(lambda column: column.map(strip_tabs))
        df = df.set_index('Event', drop=True)
        # Parse the timestamps on the index itself instead of writing parsed
        # values back into an object column through iloc, so the result is a
        # real DatetimeIndex whatever position 'Event' had in the file.
        df.index = pd.to_datetime(df.index)
        df.index.name = 'DateTime'
        yield df
def _load_event_dir(dirpath):
    """Stack every frame from *dirpath* into one numeric, date-sorted frame."""
    frames = list(get_frames(dirpath))
    if not frames:
        raise ValueError(f'no event files found in {dirpath}')
    # The first frame's column labels are imposed on all the others
    # positionally. NOTE(review): this presumes every file lists the same
    # metrics in the same order — confirm against the data.
    reference_columns = frames[0].columns.to_list()
    for frame in frames[1:]:
        frame.columns = reference_columns
    stacked = pd.concat(frames, axis=0)
    stacked.sort_index(inplace=True)
    return stacked.apply(pd.to_numeric)


def get_set():
    """Build the combined metrics table from both event directories.

    Every file under ``dirpath_1`` and ``dirpath_2`` is loaded through
    ``get_frames``. The frames of each directory are stacked row-wise,
    sorted by index and converted to numeric values; the two resulting
    tables are then joined column-wise. Any number of files per directory is
    accepted (previously exactly four were required).

    Returns:
        pandas.DataFrame: columns sorted by name, keeping only one column
        for each column name that occurs more than once.

    Raises:
        ValueError: if either directory yields no frames.
    """
    top_events = _load_event_dir(dirpath_1)
    handpicked_events = _load_event_dir(dirpath_2)

    # Deal with duplicated metrics: after the column-wise join a metric that
    # is present in both directories appears twice, so keep one column per
    # name.
    combined = pd.concat([top_events, handpicked_events], axis=1).sort_index(axis=1)
    return combined.iloc[:, ~combined.columns.duplicated()]
def get_autoregression(frame, metric, window, lags):
    """Fit an AutoReg model on one metric and print a walk-forward evaluation.

    The last 100 observations of ``frame[metric]`` form the test set; the
    model is fitted on everything before them except the very first
    observation. Each test point is then forecast one step ahead from the
    fitted coefficients and the true observations seen so far. Every
    prediction is printed next to the expected value, followed by the
    overall RMSE.

    Args:
        frame: DataFrame holding the metric as a column.
        metric: Column name of the series to model.
        window: Kept for backward compatibility and no longer used. It used
            to cap the number of lag terms in each forecast, which silently
            dropped every coefficient beyond ``window`` whenever
            ``window < lags`` and raised ``IndexError`` whenever
            ``window > lags``.
        lags: Passed straight to ``AutoReg(..., lags=lags)``.

    Returns:
        None. Results are reported on stdout only.
    """
    # train-test split
    # NOTE(review): with fewer than 100 observations the split index goes
    # negative and the slices wrap around instead of failing — confirm the
    # inputs are always long enough.
    series = frame[f'{metric}'].values
    split = series.size - 100
    train, test = series[1:split], series[split:]

    # train autoregression
    model = AutoReg(train, lags=lags)
    params = model.fit().params
    # params[0] is the intercept; the remaining entries pair up, in order,
    # with the lags the model actually includes.
    intercept, lag_coefs = params[0], params[1:]
    ar_lags = model.ar_lags or []

    # Walk forward over time steps in test. The history starts as the whole
    # training series so that every fitted lag can be looked up, and grows
    # with each true observation once its forecast has been made.
    history = list(train)
    preds = []
    for obs in test:
        yhat = intercept + sum(
            coef * history[-lag] for coef, lag in zip(lag_coefs, ar_lags)
        )
        preds.append(yhat)
        history.append(obs)
        print('predicted:', yhat, 'expected', obs)

    rmse = sqrt(mean_squared_error(test, preds))
    print('Test RMSE:', rmse)
    return None
def main():
    """Print the autoregression baseline for daily and weekly-median data.

    Loads the combined metrics table once, then evaluates the same metric
    twice: on the table as loaded (150 lags) and on its weekly median
    resample (30 lags).
    """
    metric_name = 'Tap_Challenge_Accomplished_This_Week'
    daily = get_set()

    print('RMSE daily data:')
    get_autoregression(frame=daily, metric=metric_name, window=7, lags=150)

    print('RMSE weekly resample:')
    weekly = daily.resample('W').median()
    get_autoregression(frame=weekly, metric=metric_name, window=7, lags=30)


# Run the baseline only when executed as a script, not on import.
if __name__ == '__main__':
    main()