-
Notifications
You must be signed in to change notification settings - Fork 3
/
common.py
92 lines (67 loc) · 2.36 KB
/
common.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pathlib
import os
import pandas as pd
from joblib import dump, load
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
import numpy as np
# Default to non-verbose output; reassign to True to make log_verbose() print.
verbose = False
# Model "tag": name of the subdirectory under model_dir holding the active model set.
tag = 'default'
# List of classifier names whose targets require one-hot encoding.
one_hot_encoded = ['NeuralNet']
# Filesystem layout: model artifacts live under models/<tag>/, input data under data/.
model_dir = 'models'
model_suffix = '.joblib'
data_path = 'data'
data_suffix = '_Iq.csv'
# Path to the directory holding artifacts for the currently selected model tag.
def model_directory():
    """Return the path models/<tag> for the active model tag."""
    return pathlib.Path(model_dir) / tag
def model_file(filename):
    """Return the full path of *filename* inside the current model directory."""
    return model_directory() / filename
# Filenames of the persisted preprocessing artifacts inside the model directory.
scaler_file = 'std_scaler.bin'  # scaler object, joblib-dumped (see save_scaler/apply_scaler)
pca_file = 'pca.bin'  # optional PCA transform, joblib-dumped (see save_pca/check_apply_pca)
class_name_file = 'classes.npy'  # LabelEncoder classes_ array (see save_encoder/retrieve_class_names)
def log_verbose(*args):
    """Print *args* only when the module-level `verbose` flag is set."""
    if not verbose:
        return
    print(*args)
def preprocess_input_file(filename):
    """Read an input CSV and normalise the name of its first column to 'id'.

    Returns the DataFrame with every other column left untouched.
    """
    frame = pd.read_csv(filename)
    first_column = frame.columns[0]
    return frame.rename(columns={first_column: 'id'})
def apply_scaler(data):
    """Load the persisted scaler from the model directory and scale *data*."""
    scaler_path = model_file(scaler_file)
    fitted = load(scaler_path)
    return fitted.transform(data)
def save_scaler(scaler):
    """Persist *scaler* (compressed) into the current model directory."""
    target = model_file(scaler_file)
    dump(scaler, target, compress=True)
def check_apply_pca(data):
    """Apply the persisted PCA transform to *data* if one exists.

    PCA is an optional preprocessing step: when models/<tag>/pca.bin is
    present the transformed data is returned, otherwise *data* is returned
    unchanged.
    """
    pca_path = model_file(pca_file)
    # Use pathlib's is_file() (consistent with the rest of the module) and
    # compute the path once instead of twice.
    if not pca_path.is_file():
        return data
    log_verbose('Applying pca')
    pca = load(pca_path)
    return pca.transform(data)
def save_pca(pca):
    """Persist the fitted *pca* transform (compressed) into the model directory."""
    target = model_file(pca_file)
    dump(pca, target, compress=True)
def retrieve_class_names():
    """Rebuild the label encoder from disk and derive probability column names.

    Returns:
        tuple: (encoder, class_probability_names) where *encoder* is a
        LabelEncoder with its classes_ restored from classes.npy, and
        *class_probability_names* holds one '<class>_prob' name per class,
        in encoder order.
    """
    encoder = LabelEncoder()
    encoder.classes_ = np.load(model_file(class_name_file), allow_pickle=True)
    # Comprehension instead of the manual append loop (same order, same names).
    class_probability_names = [class_name + '_prob' for class_name in encoder.classes_]
    return encoder, class_probability_names
def save_encoder(encoder):
    """Persist the label encoder's classes_ array to the model directory."""
    target = model_file(class_name_file)
    np.save(target, encoder.classes_)
def append_predictions(clf, data, model_name, output_df):
    """Append *clf*'s class predictions and per-class probabilities to *output_df*.

    Adds one column named *model_name* containing the decoded class labels,
    plus one '<class>_prob_<model_name>' column per class. Returns *output_df*.
    """
    encoder, class_probability_names = retrieve_class_names()
    # Decode the numeric predictions back to the original class labels.
    output_df[model_name] = encoder.inverse_transform(clf.predict(data))
    probabilities = pd.DataFrame(clf.predict_proba(data), columns=class_probability_names)
    for column in class_probability_names:
        output_df[column + '_' + model_name] = probabilities[column].values
    return output_df
def get_data_files():
    """Yield every '*_Iq.csv' file directly under the data directory."""
    pattern = '*' + data_suffix
    return pathlib.Path(data_path).glob(pattern)