-
Notifications
You must be signed in to change notification settings - Fork 0
/
cohort_analysis.py
103 lines (82 loc) · 3.64 KB
/
cohort_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import warnings

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from statsmodels.regression.linear_model import OLS
from statsmodels.tools import add_constant
def feature_extraction(filepath=None):
    """
    Load the cohort CSV and turn the raw weekly counts into model features.

    input: a data file path (anything pandas.read_csv accepts) for a CSV with the
           columns 'Start Date', 'Users', 'Week_0' .. 'Week_4' and 'Week_8'
    output: a cleaned dataframe indexed by 'Start Date' with the columns
            Cohort_size (natural log of 'Users'), Week_1 .. Week_4 and y ('Week_8')
            expressed as a percentage of Week_0, Month (month name of the index)
            and Cohort_active_users (Week_0 as a percentage of 'Users')
    raises: ValueError when no filepath is given
    """
    if filepath is None:
        raise ValueError('feature_extraction requires a filepath')

    week_cols = ['Week_0', 'Week_1', 'Week_2', 'Week_3', 'Week_4', 'y']
    df = pd.read_csv(filepath, parse_dates=True, index_col=['Start Date'],
                     usecols=['Start Date', 'Users', 'Week_0', 'Week_1', 'Week_2',
                              'Week_3', 'Week_4', 'Week_8'])
    df = df.rename(columns={'Week_8': 'y', 'Users': 'Cohort_size'})

    # Drop every cohort that has a zero in any column; this also keeps the
    # divisions below away from a zero denominator. The explicit copy makes the
    # later column assignments write to an independent frame, not a slice.
    df = df.loc[~(df == 0).any(axis=1)].copy()
    df['Month'] = df.index.month_name()

    week_0 = df['Week_0']
    df['Cohort_active_users'] = week_0 / df['Cohort_size'] * 100
    # Replace whole columns (instead of writing through .loc) so the integer
    # counts can become float percentages without an in-place dtype conflict.
    df[week_cols] = df[week_cols].div(week_0, axis=0) * 100
    # Week_0 is 100% of itself after normalisation, so it carries no information.
    df = df.drop(columns='Week_0')
    df['Cohort_size'] = np.log(df['Cohort_size'])
    return df
def get_the_set(df, target_variable):
    """
    Split a dataframe into train and test features and target.

    input: a matrix with all variables and the label of the y column
    output: X_train, X_test, y_train, y_test; a quarter of the rows goes to
            the test split (random_state=42) and every piece is re-indexed from 0
    """
    features = df.drop(labels=target_variable, axis=1).copy()
    target = df[f'{target_variable}'].copy()
    pieces = train_test_split(features, target, test_size=1 / 4, random_state=42)
    # Discard the shuffled original index so each piece starts at 0 again.
    X_train, X_test, y_train, y_test = (piece.reset_index(drop=True) for piece in pieces)
    return X_train, X_test, y_train, y_test
def fit_encoder(df):
    """
    Fit a one-hot encoder on the object / category columns of df.

    input: a matrix with all variables
    output: a OneHotEncoder (drop='first') fitted on those columns
    """
    categorical = df.select_dtypes(include=['O', 'category'])
    # fit() returns the encoder itself, so build and fit in one expression.
    return OneHotEncoder(drop='first').fit(categorical)
def run_scaling(X):
    """
    Standardize every numeric column of X with a freshly fitted StandardScaler.

    input: a matrix with all variables
    output: the same dataframe object, with all numerical values scaled and every
            other column left as it was; X is returned unchanged when it has no
            numeric columns

    Note: X is modified in place, and a new scaler is fitted on each call, so the
    scaling statistics come from whatever frame is passed in.
    """
    # 'number' covers every integer / float width; the 'int' and 'float' aliases
    # resolve to platform-dependent dtypes and can silently skip columns.
    num_cols = X.select_dtypes(include='number').columns
    if num_cols.empty:
        return X
    scaler = StandardScaler()
    # Replace whole columns so integer inputs can become floats without an
    # in-place dtype conflict.
    X[num_cols] = scaler.fit_transform(X[num_cols])
    return X
def encode_categorical_variables(X, encoder):
    """
    Replace the object / category columns of X by their encoded columns.

    input: a matrix with all variables and a fitted encoder
    output: a matrix with categorical and object columns encoded with (0,1);
            X itself is returned when it has no such columns or when the encoder
            rejects them with a ValueError (a RuntimeWarning is emitted then)
    """
    cat_vars = X.select_dtypes(include=['O', 'category'])
    if cat_vars.shape[1] == 0:
        # Nothing to encode.
        return X
    try:
        transformed = encoder.transform(cat_vars)
    except ValueError as err:
        # Best effort: keep the data usable, but do not hide the reason.
        warnings.warn(f'categorical encoding skipped: {err}', RuntimeWarning, stacklevel=2)
        return X
    # Sparse results need densifying; dense results are used as they are.
    encoded = transformed.toarray() if hasattr(transformed, 'toarray') else transformed
    # get_feature_names_out is the current scikit-learn name; fall back to the
    # older get_feature_names for encoders that only provide that one.
    feature_names = getattr(encoder, 'get_feature_names_out', None) or encoder.get_feature_names
    # Reuse X's index so the join lines rows up even when X is not indexed 0..n-1.
    df_cat = pd.DataFrame(encoded, columns=feature_names(cat_vars.columns), index=X.index)
    return X.select_dtypes(exclude=['O', 'category']).join(df_cat)
# Data prep
# NOTE(review): the '...' prefix looks like a placeholder; point this at the real CSV.
data_file = '.../cohort_analysis.csv'
# 'Month' is dropped here, so no object / category column reaches the encoder below.
df = feature_extraction(data_file).drop('Month', axis=1)
X_train, X_test, y_train, y_test = get_the_set(df=df, target_variable='y')
encoder = fit_encoder(X_train)
X_train = run_scaling(X_train)
X_train = encode_categorical_variables(X_train, encoder)
# NOTE(review): run_scaling fits a new scaler on every call, so the test split is
# standardized with its own statistics rather than the training ones - confirm
# that this is intended.
X_test = run_scaling(X_test)
X_test = encode_categorical_variables(X_test, encoder)
# Model
# Regress y on the remaining features plus an intercept.
model = OLS(y_train, add_constant(X_train.drop(['Week_1', 'Week_3', 'Cohort_active_users'], axis=1))).fit()
# Print the summary: a bare expression shows nothing when run as a script.
print(model.summary())
# Test the assumption of Linear Regression
# NOTE(review): `Assumptions` is not imported in the visible part of this file, and
# the tester gets the full X_train, not the reduced matrix the model was fitted on.
tester = Assumptions.Assumption_Tester_OLS(X_train, y_train)
tester.run_all()