-
Notifications
You must be signed in to change notification settings - Fork 0
/
normalize.py
71 lines (60 loc) · 2.91 KB
/
normalize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, PowerTransformer, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_selector, ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
class DebugTransformer(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints a summary of the data it sees.

    ``fit`` learns nothing and ``transform`` hands back its input unchanged,
    so inserting this step only adds console output.
    """

    def __init__(self, name):
        # Label echoed in the header of every report.
        self.name = name

    def fit(self, X, y=None):
        """Do nothing and return this transformer."""
        return self

    def transform(self, X):
        """Print a report about ``X`` and return ``X`` itself."""
        print(f"\nDebug info for {self.name}:")
        print(f"Type of X: {type(X)}")
        self._report_contents(X)
        print(f"First few rows:\n{X[:5]}")
        return X

    @staticmethod
    def _report_contents(X):
        """Print the type-specific part of the report for ``X``."""
        if isinstance(X, pd.DataFrame):
            print(f"Columns and Types: \n{X.dtypes}")
            print(X.describe())
            return
        if isinstance(X, np.ndarray):
            print(f"Shape: {X.shape}")
            return
        print("Unexpected type")
def _cast_to_bool(X):
    """Return ``X`` converted to boolean dtype via ``X.astype(bool)``.

    Defined at module level instead of as a lambda so that a
    ``FunctionTransformer`` wrapping it, and therefore the whole pipeline,
    can be pickled.
    """
    return X.astype(bool)


def build_standard_normalizer(min_frequency=100, numerical_strategy='median', categorical_fill_value='NaN', verbose=False, cores=-1, indicator_type=bool):
    """Build an impute -> bool-cast -> encode/scale preprocessing pipeline.

    The returned pipeline has these steps, in order:

    1. ``'imputer'``: imputes numeric columns with ``numerical_strategy``
       (adding missing-value indicator columns) and fills object/category
       columns with ``categorical_fill_value``.
    2. ``'bool_forcer'``: casts every column whose name starts with
       ``missingindicator_`` to ``bool``.
    3. ``'normalizer'``: one-hot encodes object/category columns and applies
       a ``PowerTransformer`` to numeric columns.

    When ``verbose`` is true, a ``DebugTransformer`` is inserted before and
    after the ``'normalizer'`` step.

    NOTE(review): steps 2 and 3 pick columns with ``make_column_selector``
    (by name pattern and by dtype), which operates on DataFrames. This looks
    like it relies on pandas output being enabled, e.g.
    ``pipeline.set_output(transform="pandas")`` or the global scikit-learn
    config -- confirm against callers.

    Args:
        min_frequency: Forwarded to ``OneHotEncoder(min_frequency=...)``.
        numerical_strategy: ``SimpleImputer`` strategy for numeric columns.
        categorical_fill_value: Constant used to fill missing values in
            object/category columns.
        verbose: Forwarded to every ``ColumnTransformer`` and to the
            ``Pipeline``; also enables the debug steps.
        cores: Forwarded as ``n_jobs`` to every ``ColumnTransformer``.
        indicator_type: Output ``dtype`` of the ``OneHotEncoder``.

    Returns:
        An unfitted ``sklearn.pipeline.Pipeline``.
    """

    def build_standard_transformer(transformers):
        # Every stage shares the same ColumnTransformer settings: untouched
        # columns pass through and output names are not prefixed.
        return ColumnTransformer(
            transformers=transformers,
            remainder='passthrough',
            verbose=verbose,
            verbose_feature_names_out=False,
            n_jobs=cores,
        )

    numerical_columns = make_column_selector(dtype_include=['number'])
    categorical_columns = make_column_selector(dtype_include=['object', 'category'])

    categorical_imputer = SimpleImputer(
        missing_values=pd.NA,
        strategy='constant',
        fill_value=categorical_fill_value,
    )
    numerical_imputer = SimpleImputer(
        missing_values=pd.NA,
        strategy=numerical_strategy,
        add_indicator=True,
    )
    imputer = build_standard_transformer([
        ('impute numbers', numerical_imputer, numerical_columns),
        ('impute strings', categorical_imputer, categorical_columns),
    ])

    # A named function is used here because a FunctionTransformer built from
    # a lambda cannot be pickled.
    missingindicator_bool_forcer = build_standard_transformer([
        ('force_missingindicator_to_bool',
         FunctionTransformer(_cast_to_bool),
         make_column_selector(pattern='^missingindicator_')),
    ])

    encoder = OneHotEncoder(
        min_frequency=min_frequency,
        handle_unknown='ignore',
        sparse_output=False,
        dtype=indicator_type,
    )
    scaler = PowerTransformer()
    normalizer = build_standard_transformer([
        ('encode strings', encoder, categorical_columns),
        ('scale numbers', scaler, numerical_columns),
    ])

    pipeline_steps = [('imputer', imputer), ('bool_forcer', missingindicator_bool_forcer)]
    if verbose:
        pipeline_steps.append(('debug_after_imputer', DebugTransformer("After Imputer")))
    pipeline_steps.append(('normalizer', normalizer))
    if verbose:
        pipeline_steps.append(('debug_after_normalizer', DebugTransformer("After Normalizer")))
    return Pipeline(pipeline_steps, verbose=verbose)
# Public API of this module. Every name listed here must be defined in this
# file, otherwise `from <module> import *` raises AttributeError.
__all__ = ['build_standard_normalizer']