-
Notifications
You must be signed in to change notification settings - Fork 0
/
CaseStudy
181 lines (136 loc) · 6.59 KB
/
CaseStudy
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
##1 Generate the dataset based on the variables of the "Verdienststrukturerhebung 2018"
# First, generate the dataset used in the imputation case study
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression
# Re-create the original_data DataFrame with missing values.
# (Needed because the code execution state was reset.)
np.random.seed(0)  # Seed for reproducibility

# Generate 'Age' data
age_data = np.random.randint(18, 70, 500)
# Generate 'Income' data with a linear relationship to 'Age'
income_data = 1000 * age_data + np.random.normal(0, 10000, 500)


def introduce_missing_data_at_random(data, missing_percentage, columns=None):
    """Set a random share of the cells of *data* to NaN, in place.

    Args:
        data: DataFrame to modify.
        missing_percentage: Share of the eligible cells (0-100) to blank out.
        columns: Column labels eligible for blanking. ``None`` (the default)
            means every column of *data*.

    Raises:
        ValueError: If *missing_percentage* is outside the range 0-100.

    The cells are drawn without replacement from NumPy's global random
    generator, so seed it beforehand for reproducible results.
    """
    if not 0 <= missing_percentage <= 100:
        raise ValueError("missing_percentage must be between 0 and 100")

    target_columns = list(data.columns) if columns is None else list(columns)
    n_rows = len(data)
    total_data_points = n_rows * len(target_columns)
    total_missing = int(total_data_points * missing_percentage / 100)
    if total_missing == 0:
        return

    missing_indices = np.random.choice(total_data_points, total_missing, replace=False)
    row_indices, col_indices = np.unravel_index(
        missing_indices, (n_rows, len(target_columns))
    )

    # Assign column by column: writing through ``data.values`` never reaches
    # the frame when the columns have different dtypes, because ``.values``
    # then hands back a consolidated copy.
    for position, column in enumerate(target_columns):
        hide = np.zeros(n_rows, dtype=bool)
        hide[row_indices[col_indices == position]] = True
        if not hide.any():
            continue
        # ``mask`` upcasts integer columns so that they can hold NaN.
        data[column] = data[column].mask(hide)


original_data = pd.DataFrame({
    'Income': income_data,
    'Age': age_data,
})
# Blank out 15% of 'Income' only; 'Age' stays complete so that it can serve
# as the predictor for the imputation methods.
introduce_missing_data_at_random(original_data, 15, columns=['Income'])
##2 Scatter plot to have a first look at the data
# Income vs. Age from the original_data DataFrame, to show the linear
# relationship between the two variables. Only rows where both values are
# present can be drawn, so they are selected explicitly.
complete_rows = original_data.dropna(subset=['Age', 'Income'])
plt.figure(figsize=(10, 6))
plt.scatter(complete_rows['Age'], complete_rows['Income'], alpha=0.6, s=80)
plt.title('Scatter Plot of Age vs. Income')
plt.xlabel('Age')
plt.ylabel('Income')
plt.grid(True)
plt.show()
##3 Set up imputations for the missing 'Income' values
# Three imputation techniques are computed: mean, k-nearest neighbours and
# linear regression. Each one produces a completed copy of 'Income'.

# Set up the imputers for mean imputation and kNN
mean_imputer = SimpleImputer(strategy='mean')  # For mean imputation
knn_imputer = KNNImputer(n_neighbors=5)  # For KNN imputation

# Mean imputation (uses 'Income' only)
original_data['Income_Mean'] = mean_imputer.fit_transform(
    original_data[['Income']]
).ravel()

# kNN imputation: 'Age' has to be part of the input, otherwise rows with a
# missing 'Income' have nothing to measure a distance on and the imputer
# falls back to the column mean.
knn_completed = knn_imputer.fit_transform(original_data[['Age', 'Income']])
original_data['Income_KNN'] = knn_completed[:, 1]

# Linear regression imputation: start from the observed values and overwrite
# only the gaps with the model's predictions.
income_is_missing = original_data['Income'].isnull()
original_data['Income_LR'] = original_data['Income']
train_data = original_data[~income_is_missing]
missing_data = original_data[income_is_missing]

# Check if there are missing values to be imputed
if not missing_data.empty:
    # Fit 'Income' ~ 'Age' on the rows where 'Income' is observed
    lr_model = LinearRegression()
    lr_model.fit(train_data[['Age']], train_data['Income'])
    # Predict the missing 'Income' values and fill them in
    predicted_income = lr_model.predict(missing_data[['Age']])
    original_data.loc[income_is_missing, 'Income_LR'] = predicted_income
else:
    # Nothing to impute: 'Income_LR' is simply a copy of 'Income'
    print("No missing 'Income' data to impute using Linear Regression.")

# Display the head of the DataFrame (rendered when run as a notebook cell)
original_data[['Income', 'Income_Mean', 'Income_KNN', 'Income_LR']].head()
# Apply simple mean imputation
mean_imputer = SimpleImputer(strategy='mean')
original_data['Income_Mean'] = mean_imputer.fit_transform(
    original_data[['Income']]
).ravel()

# Scatter plot to visualize the original and mean imputed data
imputed_rows = original_data['Income'].isnull()
plt.figure(figsize=(12, 6))
# Original (observed) data points
plt.scatter(original_data.loc[~imputed_rows, 'Age'],
            original_data.loc[~imputed_rows, 'Income'],
            alpha=0.5, label='Original Data', color='dodgerblue')
# Mean imputed data points: only the rows that were actually filled in, so
# they are not hidden among re-plotted observed points.
plt.scatter(original_data.loc[imputed_rows, 'Age'],
            original_data.loc[imputed_rows, 'Income_Mean'],
            alpha=0.5, label='Mean Imputed Data', color='orange')
plt.title('Comparison of Original and Mean Imputed Income Data')
plt.xlabel('Age')
plt.ylabel('Income')
plt.legend()
plt.grid(True)
plt.show()
##Improved/new version:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Build the synthetic data set: 500 ages drawn from 18..69 and an income that
# grows linearly with age plus Gaussian noise. The fixed seed keeps the draws
# reproducible.
np.random.seed(0)
age_data = np.random.randint(low=18, high=70, size=500)
income_data = np.random.normal(loc=0, scale=10000, size=500) + age_data * 1000
original_data = pd.DataFrame(dict(Age=age_data, Income=income_data))
def introduce_missing_data_at_random(data, missing_percentage, column='Income', seed=0):
    """Set a random share of one column of *data* to NaN, in place.

    Args:
        data: DataFrame to modify.
        missing_percentage: Share of the rows (0-100) whose value in *column*
            is replaced by NaN.
        column: Label of the column to blank out. Defaults to 'Income'.
        seed: Seed for a private random generator; the default of 0 selects
            the same rows as seeding NumPy's global generator with 0 would.
            Pass ``None`` to draw from the global generator instead.
    """
    # A private generator keeps the selection reproducible without resetting
    # the global NumPy random state behind the caller's back.
    rng = np.random if seed is None else np.random.RandomState(seed)
    n_missing = int(len(data) * missing_percentage / 100)
    missing_indices = rng.choice(data.index, n_missing, replace=False)
    data.loc[missing_indices, column] = np.nan
# Blank out 15% of the 'Income' values
introduce_missing_data_at_random(original_data, 15)

income_is_missing = original_data['Income'].isnull()

# Simple Mean Imputation (uses 'Income' only)
mean_imputer = SimpleImputer(strategy='mean')
original_data['Income_Mean'] = mean_imputer.fit_transform(
    original_data[['Income']]
).ravel()

# Simple Regression Imputation: fit Income ~ Age on the observed rows and
# predict the gaps.
linreg = LinearRegression()
linreg.fit(original_data.loc[~income_is_missing, ['Age']],
           original_data.loc[~income_is_missing, 'Income'])
original_data['Income_Reg'] = original_data['Income']
reg_pred = linreg.predict(original_data.loc[income_is_missing, ['Age']])
original_data.loc[income_is_missing, 'Income_Reg'] = reg_pred

# Simple kNN Imputation: 'Age' must be part of the input so that neighbours
# can be found; with 'Income' alone the imputer falls back to the column mean.
knn_imputer = KNNImputer(n_neighbors=5)
original_data['Income_KNN'] = knn_imputer.fit_transform(
    original_data[['Age', 'Income']]
)[:, 1]

# Iterative imputation with a Random Forest: again 'Age' is the predictor.
# With a single input column the imputer only returns its initial mean fill
# and the forest is never used. The forest is seeded for reproducibility.
rf_imputer = IterativeImputer(
    estimator=RandomForestRegressor(n_estimators=10, random_state=0),
    random_state=0,
    max_iter=10,
    initial_strategy='mean',
)
original_data['Income_RF'] = rf_imputer.fit_transform(
    original_data[['Age', 'Income']]
)[:, 1]
# Plotting the results: one panel per imputation method. Observed values are
# drawn in blue; only the rows whose 'Income' was imputed are drawn in red, so
# the imputed points stay distinguishable from the observed ones.
imputed_rows = original_data['Income'].isnull()
plt.figure(figsize=(20, 5))
for i, method in enumerate(['Income_Mean', 'Income_Reg', 'Income_KNN', 'Income_RF'], 1):
    plt.subplot(1, 4, i)
    sns.scatterplot(data=original_data, x='Age', y='Income',
                    color='blue', alpha=0.5, label='Observed')
    sns.scatterplot(data=original_data[imputed_rows], x='Age', y=method,
                    color='red', alpha=0.5, label='Imputed')
    plt.title(f'Scatter Plot with {method.split("_")[1]} Imputation')
    plt.xlabel('Age')
    plt.ylabel('Income')
    plt.legend()
plt.tight_layout()
plt.show()
# Variance of the raw 'Income' column and of each completed version
variances = {}
for method in ('Income', 'Income_Mean', 'Income_Reg', 'Income_KNN', 'Income_RF'):
    variances[method] = original_data[method].var()
print(variances)

# Side-by-side boxplots of the same five columns
plt.figure(figsize=(12, 6))
sns.boxplot(
    data=original_data.loc[
        :, ['Income', 'Income_Mean', 'Income_Reg', 'Income_KNN', 'Income_RF']
    ]
)
plt.title('Boxplot of Imputation Methods')
plt.show()