https://jungh0.github.io/beijing-house-scikit-learn/map.html
Housing prices in Beijing from 2011 to 2017.
# Clean the numeric columns that are stored as text
data = clean_value(data, 'floor')
data = clean_value(data, 'constructionTime')
# Drop rows containing NaN values
data_dnan = dropnan(data)
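The dropnan helper is not defined in this excerpt; a minimal sketch of what it plausibly does, assuming it simply wraps pandas' DataFrame.dropna:

import pandas as pd

def dropnan(D):
    # Hypothetical helper: drop every row containing a NaN value and
    # reset the index so later concatenations align cleanly.
    return D.dropna().reset_index(drop=True)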
Remove the meaningless words and keep only the floor number.
def clean_value(D, col):
    # Extract the leading digits (e.g. the floor number) from the text column
    extracted = D[col].str.extract(r'(\d+)')
    extracted.columns = [col + '_new']
    D = D.drop(columns=[col])                  # drop the original text column
    return pd.concat([D, extracted], axis=1)   # append the cleaned column
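As a quick check, clean_value applied to a toy frame (the values below are invented for illustration) keeps only the digits:

sample = pd.DataFrame({'floor': ['middle 11', 'top 22', 'bottom 6']})
sample = clean_value(sample, 'floor')
print(sample)  # a single 'floor_new' column: 11, 22, 6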
def one_hot_data(D, c_str):
    # One-hot encode a categorical column via pandas get_dummies
    D[c_str] = D[c_str].astype(str)
    dummies = pd.get_dummies(D[[c_str]])
    D = D.drop(columns=[c_str])              # drop the original categorical column
    return pd.concat([D, dummies], axis=1)   # append the indicator columns
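Applied, for instance, to a categorical column such as district (the example values are made up):

sample = pd.DataFrame({'district': [1, 7, 7, 13]})
sample = one_hot_data(sample, 'district')
print(list(sample.columns))  # ['district_1', 'district_13', 'district_7']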
After Preprocessing
Dataset size: 127670 rows
Split into a 102137-row training set and a 25534-row test set
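Those set sizes correspond to roughly an 80/20 split. A minimal sketch of how it could be produced, assuming a feature matrix X and price target y have already been separated (the exact arguments are an assumption, not taken from the original):

from sklearn.model_selection import train_test_split

# Hypothetical split; a 20% test fraction matches the reported test-set size.
train_set_x, test_set_x, train_set_y, test_set_y = train_test_split(
    X, y, test_size=0.2, random_state=531)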
The Random Forest Regressor is used because it yields the smallest MSE.
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

def find_estimators(train_set_x, train_set_y, test_set_x, test_set_y):
    # Train random forests over a range of ensemble sizes to observe how the MSE changes
    mseOos = []
    nTreeList = range(10, 500, 100)
    for iTrees in nTreeList:
        depth = None
        maxFeat = None  # try 4 as well -- a parameter worth tuning
        wineRFModel = RandomForestRegressor(n_estimators=iTrees, max_depth=depth,
                                            max_features=maxFeat, oob_score=False,
                                            random_state=531)
        wineRFModel.fit(train_set_x, train_set_y)
        # Accumulate the MSE on the test set
        prediction = wineRFModel.predict(test_set_x)
        mseOos.append(mean_squared_error(test_set_y, prediction))
        print("MSE")
        print(mseOos[-1])
    # Plot the test error against the number of trees in the ensemble
    plt.plot(nTreeList, mseOos)
    plt.xlabel('Number of Trees in Ensemble')
    plt.ylabel('Mean Squared Error')
    plt.show()
def find_max_depth(train_set_x, train_set_y, test_set_x, test_set_y):
    # Train random forests over a range of tree depths to observe how the MSE changes
    mseOos = []
    depthList = range(10, 50, 1)
    for depth in depthList:
        maxFeat = None
        wineRFModel = RandomForestRegressor(n_estimators=30, max_depth=depth,
                                            max_features=maxFeat, oob_score=False,
                                            random_state=531)
        wineRFModel.fit(train_set_x, train_set_y)
        # Accumulate the MSE on the test set
        prediction = wineRFModel.predict(test_set_x)
        mseOos.append(mean_squared_error(test_set_y, prediction))
        print("MSE")
        print(mseOos[-1])
    # Plot the test error against max_depth
    plt.plot(depthList, mseOos)
    plt.xlabel('max_depth of Trees in Ensemble')
    plt.ylabel('Mean Squared Error')
    plt.show()
def find_max_features(train_set_x, train_set_y, test_set_x, test_set_y):
    # Train random forests over a range of max_features values to observe how the MSE changes
    mseOos = []
    featList = range(1, 23, 1)
    for maxFeat in featList:
        wineRFModel = RandomForestRegressor(n_estimators=30, max_depth=17,
                                            max_features=maxFeat, oob_score=False,
                                            random_state=531)
        wineRFModel.fit(train_set_x, train_set_y)
        # Accumulate the MSE on the test set
        prediction = wineRFModel.predict(test_set_x)
        mseOos.append(mean_squared_error(test_set_y, prediction))
        print("MSE")
        print(mseOos[-1])
    # Plot the test error against max_features
    plt.plot(featList, mseOos)
    plt.xlabel('max_features per Split')
    plt.ylabel('Mean Squared Error')
    plt.show()
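Once the three searches settle, a final model can be fit with the selected values. A minimal sketch, assuming n_estimators=30 and max_depth=17 from the searches above; max_features=8 is a placeholder for whichever value the find_max_features search actually selects:

# Hypothetical final fit with the hyperparameters found above.
final_model = RandomForestRegressor(n_estimators=30, max_depth=17,
                                    max_features=8, random_state=531)
final_model.fit(train_set_x, train_set_y)
print(mean_squared_error(test_set_y, final_model.predict(test_set_x)))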