依前一篇的 LOF 將離群值刪除後,再重新進行訓練及預測,結果可以將分數拉到 0.72。底下為完整代碼:
"""Linear regression on Boston housing data after LOF outlier removal.

Reads the PRICE, LSTAT and RM columns from ``boston.xlsx``, drops the rows
that LocalOutlierFactor does not label as inliers, then fits a
LinearRegression on (cube root of LSTAT, RM) and prints the test-set score
followed by (predicted, actual) price pairs.
"""
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor as LOF

# Lift pandas' display limits so printed frames are never truncated.
display = pd.options.display
display.max_columns = None
display.max_rows = None
display.width = None
display.max_colwidth = None

df = pd.read_excel(
    "boston.xlsx", sheet_name='Sheet1', index_col=0, keep_default_na=False
)
df = df[['PRICE', 'LSTAT', 'RM']]
x = df[['LSTAT', 'RM']]
y = df['PRICE']

# Outlier detection over the joint (LSTAT, RM, PRICE) space.
lof = LOF(n_neighbors=20, contamination='auto')
lof_labels = lof.fit_predict(np.c_[x, y])

# Keep only the rows labelled 1 (inliers in scikit-learn's LOF convention).
# A boolean mask is used instead of np.where(...) positions because .loc is
# label-based: positional indices only match when the index read from the
# Excel file happens to be exactly 0..n-1.
inlier_mask = lof_labels == 1
df = pd.DataFrame(
    data=df.loc[inlier_mask].values, columns=['PRICE', 'LSTAT', 'RM']
)
print(df.shape)

# Features: cube root of LSTAT plus RM unchanged.
# (Use df[['LSTAT', 'RM']] directly to skip the cube-root transform.)
data = np.c_[df['LSTAT'] ** (1 / 3), df['RM']]
x = pd.DataFrame(data=data, columns=['LSTAT', 'RM'])
y = df['PRICE']

# Split the data: 80% training, 20% testing.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5
)

model = LinearRegression()  # linear regression model
model.fit(x_train, y_train)  # training; can take a long time on large data

# The closer the score is to 1, the more accurate the model.
print(f'分數 : {model.score(x_test, y_test)}')

# Predict prices for the test set and print (predicted, actual) pairs.
predicted_prices = model.predict(x_test)
for pair in zip(predicted_prices, y_test):
    print(pair)

# Result reported by the author for the original script (first lines):
#   (480, 3)
#   分數 : 0.7243778618653828
#   (np.float64(2.62241110725126), 13.8)
#   (np.float64(13.651942482381545), 14.0)
#   (np.float64(25.771022346404653), 25.1)
#   (np.float64(21.86752945730331), 24.0)