# learning_python:: 선형회귀 (linear regression)

# basic import statements

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 데이터 읽기 (reading the data)

# Load the raw table, then separate it positionally: every column except the
# last becomes the feature matrix `x`, and the last column becomes the
# target vector `y`. Both are plain NumPy arrays (via `.values`).
dataset = pd.read_csv('./input/LinearRegressionData.csv')
y = dataset.iloc[:, -1].values
x = dataset.iloc[:, :-1].values
# 데이터 전처리 01 (data preprocessing 01): one-hot encoding
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the feature at column index 2 (0-based, i.e. the third
# column). drop='first' omits the first category of that feature so the
# resulting dummy columns are not redundant; every other column is passed
# through unchanged.
_encoder_step = ('encoder', OneHotEncoder(drop='first'), [2])
ct = ColumnTransformer(
    transformers=[_encoder_step],
    remainder='passthrough',
)
x = ct.fit_transform(x)
# 데이터 전처리 02 (data preprocessing 02): 훈련/검증 데이터 나누기 (train/test split)
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state=1 makes the shuffle
# (and therefore the split) reproducible between runs.
_split = train_test_split(x, y, test_size=0.2, random_state=1)
x_train, x_test, y_train, y_test = _split

# 01. linear regression

from sklearn.linear_model import LinearRegression

# Ordinary least-squares fit on the training split. `fit` returns the
# estimator itself, so construction and fitting can be chained.
reg = LinearRegression().fit(x_train, y_train)

# 02. Gradient Descent, Stochastic GD

from sklearn.linear_model import SGDRegressor

# Hyperparameters for the stochastic-gradient-descent regressor:
#   max_iter     - upper bound on the number of passes (epochs) over the data
#   eta0         - initial learning rate
#   random_state - fixed seed so the run is reproducible
#   verbose      - 1 prints per-epoch progress logs
# NOTE(review): the features are not standardized before this fit; SGD is
# sensitive to feature scale - confirm this is intended.
_sgd_params = {
    'max_iter': 30000,
    'eta0': 1e-4,
    'random_state': 0,
    'verbose': 1,
}
sr = SGDRegressor(**_sgd_params)
sr.fit(x_train, y_train)

# model evaluation: goodness-of-fit index, R-squared

from sklearn.metrics import r2_score

# Coefficient of determination (R^2) of the linear model on the training
# split. Left as a bare expression so a notebook cell displays the value.
_train_pred = reg.predict(x_train)
r2_score(y_train, _train_pred)

# model evaluation: prediction performance index, MAE, MSE, RMSE

from sklearn.metrics import mean_absolute_error, mean_squared_error

# Predictions of the linear model on the held-out test split; every metric
# below compares them against the true targets in y_test.
y_pred = reg.predict(x_test)
# MAE (Mean Absolute Error)
mean_absolute_error(y_test, y_pred)
# MSE (Mean Squared Error)
mean_squared_error(y_test, y_pred)
# RMSE (Root Mean Squared Error)
# Computed as the square root of MSE rather than with `squared=False`:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where
# passing it raises TypeError. The explicit root works on every version.
mean_squared_error(y_test, y_pred) ** 0.5

# simple visualization

# Predicted-vs-actual plot for the linear model.
#
# After one-hot encoding, the feature matrix has several columns, so it
# cannot be used directly as the x-axis: plt.scatter(x_train, y_train)
# raises ValueError because x and y differ in size. Plotting the prediction
# against the true target works for any number of features; points on the
# red diagonal are predicted exactly.
_train_pred = reg.predict(x_train)
_test_pred = reg.predict(x_test)

plt.scatter(y_train, _train_pred, color='blue', label='train')
plt.scatter(y_test, _test_pred, color='green', label='test')

# Reference line y = x spanning the full range of observed targets.
_lo = min(y_train.min(), y_test.min())
_hi = max(y_train.max(), y_test.max())
plt.plot([_lo, _hi], [_lo, _hi], color='red', label='ideal fit')

plt.xlabel('actual')
plt.ylabel('predicted')
plt.legend()
plt.show()