본문 바로가기
First step/AI 기초반

[TIL]21.07.16 케라스 실습 선형회귀 데이터

by Joshua21 2021. 7. 17.

#선형회귀 적용하기 1.보스턴 집값 예측

 

# Linear regression example 1: predicting Boston house prices.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import tensorflow as tf

# Fix random seeds so the run is reproducible.
seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

# Load the housing dataset (whitespace-separated, no header row)
# and shuffle the rows.
prehouse = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/housing.csv',
                       delim_whitespace=True, header=None)
house = prehouse.sample(frac=1)

# First 13 columns are features, the 14th column is the target price.
values = house.values
X = values[:, :13]
Y = values[:, 13]
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.3, random_state=seed)

# Regression model: no activation on the output layer (we need a raw
# real-valued prediction, not a class probability), MSE loss.
model = Sequential([
    Dense(30, input_dim=13, activation='relu'),
    Dense(6, activation='relu'),
    Dense(1),
])
model.compile(loss='mean_squared_error', optimizer='adam')

model.fit(X_train, Y_train, epochs=200, batch_size=10)

# Compare predicted vs. actual prices for the first 10 test rows.
y_prediction = model.predict(X_test).flatten()
for idx in range(10):
    print('실제 가격: {:.3f}, 예상가격:{:.3f}'.format(Y_test[idx], y_prediction[idx]))

 

이항/다항 분류 학습에서는 accuracy, val_accuracy, loss, val_loss 를 모두 확인 가능

선형회귀 학습에서는 loss 와 val_loss 만 확인 가능 (accuracy는 분류 지표이므로 의미가 없음)

 

#선형회귀 적용하기 1.보스턴 집값 예측

 

# Linear regression example 1 (revisited): Boston house prices, now with
# early stopping and best-model checkpointing on the validation loss.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Fix random seeds for reproducibility.
seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

# Load the housing dataset (whitespace-separated, no header) and shuffle.
prehouse = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/housing.csv',
                       delim_whitespace=True, header=None)
house = prehouse.sample(frac=1)

# Split data: 13 feature columns, 1 target column (price).
dataset = house.values
X = dataset[:, 0:13]
Y = dataset[:, 13]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# Regression model: no activation on the output layer, MSE loss.
model = Sequential()
model.add(Dense(30, input_dim=13, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Stop training once val_loss has not improved for 50 epochs.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=50)
# Save the model only when val_loss improves.
modelpath = './model/{epoch:02d}-{val_loss:.4f}.hdf5'
Checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss',
                               verbose=1, save_best_only=True)

# BUG FIX: batch_size was 0, which is invalid and crashes fit();
# use 10 as in the other runs of this notebook.
history = model.fit(X_train, Y_train, validation_split=0.33, epochs=500,
                    batch_size=10,
                    callbacks=[early_stopping_callback, Checkpointer])

# Keep the History object intact; build a DataFrame copy only for display.
# BUG FIX: the original rebound `history` to the DataFrame and then used
# `history.history=['val_loss']` — a chained ASSIGNMENT, not a lookup —
# so the loss curves were never actually extracted.
hist_df = pd.DataFrame(history.history)
print(hist_df.tail())

y_vloss = history.history['val_loss']
y_loss = history.history['loss']

# Plot training vs. validation loss.
# BUG FIX: labels were swapped relative to the data being plotted.
x_len = np.arange(len(y_loss))
plt.figure(figsize=(10, 5))
plt.plot(x_len, y_loss, 'o', c='violet', markersize=3, label='loss')
plt.plot(x_len, y_vloss, 'o', c='springgreen', markersize=3, label='val_loss')
plt.legend()
plt.show()

# Compare predicted vs. actual prices for the first 10 test rows.
Y_prediction = model.predict(X_test).flatten()
for i in range(10):
    label = Y_test[i]
    prediction = Y_prediction[i]
    print('실제 가격: {:.3f}, 예상가격:{:.3f}'.format(label, prediction))



정규화

머신러닝 알고리즘은 데이터가 가진 특성들을 비교하여 데이터의 패턴을 찾음. 데이터가 가진 특성의 스케일 차이가 심하게 나는 경우 문제가 발생

예를 들어 주택에 관한 정보가 담긴 데이터에서 특성으로 방의 개수(10개 미만의 차이), 얼마나 오래전에 지어졌는지(길게는 몇백 년) 같은 것들이 포함될 수 있음

 

모든 특성들이 비슷한 영향력을 행사하도록 값을 변환해주는 기술

 

단위가 다르면 직접적인 비교가불가능, 동일한 단위를 사용하더라도 값의 범위가 크게 차이나는상황에서는 비교가어려움

(10점만점의 9점,100점만점의 9점)

 

min-max 정규화

가장 일반적인 방법. 모든 특성의 최소값을 0, 최대값을 1로 변환

 

outlier에 대한 고민이 필요

통계적인 기법을적용하면 개선되기는 하나 영향을 제거할수없음

따라서 모든스케일러 처리 전에는 outlier 제거가 선행되어야함

 

z-score 정규화

outlier문제를 피하는 데이터 정규화 전략

x라는 값을 z 점수로 바꿔줌 

(x-평균)/표준편차

 

#선형회귀 적용하기 1.보스턴 집값 예측

 

# Linear regression example 1: Boston house prices, with z-score
# normalization of the features before training.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Fix random seeds for reproducibility.
seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

# Load the housing dataset (whitespace-separated, no header) and shuffle.
prehouse = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/housing.csv',
                       delim_whitespace=True, header=None)
house = prehouse.sample(frac=1)

# Split data: 13 feature columns, 1 target column (price).
dataset = house.values
X = dataset[:, 0:13]
Y = dataset[:, 13]

# Box plot of the raw data to inspect the feature scales before normalizing.
house.plot.box(figsize=(10, 10))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)
plt.boxplot(X_train)

# z-score normalization: (x - mean) / std.
# The TRAINING statistics are applied to the test set too, so no
# information leaks from the test data.
mean = X_train.mean(axis=0)
X_train -= mean
std = X_train.std(axis=0)
X_train /= std
X_test -= mean
X_test /= std
# Visual check after normalization.
plt.boxplot(X_train)

# Regression model: no activation on the output layer, MSE loss.
model = Sequential()
model.add(Dense(30, input_dim=13, activation='relu'))
model.add(Dense(6, activation='relu'))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')

# Stop training once val_loss has not improved for 50 epochs.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=50)
# Save the model only when val_loss improves.
modelpath = './model/{epoch:02d}-{val_loss:.4f}.hdf5'
Checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss',
                               verbose=1, save_best_only=True)

history = model.fit(X_train, Y_train, validation_split=0.33, epochs=500,
                    batch_size=10,
                    callbacks=[early_stopping_callback, Checkpointer])

hist = pd.DataFrame(history.history)
print(hist.tail())

# BUG FIX: the original used `history.history=['val_loss']` — a chained
# ASSIGNMENT that replaced the history dict with the literal list
# ['val_loss'] — instead of indexing into it.
y_vloss = history.history['val_loss']
y_loss = history.history['loss']

# Plot training vs. validation loss.
# BUG FIX: labels were swapped relative to the data being plotted.
x_len = np.arange(len(y_loss))
plt.figure(figsize=(10, 5))
plt.plot(x_len, y_loss, 'o', c='violet', markersize=3, label='loss')
plt.plot(x_len, y_vloss, 'o', c='springgreen', markersize=3, label='val_loss')
plt.legend()
plt.show()

# Compare predicted vs. actual prices for the first 10 test rows.
Y_prediction = model.predict(X_test).flatten()
for i in range(10):
    label = Y_test[i]
    prediction = Y_prediction[i]
    print('실제 가격: {:.3f}, 예상가격:{:.3f}'.format(label, prediction))

# Overlay the full prediction curve on the actual prices.
plt.figure(figsize=(20, 10))
plt.plot(Y_prediction, c='red')
plt.plot(Y_test, c='blue')

# Mean absolute error over the whole test set.
avr = abs(Y_prediction - Y_test).mean()
print(avr)

 

#선형회귀 연습문제 주가 변동 추이 정답

 

# Linear regression exercise: predicting daily stock closing prices.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

path = '/content/drive/MyDrive/Colab Notebooks/dataset/'
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/data-02-stock_daily.csv', header=1)

# Visualize the raw series: prices, volume, and a 7-day close-up.
fig = plt.figure(figsize=(30, 20))

ax1 = fig.add_subplot(3, 1, 1)
ax2 = fig.add_subplot(3, 1, 2)
ax3 = fig.add_subplot(3, 1, 3)

ax1.plot(data['Open'])
ax1.plot(data['High'])
ax1.plot(data['Low'])
ax1.plot(data['Close'])
ax2.plot(data['Volume'])
ax3.plot(data['Open'][0:7], linewidth=3.0, label='open')
ax3.plot(data['High'][0:7], linewidth=3.0, label='high')
ax3.plot(data['Low'][0:7], linewidth=3.0, label='low')
ax3.plot(data['Close'][0:7], linewidth=3.0, label='close')

ax3.legend(prop={'size': 30})

# Features: open/high/low/volume. Target: closing price.
xdata = data[['Open', 'High', 'Low', 'Volume']]
ydata = pd.DataFrame(data['Close'])
xdata.plot.box(figsize=(5, 5))
plt.show()

# z-score normalization via scikit-learn.
xdata_ss = StandardScaler().fit_transform(xdata)
ydata_ss = StandardScaler().fit_transform(ydata)
print(xdata_ss.shape, ydata_ss.shape)
plt.boxplot(xdata_ss)
plt.show()

# Split into training and test sets.
# BUG FIX: the original sliced [220:,:] for BOTH train and test, so the
# model was evaluated on its own training data. Train on the first 220
# rows and hold out the remainder for testing.
xtrain = xdata_ss[:220, :]
xtest = xdata_ss[220:, :]
ytrain = ydata_ss[:220, :]
ytest = ydata_ss[220:, :]

print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

# BUG FIX: seed BEFORE building the model, otherwise the weight
# initialization is not reproducible.
seed = 123
np.random.seed(seed)
tf.random.set_seed(seed)

# Deep funnel of Dense layers down to the single regression output.
model = Sequential()
model.add(Dense(units=1024, input_dim=4, activation='relu'))
model.add(Dense(units=512, activation='relu'))
model.add(Dense(units=256, activation='relu'))
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

# Stop once the training MAE has not improved for 10 epochs.
# NOTE(review): there is no validation data here, so this monitors the
# TRAINING metric — consider validation_split + monitor='val_mae'.
es = EarlyStopping(monitor='mae', patience=10)

hist = model.fit(xtrain, ytrain, epochs=100, batch_size=16, callbacks=[es])

print('loss' + str(hist.history['loss']))
print('MAE' + str(hist.history['mae']))

# Evaluate on the held-out test rows.
res = model.evaluate(xtest, ytest, batch_size=32)
print('loss', res[0], 'mae', res[1])

# Plot predicted vs. actual (normalized) closing prices.
xhat = xtest
yhat = model.predict(xhat)
plt.figure()
plt.plot(yhat, label='predicted')
plt.plot(ytest, label='actual')
plt.legend(prop={'size': 20})
# BUG FIX: typo 'Evalueate' in the output message.
print('Evaluate : {}'.format(np.average((yhat - ytest) ** 2)))

 

 

'First step > AI 기초반' 카테고리의 다른 글

[TIL] 21.07.19pandas로 웹크롤링  (0) 2021.07.19
[TIL]21.07.19 비지도학습  (0) 2021.07.19
[TIL]21.07.15과적합  (0) 2021.07.15
[TIL]21.07.14keras로 실습하기  (0) 2021.07.14
[TIL]21.07.13 웹크롤링  (0) 2021.07.13