저번 시간에 마지막으로 했던 실습 문제는 버섯 데이터 분류인데, 이항분류와 다항분류로 풀이가 가능했다.
# Binary classification of the mushroom dataset.
import pandas as pd

mushrooms = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/mushrooms.csv')
print(mushrooms)

# Label encoding: LabelEncoder handles one column at a time, so loop over columns.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for col in mushrooms.columns:
    # Bug fix: this line was flush-left in the original paste (SyntaxError).
    mushrooms[col] = labelencoder.fit_transform(mushrooms[col])

import matplotlib.pyplot as plt

y = mushrooms['class'].values           # target: only the 'class' column
x = mushrooms.drop(['class'], axis=1)   # features: every column except 'class'
x = x.values

# (normalization could be inserted here)
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
# Split into train data and test data (20%).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
print('x_train.shape=', x_train.shape)
print('x_test.shape=', x_test.shape)
print('y_train.shape=', y_train.shape)
print('y_test.shape=', y_test.shape)
print(y_test[0:5])

# 22 encoded features in, one sigmoid unit out (edible vs. poisonous).
model = Sequential()
model.add(Dense(48, input_dim=22, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=25, batch_size=32)
model.summary()

# Validate the model on the held-out test set.
model.evaluate(x_test, y_test)
# Multi-class classification of the mushroom dataset: the binary labels are
# one-hot encoded so the same problem can be solved with softmax.
import pandas as pd

mushrooms = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/mushrooms.csv')
print(mushrooms)

# Label encoding: LabelEncoder handles one column at a time, so loop over columns.
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
for col in mushrooms.columns:
    # Bug fix: this line was flush-left in the original paste (SyntaxError).
    mushrooms[col] = labelencoder.fit_transform(mushrooms[col])

import matplotlib.pyplot as plt

y = mushrooms['class'].values           # target: only the 'class' column
x = mushrooms.drop(['class'], axis=1)   # features: every column except 'class'
x = x.values

# (normalization could be inserted here)
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
# Split into train data and test data (20%).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
print('x_train.shape=', x_train.shape)
print('x_test.shape=', x_test.shape)
print('y_train.shape=', y_train.shape)
print('y_test.shape=', y_test.shape)

# Convert the binary labels into one-hot (multi-class) vectors.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)
print("===== to_categorical 이후=====")
print('y_train.shape=', y_train.shape)
print('y_test.shape=', y_test.shape)
print(y_test[0:5])

# Two softmax outputs + categorical_crossentropy instead of one sigmoid unit.
model = Sequential()
model.add(Dense(48, input_dim=22, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(x_train, y_train, epochs=25, batch_size=32)

# Validate the model on the held-out test set.
model.evaluate(x_test, y_test)
# Exercise: classify the sonar (rock vs. mine) data as a multi-class problem.
import pandas as pd

sonar_frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/sonar.csv', header=None)
table = sonar_frame.values
features = table[:, 0:60].astype(float)   # 60 sonar readings per sample
label_text = table[:, 60]                 # class label as a string

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0

# Encode the string labels as integers, then one-hot encode them.
encoder = LabelEncoder()
encoder.fit(label_text)
label_int = encoder.transform(label_text)
label_onehot = tf.keras.utils.to_categorical(label_int)

x_train, x_test, y_train, y_test = train_test_split(
    features, label_onehot, test_size=0.2, random_state=seed)

# Small softmax network: 60 inputs -> 4 -> 8 -> 2 classes.
model = Sequential()
model.add(Dense(4, input_dim=60, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=25, batch_size=32)
result = model.evaluate(x_test, y_test)
print('\n Accuracy : %.4f'%(result[1]))
# Solving the rock/mine sonar problem as plain binary classification.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0

frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/sonar.csv', header=None)
table = frame.values
# astype(float) is required; without it model.fit raises
# "unsupported object type float".
inputs = table[:, 0:60].astype(float)
target_text = table[:, 60]

encoder = LabelEncoder()
encoder.fit(target_text)
targets = encoder.transform(target_text)

# One sigmoid output + binary_crossentropy. Note there is no train/test
# split here, so the evaluation below is on the training data itself.
model = Sequential()
model.add(Dense(24, input_dim=60, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
history = model.fit(inputs, targets, epochs=100, batch_size=5)

result = model.evaluate(inputs, targets)
print('\n Accuracy : %.4f'%(result[1]))
과적합
층이 너무 많거나 변수가 복잡해서 발생하기도 하고, 테스트셋과 학습셋이 중복될 때도 생기기도 함
학습하는 데이터셋과 테스트할 데이터셋을 완전히 구분한 다음 학습과 동시에 테스트를 진행
학습셋, 테스트셋, 실제 값에 예측해 보기
학습셋의 정확도는 높아져도 테스트셋의 오차함수를 통해 구해지는 에러가 커지면 과적합이 일어난 것
학습을 진행해도 테스트 결과가 더 이상 좋아지지 않는 지점에서 학습을 멈춰야 함. 이때의 학습 정도가 가장 적절한 것으로 볼 수 있음
테스트셋(validation), 학습셋(train), 새로운 데이터(test)
# Overfitting exercise: hold out 30% of the sonar data and compare
# train accuracy against test accuracy.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/sonar.csv', header=None)
table = frame.values
# astype(float) is required, otherwise model.fit fails with an
# "unsupported object type" error.
inputs = table[:, 0:60].astype(float)
target_text = table[:, 60]

encoder = LabelEncoder()
encoder.fit(target_text)
targets = encoder.transform(target_text)

# Separate the training set from the test set.
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.3, random_state=seed)

model = Sequential()
model.add(Dense(24, input_dim=60, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=200, batch_size=5)

print('\n Train Accuracy : %.4f'%(model.evaluate(x_train, y_train)[1]))
print('\n Test Accuracy : %.4f'%(model.evaluate(x_test, y_test)[1]))
# Iris exercise: multi-class classification with a train/test split to
# watch for overfitting.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

# 1. Load the data.
iris_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/dataset/iris.csv',
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])

# 2. Separate features from the target column.
rows = iris_df.values
features = rows[:, 0:4].astype(float)
species_text = rows[:, 4]

# Turn the species strings into integers, then into one-hot vectors.
encoder = LabelEncoder()
encoder.fit(species_text)
species_int = encoder.transform(species_text)
species_onehot = tf.keras.utils.to_categorical(species_int)

x_train, x_test, y_train, y_test = train_test_split(
    features, species_onehot, test_size=0.3, random_state=seed)

# 3. Build the model.
model = Sequential()
model.add(Dense(16, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))

# 4. Configure training: categorical_crossentropy for the three classes.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# 5. Train, then compare train accuracy with test accuracy.
model.fit(x_train, y_train, epochs=300)
print('\n Train Accuracy : %.4f'%(model.evaluate(x_train, y_train)[1]))
print('\n Test Accuracy : %.4f'%(model.evaluate(x_test, y_test)[1]))
loss: 머신러닝에서 훈련을 통해 목표를 잘 달성했는지를 나타내는 값을 잡고 그 값을 기준으로 훈련을 시행. 학습을 통해 직접적으로 줄이고자 하는 값을 손실(loss), 에러 혹은 코스트라고 표현
학습을 통해 목표를 얼마나 잘 달성했는지를 나타내는 척도(metric). accuracy는 척도의 하나
머신러닝의 최종 목표는 척도로 달성률을 표시하지만, 직접 척도를 낮추도록 훈련하는 것은 힘들기 때문에 손실을 줄이는 방향으로 훈련
# Practice: saving a trained model to disk and loading it back.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/sonar.csv', header=None)
table = frame.values
# Without astype(float), model.fit raises an "unsupported object type" error.
inputs = table[:, 0:60].astype(float)
target_text = table[:, 60]

encoder = LabelEncoder()
encoder.fit(target_text)
targets = encoder.transform(target_text)

# Separate the training set from the test set.
x_train, x_test, y_train, y_test = train_test_split(inputs, targets, test_size=0.3, random_state=seed)

model = Sequential()
model.add(Dense(24, input_dim=60, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, epochs=200, batch_size=5)

model.save('my_model.h5')   # persist the trained model to a file
del model                   # drop the in-memory model to prove reloading works

from tensorflow.keras.models import load_model
model = load_model('my_model.h5')   # bring the model back from disk

# Evaluate the reloaded model on the held-out test set.
print('\n Test Accuracy : %.4f'%(model.evaluate(x_test, y_test)[1]))
k겹 교차 검증
데이터셋을 여러 개로 나누어서 하나씩 테스트셋으로 사용하고 비교
데이터셋 1,2,3,4,5 → 학습 1234 테스트 5, 학습 2345 테스트 1 이런 식으로 교차 검증
# K-fold cross validation using sklearn's StratifiedKFold().
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

sona = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/sonar.csv', header=None)
dataset = sona.values
X = dataset[:, 0:60]
# If we wrote X = dataset[:, 0:60].astype(float) here, the
# tf.convert_to_tensor calls inside the loop would be unnecessary.
Y_obj = dataset[:, 60]

e = LabelEncoder()
e.fit(Y_obj)
Y = e.transform(Y_obj)   # string labels -> integer codes

# Split the data into 10 folds.
n_fold = 10
skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)

accuracy = []   # one accuracy entry per fold

# Build, compile, train and evaluate a fresh model for each fold.
# Bug fix: the loop body was flush-left in the original paste (SyntaxError).
for train, test in skf.split(X, Y):
    model = Sequential()
    model.add(Dense(24, input_dim=60, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # These conversions can be skipped when X is already astype(float).
    X_train = tf.convert_to_tensor(X[train], dtype=tf.float32)
    Y_train = tf.convert_to_tensor(Y[train], dtype=tf.float32)
    model.fit(X_train, Y_train, epochs=100, batch_size=5)
    X_test = tf.convert_to_tensor(X[test], dtype=tf.float32)
    Y_test = tf.convert_to_tensor(Y[test], dtype=tf.float32)
    k_accuracy = '%.4f'%(model.evaluate(X_test, Y_test)[1])
    accuracy.append(k_accuracy)

# Print the per-fold accuracies.
print('\n %.f fold accuracy : '%n_fold, accuracy)
# Wine dataset exercise: binary classification.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

raw_wine = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/wine.csv', header=None)
shuffled = raw_wine.sample(frac=1)   # frac=1 keeps 100% of the rows (shuffled)
# shuffled.shape is (6497, 13)
grid = shuffled.values
inputs = grid[:, 0:12]
targets = grid[:, 12]

model = Sequential()
model.add(Dense(30, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(inputs, targets, epochs=200, batch_size=200)
print('\n Accuracy : %.4f'%(model.evaluate(inputs, targets)[1]))
# Model checkpointing: record accuracy every epoch and save the model to an
# .hdf5 file whenever the validation loss improves.
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

wine_pre = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/wine.csv', header=None)
wine = wine_pre.sample(frac=1)   # frac=1 means take 100% of the rows (shuffled)
# print(wine.shape)  -> (6497, 13)
dataset = wine.values
X = dataset[:, 0:12]
Y = dataset[:, 12]

model = Sequential()
model.add(Dense(30, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

import os
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Create the checkpoint directory if it does not exist yet.
# Bug fix: the if body was flush-left in the original paste (SyntaxError).
MODEL_DIR = './model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# The file name records the epoch number and the validation loss at save time.
modelpath = './model/{epoch:02d}-{val_loss:.4f}.hdf5'
Checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=1, save_best_only=True)
model.fit(X, Y, validation_split=0.2, epochs=200, batch_size=200, verbose=0, callbacks=[Checkpointer])
print('\n Accuracy : %.4f'%(model.evaluate(X, Y)[1]))
callback 함수: 무엇인가 일을 다른 객체에게 시키고 그 일이 끝나는 것을 기다리는 것이 아니라, 객체가 나를 다시 부를 때까지 내 일을 하고 있는 것
non-block이며 비동기 방식의 함수로 사용된다는 것을 알 수 있음
모델 업데이트를 하면 에포크가 진행되면서 모든 값이 저장되는 것이 아니라, 테스트 오차를 실행한 결괏값이 향상되었을 때만 저장되는 것을 볼 수 있음
# Plotting practice 1: visualize validation loss and training accuracy.
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

wine_frame = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/wine.csv', header=None)
sampled = wine_frame.sample(frac=0.15)   # use a random 15% of the rows
values = sampled.values
features = values[:, 0:12]
labels = values[:, 12]

model = Sequential()
model.add(Dense(30, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# validation_split reserves a fraction of the training data for validation,
# so it has to be strictly between 0 and 1.
history = model.fit(features, labels, validation_split=0.33, epochs=3500, batch_size=500)

val_loss_curve = history.history['val_loss']    # validation error per epoch
train_acc_curve = history.history['accuracy']   # training accuracy per epoch

# Accuracy in blue, validation loss in red.
epochs_axis = np.arange(len(train_acc_curve))
plt.plot(epochs_axis, val_loss_curve, 'o', c='red', markersize=3)
plt.plot(epochs_axis, train_acc_curve, 'o', c='blue', markersize=3)
plt.show()
# Plotting practice 2: track train/validation loss and accuracy while
# checkpointing the best model.
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.preprocessing import LabelEncoder

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

wine_pre = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/wine.csv', header=None)
wine = wine_pre.sample(frac=1)   # frac=1 shuffles the whole dataset
dataset = wine.values
X = dataset[:, 0:12]
Y = dataset[:, 12]

model = Sequential()
model.add(Dense(30, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# Make sure the checkpoint directory exists.
# Bug fix: the if body was flush-left in the original paste (SyntaxError).
MODEL_DIR = './model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)

# Save a checkpoint only when val_loss improves.
modelpath = './model/{epoch:02d}-{val_loss:.4f}.hdf5'
Checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=1, save_best_only=True)
history = model.fit(X_train, Y_train, validation_split=0.2, epochs=1500, batch_size=200, verbose=0, callbacks=[Checkpointer])

# Collect the four curves recorded during training. (Renamed from the
# original's misleading y_loss/vloss, which swapped the val/train meaning.)
val_loss = history.history['val_loss']
train_loss = history.history['loss']
val_acc = history.history['val_accuracy']
train_acc = history.history['accuracy']

# Plot all four curves against the epoch index.
x_len = np.arange(len(val_acc))
plt.plot(x_len, val_acc, 'o', c='red', markersize=3, label='val_accuracy')
plt.plot(x_len, train_acc, 'o', c='blue', markersize=3, label='accuracy')
plt.plot(x_len, val_loss, 'o', c='purple', markersize=3, label='val_loss')
plt.plot(x_len, train_loss, 'o', c='green', markersize=3, label='loss')
plt.legend()
plt.show()
# EarlyStopping exercise: stop training once val_loss stops improving.
import pandas as pd
import numpy as np
import tensorflow as tf
import os
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

seed = 0
np.random.seed(seed)
tf.random.set_seed(seed)

wine_pre = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/wine.csv', header=None)
wine = wine_pre.sample(frac=0.15)   # use a random 15% of the rows
dataset = wine.values
X = dataset[:, 0:12]
Y = dataset[:, 12]

model = Sequential()
model.add(Dense(30, input_dim=12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Monitor val_loss and wait up to 100 epochs for an improvement before stopping.
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=100)

# Bug fix: create the checkpoint directory first — unlike the earlier
# sections, this one never created './model/', so ModelCheckpoint would
# fail when this block is run on its own.
MODEL_DIR = './model/'
if not os.path.exists(MODEL_DIR):
    os.mkdir(MODEL_DIR)
modelpath = './model/{epoch:02d}-{val_loss:.4f}.hdf5'
Checkpointer = ModelCheckpoint(filepath=modelpath, monitor='val_loss', verbose=1, save_best_only=True)

history = model.fit(X, Y, validation_split=0.33, epochs=3500, batch_size=500, callbacks=[early_stopping_callback, Checkpointer])

# val_loss: validation error per epoch; accuracy: training accuracy per epoch.
y_vloss = history.history['val_loss']
y_acc = history.history['accuracy']

# Plot accuracy in blue and validation loss in red.
x_len = np.arange(len(y_acc))
plt.plot(x_len, y_vloss, 'o', c='red', markersize=3, label='val_loss')
plt.plot(x_len, y_acc, 'o', c='blue', markersize=3, label='accuracy')
plt.legend()
plt.show()
print('\n Accuracy : %.4f'%(model.evaluate(X, Y)[1]))
'First step > AI 기초반' 카테고리의 다른 글
[TIL]21.07.19 비지도학습 (0) | 2021.07.19 |
---|---|
[TIL]21.07.16 케라스 실습 선형회귀 데이터 (0) | 2021.07.17 |
[TIL]21.07.14keras로 실습하기 (0) | 2021.07.14 |
[TIL]21.07.13 웹크롤링 (0) | 2021.07.13 |
[TIL] 21.07.13pandas 사용, (0) | 2021.07.13 |