https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
This Kaggle dataset covers 7 varieties of date fruit. Given the features that describe each sample, the task is to classify which variety it belongs to.
Import Modules
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import RandomizedSearchCV,train_test_split
# pandas needs the openpyxl engine to read .xlsx files
!pip install openpyxl
# Load the data
DATA_PATH = "/input/Date_Fruit_Datasets.xlsx"
data = pd.read_excel(DATA_PATH)
data.head()
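As a quick sanity check (added here, not in the original notebook), we can confirm the shape of the data and that all 7 varieties are present:
# Quick sanity check: row/column count and per-class sample counts
print(data.shape)
print(data["Class"].value_counts())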
Check the data for null values and inspect its distribution
data.isnull().sum()
#outputs
AREA 0
PERIMETER 0
MAJOR_AXIS 0
MINOR_AXIS 0
ECCENTRICITY 0
EQDIASQ 0
SOLIDITY 0
CONVEX_AREA 0
EXTENT 0
ASPECT_RATIO 0
ROUNDNESS 0
COMPACTNESS 0
SHAPEFACTOR_1 0
SHAPEFACTOR_2 0
SHAPEFACTOR_3 0
SHAPEFACTOR_4 0
MeanRR 0
MeanRG 0
MeanRB 0
StdDevRR 0
StdDevRG 0
StdDevRB 0
SkewRR 0
SkewRG 0
SkewRB 0
KurtosisRR 0
KurtosisRG 0
KurtosisRB 0
EntropyRR 0
EntropyRG 0
EntropyRB 0
ALLdaub4RR 0
ALLdaub4RG 0
ALLdaub4RB 0
Class 0
dtype: int64
data.describe()
Split into features and label, then apply StandardScaler()
raw_features = data.iloc[:,:-1]
label = data.iloc[:,-1]
raw_features
scaler = StandardScaler()
features = pd.DataFrame(scaler.fit_transform(raw_features), columns= raw_features.columns)
features.head()
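As a small verification (an added sketch, not in the original post), every scaled column should now have roughly zero mean and unit standard deviation:
# After StandardScaler, each feature should have mean ≈ 0 and std ≈ 1
print(features.mean().abs().max())  # close to 0
print(features.std().min(), features.std().max())  # both close to 1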
Dimensionality reduction with PCA
Since there are many features, we use dimensionality reduction to keep only the most informative components.
from sklearn.decomposition import PCA
n_components= 15
pca = PCA(n_components= n_components)
reduced_features = pd.DataFrame(pca.fit_transform(features), columns = [f"PC{i+1}" for i in range(n_components)])
reduced_features
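The choice of n_components = 15 is a judgment call. One common way to sanity-check it (a sketch added here, not in the original post) is the cumulative explained variance ratio, or letting PCA pick the component count for a target variance:
# Cumulative share of variance explained by the first 15 components
print(pca.explained_variance_ratio_.cumsum())

# Alternative: a float n_components keeps the smallest number of
# components that explains at least 95% of the variance
pca_95 = PCA(n_components=0.95)
pca_95.fit(features)
print(pca_95.n_components_)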
Concatenate the Class label back onto the reduced DataFrame.
full_data = pd.concat([reduced_features,label],axis=1)
full_data
Split Train, Test Data
from sklearn.model_selection import StratifiedShuffleSplit
# note: no random_state is set, so the split changes on every run
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2)
for train_index, test_index in split.split(full_data, full_data["Class"]):
    train = full_data.loc[train_index]
    test = full_data.loc[test_index]
X_train = train.drop("Class", axis=1)
y_train = train["Class"]
X_test = test.drop("Class", axis=1)
y_test = test["Class"]
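Because the split is stratified on Class, the label proportions in train and test should match almost exactly; a quick check (assuming the cells above have run):
# Per-class proportions should be nearly identical across the two splits
print(y_train.value_counts(normalize=True).round(3))
print(y_test.value_counts(normalize=True).round(3))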
Finding the best hyperparameters (RandomizedSearchCV)
We use CatBoostClassifier as the classifier.
from catboost import CatBoostClassifier
cb_clf = CatBoostClassifier(task_type ="GPU",silent=True)
param_dict = {
    "early_stopping_rounds": np.arange(5, 15),
    "learning_rate": np.linspace(0.05, 0.15, 10),
    "n_estimators": np.arange(100, 200, 10),
    "max_depth": np.arange(1, 10),
}
# RandomizedSearchCV samples 10 candidate combinations by default (n_iter=10)
rscv = RandomizedSearchCV(cb_clf, param_dict, scoring="accuracy", cv=10)
rscv.fit(X_train,y_train)
print(rscv.best_params_)
print(rscv.best_score_)
#outputs
{'n_estimators': 170, 'max_depth': 6, 'learning_rate': 0.07222222222222222, 'early_stopping_rounds': 13}
0.9108176838810642
Rather than typing out each of the best parameters by hand, pass them along with **rscv.best_params_.
from sklearn.metrics import classification_report
cb_clf = CatBoostClassifier(task_type="GPU", silent=True, **rscv.best_params_)
cb_clf.fit(X_train,y_train)
y_pred = cb_clf.predict(X_test)
# classification_report expects (y_true, y_pred), in that order
print(classification_report(y_test, y_pred))
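Beyond the per-class report, a confusion matrix shows which varieties get confused with each other (a sketch added here, not in the original post):
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Rows = true class, columns = predicted class; np.ravel() guards against
# CatBoost returning predictions shaped (n, 1)
labels = sorted(y_test.unique())
cm = confusion_matrix(y_test, np.ravel(y_pred), labels=labels)
print(pd.DataFrame(cm, index=labels, columns=labels))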
Prediction with LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
lrcv = LogisticRegressionCV(cv=10, n_jobs=-1)
lrcv.fit(X_train,y_train)
y_pred = lrcv.predict(X_test)
print(classification_report(y_test,y_pred))
LogisticRegression hyperparameter tuning
lrcv = LogisticRegressionCV(cv=10, n_jobs=-1, max_iter=10000)
param_dict = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear'],
    penalty=["l1", "l2", "elasticnet"],
    Cs=[100, 10, 1.0, 0.1, 0.01]
)
# note: not every solver supports every penalty (e.g. 'elasticnet' needs
# 'saga'); invalid combinations fail during fitting and are scored as NaN
rscv = RandomizedSearchCV(lrcv, param_dict, scoring='accuracy', cv=10)
rscv.fit(X_train, y_train)
print(rscv.best_params_)
print(rscv.best_score_)
#outputs
{'solver': 'liblinear', 'penalty': 'l2', 'Cs': 10}
0.8969679186228483
Predict with logistic regression using the best parameters
lrcv = LogisticRegressionCV(**rscv.best_params_)
lrcv.fit(X_train,y_train)
y_pred = lrcv.predict(X_test)
print(classification_report(y_test, y_pred))
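To put the two tuned models side by side (a sketch assuming the fitted cb_clf and lrcv from the cells above are still in scope):
import numpy as np
from sklearn.metrics import accuracy_score

# Test-set accuracy of tuned CatBoost vs. tuned logistic regression
print("CatBoost:", accuracy_score(y_test, np.ravel(cb_clf.predict(X_test))))
print("LogisticRegressionCV:", accuracy_score(y_test, lrcv.predict(X_test)))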