[ML_7] Classification using DecisionTreeClassifier(+)


 

#%%
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, roc_curve,roc_auc_score, f1_score, precision_recall_curve 
from sklearn.model_selection import train_test_split, GridSearchCV 
from sklearn.preprocessing import LabelEncoder 
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

feature_name_df = pd.read_csv("data/har_dataset/features.txt", sep = r'\s+', header = None, names = ['column_index', 'column_name'])  

feature_name = feature_name_df.iloc[:,1].values.tolist() 
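
Before moving on, a quick look at what was just loaded helps. The cell below is a minimal check; the expected shape of (561, 2) assumes the standard UCI HAR features.txt.

#%%
# Sanity check on the loaded feature list (sketch; assumes the standard UCI HAR layout).
print(feature_name_df.shape)        # expected (561, 2) for the standard UCI HAR dataset
print(feature_name[:5])             # first few feature names, e.g. 'tBodyAcc-mean()-X'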

#%%
# groupby test 
df = pd.DataFrame({'column_name' : ['a','b','c','a','a','b'], 'col2' : ['A','B','C','A','A','B']}) 
print(df.groupby('column_name').groups)       # mapping from each key to its row index labels
dp_df = df.groupby('column_name').count()     # per-group non-null count of the remaining column
# dp_df = df.groupby('column_name').size()    # size() returns the number of rows per group instead

ddf = dp_df.count()                           # number of groups, counted per column of dp_df

 
#%%
# check for duplicated feature names 
feature_dp_df = feature_name_df.groupby('column_name').count() 
print(feature_dp_df[feature_dp_df['column_index'] > 1].count())  
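
The count above only says how many names are duplicated; a short sketch like the one below can list the duplicated names themselves.

#%%
# Sketch: list the duplicated feature names, not just their count.
dup_names = feature_dp_df[feature_dp_df['column_index'] > 1].index.tolist()
print(len(dup_names), dup_names[:5])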

#%% 
def get_new_features(old_feature_name_df): 
    # cumcount() numbers repeated names 0, 1, 2, ... within each group of identical names
    feature_dup_df = pd.DataFrame(old_feature_name_df.groupby('column_name').cumcount(), columns = ['dup_cnt']) 
    
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df, left_index = True, right_index = True, how = 'outer') 
    
    # keep the first occurrence as-is and append _1, _2, ... to later duplicates
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(lambda x : x.iloc[0] + '_' + str(x.iloc[1]) if x.iloc[1] > 0 else x.iloc[0], axis = 1) 
    
    new_feature_name_df = new_feature_name_df.drop(['index', 'dup_cnt'], axis = 1) 
    
    return new_feature_name_df  
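
A small usage check makes the renaming rule concrete. The toy frame below is hypothetical and only shows that the first occurrence keeps its name while later duplicates get _1, _2, ... appended.

#%%
# Sketch: verify the deduplication rule on a hypothetical feature table.
toy = pd.DataFrame({'column_index': [1, 2, 3, 4],
                    'column_name': ['acc_x', 'acc_x', 'gyro_y', 'acc_x']})
print(get_new_features(toy)['column_name'].tolist())
# expected: ['acc_x', 'acc_x_1', 'gyro_y', 'acc_x_2']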



#%% 
import pandas as pd 

def get_human_dataset(): 
    feature_name_df = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/features.txt", sep = r'\s+', header = None, names = ['column_index', 'column_name'])  
    
    new_feature_name_df = get_new_features(feature_name_df) 
    
    feature_name = new_feature_name_df.iloc[:,1].values.tolist() 
    
    X_train = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/train/X_train.txt", sep = r"\s+", names = feature_name) 
    X_test = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/test/X_test.txt", sep = r"\s+", names = feature_name) 
    
    y_train = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/train/y_train.txt", sep = r"\s+", header = None, names = ['action']) 
    y_test = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/test/y_test.txt",sep = r"\s+", header = None, names= ['action']) 
    
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_human_dataset() 
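
A quick inspection of the loaded data; the shapes quoted in the comments assume the standard UCI HAR train/test split.

#%%
# Sketch: basic inspection of the HAR data (shapes and class balance).
print(X_train.shape, X_test.shape)        # roughly (7352, 561) and (2947, 561) in the standard split
print(y_train['action'].value_counts())   # six activity labels, encoded 1..6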


#%% 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score 

dtc_ml = DecisionTreeClassifier(random_state = 121) 
dtc_ml.fit(X_train, y_train) 
y_pred = dtc_ml.predict(X_test) 
acc_score = accuracy_score(y_test, y_pred) 
print(f"acc_score : {acc_score}") 
print(f"dtc_ml_params : {dtc_ml.get_params()}") 

#%% 
from sklearn.model_selection import GridSearchCV 

params = {
    'max_depth' : [6,8,10,12,16,20,24],
    'min_samples_split' : [16]
    }

grid_cv = GridSearchCV(dtc_ml, param_grid= params, scoring = 'accuracy', cv = 5, verbose = 1) 
grid_cv.fit(X_train, y_train) 
print(f"Gird_cv best score : {grid_cv.best_score_}") 
print(f"Grid_cb best parameter : {grid_cv.best_params_}") 
#%% 
cv_results_df = pd.DataFrame(grid_cv.cv_results_) 
cv_results_df 
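
cv_results_ is a wide table; a minimal way to read it is to pull out max_depth against the mean cross-validation accuracy.

#%%
# Sketch: mean CV accuracy per max_depth, from the GridSearchCV results table.
cv_results_df[['param_max_depth', 'mean_test_score']]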

#%% 
params = {
    'max_depth' : [8,10,12,16,20], 
    'min_samples_split' : [16,24]
    }

grid_cv = GridSearchCV(dtc_ml, param_grid= params, scoring = 'accuracy', cv = 5, verbose = 1) 
grid_cv.fit(X_train, y_train) 
print(f" Grid cv best score : {grid_cv.best_score_}") 
print(f"Grid cv best parameter sets : {grid_cv.best_params_}") 

#%% 
best_dtc = grid_cv.best_estimator_ 
pred1 = best_dtc.predict(X_test) 
accuracy = accuracy_score(y_test, pred1)    # evaluate the tuned tree, not the baseline prediction
print(f"accuracy_score : {accuracy}") 

#%% 
import seaborn as sns 
import matplotlib.pyplot as plt 
ftr_importances_values = best_dtc.feature_importances_ 
ftr_importances = pd.Series(ftr_importances_values, index = X_train.columns) 

ftr_top20 = ftr_importances.sort_values(ascending=False)[:20] 
plt.figure(figsize = (8,6)) 
plt.title("Feature importances Top 20") 
sns.barplot(x=ftr_top20, y = ftr_top20.index) 
plt.show()
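
Feature importances only rank inputs; to see how the top features are actually used for splits, the first couple of levels of the tuned tree can be drawn. plot_tree is available in scikit-learn 0.21 and later; max_depth = 2 here is just to keep the figure readable.

#%%
# Sketch: draw only the top levels of the tuned tree.
from sklearn.tree import plot_tree
plt.figure(figsize=(14, 8))
plot_tree(best_dtc, max_depth=2, feature_names=list(X_train.columns), filled=True, fontsize=8)
plt.show()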