[ML_7] Classification with DecisionTreeClassifier
#%%
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_curve,roc_auc_score, f1_score, precision_recall_curve
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# features.txt lists one "index name" pair per line for the 561 features.
feature_name_df = pd.read_csv("data/har_dataset/features.txt", sep = r'\s+', header = None, names = ['column_index', 'column_name'])
feature_name = feature_name_df.iloc[:, 1].values.tolist()
#%%
# groupby test on a toy frame
df = pd.DataFrame({'column_name' : ['a','b','c','a','a','b'], 'col2' : ['A','B','C','A','A','B']})
print(df.groupby('column_name').groups)      # group label -> row indices
dp_df = df.groupby('column_name').count()    # non-null count per group, per column
# df.groupby('column_name').size() would give the same counts as a Series
ddf = dp_df.count()                          # number of groups per column
#%%
# check for duplicated feature names
feature_dp_df = feature_name_df.groupby('column_name').count()
print(feature_dp_df[feature_dp_df['column_index'] > 1].count())
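#%%
# Why de-duplicate? Recent pandas versions raise "ValueError: Duplicate names
# are not allowed" when read_csv gets a names list with repeats, so duplicated
# feature names must be renamed first. A minimal sketch of the renaming idea:
# groupby().cumcount() numbers repeated values within each group.
toy = pd.DataFrame({'column_name': ['a', 'b', 'a', 'a']})
toy['dup_cnt'] = toy.groupby('column_name').cumcount()
print(toy)  # dup_cnt is 0, 0, 1, 2 -> suffix every name with dup_cnt > 0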
#%%
def get_new_features(old_feature_name_df):
    # cumcount() numbers repeated names 0, 1, 2, ... within each group
    feature_dup_df = pd.DataFrame(old_feature_name_df.groupby('column_name').cumcount(), columns = ['dup_cnt'])
    new_feature_name_df = pd.merge(old_feature_name_df.reset_index(), feature_dup_df,
                                   left_index = True, right_index = True, how = 'outer')
    # append "_<dup_cnt>" to the second and later occurrences of a name
    new_feature_name_df['column_name'] = new_feature_name_df[['column_name', 'dup_cnt']].apply(
        lambda x : x.iloc[0] + '_' + str(x.iloc[1]) if x.iloc[1] > 0 else x.iloc[0], axis = 1)
    new_feature_name_df = new_feature_name_df.drop(['index', 'dup_cnt'], axis = 1)
    return new_feature_name_df
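#%%
# Quick sanity check of get_new_features on a made-up frame (toy names here,
# not the real HAR features): repeats get "_1", "_2", ... suffixes.
toy_df = pd.DataFrame({'column_index': [1, 2, 3],
                       'column_name': ['tBodyAcc-mean', 'tBodyAcc-mean', 'tBodyAcc-std']})
print(get_new_features(toy_df))  # second 'tBodyAcc-mean' becomes 'tBodyAcc-mean_1'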
#%%
def get_human_dataset():
    # rebuild the de-duplicated feature names, then use them as column names
    feature_name_df = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/features.txt", sep = r'\s+', header = None, names = ['column_index', 'column_name'])
    new_feature_name_df = get_new_features(feature_name_df)
    feature_name = new_feature_name_df.iloc[:, 1].values.tolist()
    X_train = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/train/X_train.txt", sep = r"\s+", names = feature_name)
    X_test = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/test/X_test.txt", sep = r"\s+", names = feature_name)
    y_train = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/train/y_train.txt", sep = r"\s+", header = None, names = ['action'])
    y_test = pd.read_csv("C:/Users/82103/Desktop/AI/data/har_dataset/test/y_test.txt", sep = r"\s+", header = None, names = ['action'])
    return X_train, X_test, y_train, y_test

X_train, X_test, y_train, y_test = get_human_dataset()
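#%%
# Sanity check on the loaded split. Per the UCI HAR documentation the dataset
# should have 7352 train / 2947 test rows over 561 features, with six activity
# labels coded 1-6 (figures from the docs, not verified in this notebook).
print(X_train.shape, X_test.shape)
print(y_train['action'].value_counts())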
#%%
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
dtc_ml = DecisionTreeClassifier(random_state = 121)
dtc_ml.fit(X_train, y_train)
y_pred = dtc_ml.predict(X_test)
acc_score = accuracy_score(y_test, y_pred)
print(f"acc_score : {acc_score}")
print(f"dtc_ml_params : {dtc_ml.get_params()}")
#%%
from sklearn.model_selection import GridSearchCV
params = {
'max_depth' : [6,8,10,12,16,20,24],
'min_samples_split' : [16]
}
grid_cv = GridSearchCV(dtc_ml, param_grid= params, scoring = 'accuracy', cv = 5, verbose = 1)
grid_cv.fit(X_train, y_train)
print(f"Gird_cv best score : {grid_cv.best_score_}")
print(f"Grid_cb best parameter : {grid_cv.best_params_}")
#%%
cv_results_df = pd.DataFrame(grid_cv.cv_results_)
cv_results_df
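#%%
# cv_results_ has many columns; these three are enough to compare depths.
cv_results_df[['param_max_depth', 'mean_test_score', 'rank_test_score']]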
#%%
params = {
'max_depth' : [8,10,12,16,20],
'min_samples_split' : [16,24]
}
grid_cv = GridSearchCV(dtc_ml, param_grid= params, scoring = 'accuracy', cv = 5, verbose = 1)
grid_cv.fit(X_train, y_train)
print(f" Grid cv best score : {grid_cv.best_score_}")
print(f"Grid cv best parameter sets : {grid_cv.best_params_}")
#%%
best_dtc = grid_cv.best_estimator_
pred1 = best_dtc.predict(X_test)
accuracy = accuracy_score(y_test, pred1)  # score the tuned tree, not the earlier baseline y_pred
print(f"accuracy_score : {accuracy}")
#%%
import seaborn as sns
import matplotlib.pyplot as plt
ftr_importances_values = best_dtc.feature_importances_
ftr_importances = pd.Series(ftr_importances_values, index = X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]
plt.figure(figsize = (8,6))
plt.title("Feature importances Top 20")
sns.barplot(x=ftr_top20, y = ftr_top20.index)
plt.show()
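#%%
# Optional: peek at the top of the tuned tree itself (a sketch assuming
# scikit-learn >= 0.21 for plot_tree; truncated to depth 2 for readability).
from sklearn.tree import plot_tree
plt.figure(figsize = (12, 6))
plot_tree(best_dtc, max_depth = 2, feature_names = list(X_train.columns),
          class_names = [str(c) for c in best_dtc.classes_], filled = True, fontsize = 8)
plt.show()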