machine-learing-methods/Lab2/5.2/part2.py
2023-12-06 00:00:07 +02:00

107 lines
3.8 KiB
Python

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from time import time
# Read the train and test CSV files; both paths are relative to the
# current working directory.
print("Loading data")
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("sign_mnist_train.csv", "sign_mnist_test.csv")
)
def remove_every_nth_column(df, count):
    """Return ``df`` without every ``count``-th column.

    Columns are counted by 1-based position, so ``count=3`` drops the 3rd,
    6th, 9th, ... columns and keeps the rest in their original order.

    Args:
        df: DataFrame whose columns are filtered.
        count: Positive step; every column whose 1-based position is a
            multiple of it is removed.

    Returns:
        A DataFrame containing only the columns that were kept.

    Raises:
        ValueError: If ``count`` is smaller than 1.
    """
    # A zero step would hit NumPy's integer modulo-by-zero (which yields 0
    # with only a warning) and silently drop every column.
    if count < 1:
        raise ValueError(f"count must be a positive integer, got {count!r}")
    positions = np.arange(len(df.columns)) + 1
    return df.loc[:, positions % count != 0]
def leave_every_nth_column(df, count):
    """Return only every ``count``-th column of ``df``.

    Columns are counted by 1-based position, so ``count=3`` keeps the 3rd,
    6th, 9th, ... columns (in their original order) and drops the rest.

    Args:
        df: DataFrame whose columns are filtered.
        count: Positive step; only columns whose 1-based position is a
            multiple of it are kept.

    Returns:
        A DataFrame containing only the columns that were kept.

    Raises:
        ValueError: If ``count`` is smaller than 1.
    """
    # A zero step would hit NumPy's integer modulo-by-zero (which yields 0
    # with only a warning) and silently keep every column.
    if count < 1:
        raise ValueError(f"count must be a positive integer, got {count!r}")
    positions = np.arange(len(df.columns)) + 1
    return df.loc[:, positions % count == 0]
# Separate the "label" target column from the feature columns of each split.
y_train = train_df["label"]
y_test = test_df["label"]
X_train = train_df.drop(columns=["label"])
X_test = test_df.drop(columns=["label"])

# Disabled column-subsampling experiments. Each pair of calls is preceded by
# the (experiment name, score) tuple that was recorded for it -- presumably
# a test accuracy; TODO confirm. Uncomment one pair to re-run it.
# # ('leave_every_nth_column_5', 0.6002509760178472)
# X_train = leave_every_nth_column(X_train, 5)
# X_test = leave_every_nth_column(X_test, 5)
# # ('remove_every_nth_column_15', 0.6167038482989403)
# X_train = remove_every_nth_column(X_train, 15)
# X_test = remove_every_nth_column(X_test, 15)
# # ('remove_every_nth_column_14', 0.6179587283881762)
# X_train = remove_every_nth_column(X_train, 14)
# X_test = remove_every_nth_column(X_test, 14)
# # ('remove_every_nth_column_11', 0.6183770217512549)
# X_train = remove_every_nth_column(X_train, 11)
# X_test = remove_every_nth_column(X_test, 11)
# Fit a decision tree with cost-complexity pruning strength `ccp_alpha` and
# print how many seconds the fit took.
start = time()
ccp_alpha = 0.0009
print("Training", ccp_alpha)
# `fit` returns the estimator itself, so construction and fitting can chain.
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha).fit(X_train, y_train)
elapsed_seconds = time() - start
print(elapsed_seconds)
# Optional sweep (disabled): train one tree per candidate pruning strength
# and plot test-set accuracy against alpha.
if False:
    print("Viz")
    # The candidate alphas were never defined in the original block, so
    # enabling it raised NameError. Take them from the cost-complexity
    # pruning path of a tree fitted on the training data.
    pruning_path = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, y_train)
    ccp_alphas = pruning_path.ccp_alphas
    accuracy_collect = []
    # Use a dedicated loop name so the module-level `ccp_alpha` used for the
    # main model is not overwritten by the sweep.
    for alpha in ccp_alphas:
        tree = DecisionTreeClassifier(ccp_alpha=alpha)
        tree.fit(X_train, y_train)
        accuracy_collect.append(tree.score(X_test, y_test))
    plt.plot(np.asarray(ccp_alphas), np.array(accuracy_collect))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()
# Score the fitted tree on the test split and report accuracy as a percentage.
# NOTE(review): the label below says "Unprunned", but `decision_tree` is
# constructed with a non-zero ccp_alpha -- confirm the intended wording.
pred = decision_tree.predict(X_test)
accuracy = 100 * accuracy_score(y_test, pred)
print("Unprunned:", f"Accuracy: {accuracy:.2f}%", sep="\n")
#print("10-fold cross-validation score: ", cross_val_score(decision_tree, X_test, y_test, cv=10).mean())
# Optional rendering of the fitted tree (disabled).
if False:
    feature_names = X_test.columns.tolist()
    plt.figure(figsize=(12, 6))
    plot_tree(decision_tree, feature_names=feature_names, filled=True)
    plt.show()
# Optional comparison against a more aggressively pruned tree (disabled).
if False:
    # Pruning
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    pred_prunned = decision_tree_prunned.predict(X_test)
    print("Prunned:")
    # Score against the true labels. The original compared the pruned
    # predictions with `pred` (the other model's predictions), which measures
    # agreement between the two models rather than accuracy.
    print(f"Accuracy: {accuracy_score(y_test, pred_prunned) * 100:.2f}%")
    # NOTE(review): this cross-validates on the test split only (the model is
    # refit on folds of X_test); confirm that is intended rather than X_train.
    print("10-fold cross-validation score: ", cross_val_score(decision_tree_prunned, X_test, y_test, cv=10).mean())
# Print the tree's feature importances, then lay them out on a 28x28 grid
# (one cell per "pixelN" column, N counted from 1) and print it as CSV rows.
if True:
    feat_importances = pd.DataFrame(
        decision_tree.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    importances = decision_tree.feature_importances_
    print(importances)
    print(len(importances))
    print(X_train.columns)

    # Cells with no matching column keep the integer 0 placeholder.
    image = [[0] * 28 for _ in range(28)]
    for column_name, importance in zip(X_train.columns, importances):
        # Column "pixelN" maps to row-major cell N-1 of the grid.
        pixel_y, pixel_x = divmod(int(column_name.removeprefix("pixel")) - 1, 28)
        image[pixel_y][pixel_x] = importance
    for row in image:
        print(",".join(map(str, row)))
    # feat_importances = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
    # feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
    # feat_importances.plot(kind='bar', figsize=(8,6))
    # plt.show()