# Decision-tree classification experiments: trains a DecisionTreeClassifier on
# sign_mnist_train.csv and evaluates it on sign_mnist_test.csv.
from time import perf_counter
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

# Load the train and test splits from CSV files in the current working
# directory. Both frames are expected to contain a "label" column, which the
# script separates from the feature columns further down.
print("Loading data")
train_df = pd.read_csv("sign_mnist_train.csv")
test_df = pd.read_csv("sign_mnist_test.csv")

def remove_every_nth_column(df, count):
    """Return *df* without every ``count``-th column.

    Columns are counted by position starting at 1, so the columns at
    positions ``count``, ``2 * count``, ... are dropped and all remaining
    columns are kept in their original order.

    Args:
        df: DataFrame whose columns are filtered.
        count: Positive step between dropped columns; ``count=1`` drops
            every column.

    Returns:
        The selection of *df* obtained with ``df.loc`` and a boolean column
        mask.

    Raises:
        ValueError: If *count* is smaller than 1.
    """
    if count < 1:
        # With count == 0 NumPy evaluates ``% 0`` with only a RuntimeWarning
        # and produces a meaningless mask, so fail loudly instead.
        raise ValueError(f"count must be a positive integer, got {count!r}")
    positions = np.arange(len(df.columns)) + 1  # 1-based column positions
    return df.loc[:, positions % count != 0]

def leave_every_nth_column(df, count):
    """Return only every ``count``-th column of *df*.

    Columns are counted by position starting at 1, so the columns at
    positions ``count``, ``2 * count``, ... are kept (in their original
    order) and all others are dropped.

    Args:
        df: DataFrame whose columns are filtered.
        count: Positive step between kept columns; ``count=1`` keeps every
            column.

    Returns:
        The selection of *df* obtained with ``df.loc`` and a boolean column
        mask.

    Raises:
        ValueError: If *count* is smaller than 1.
    """
    if count < 1:
        # With count == 0 NumPy evaluates ``% 0`` with only a RuntimeWarning
        # and produces a meaningless mask, so fail loudly instead.
        raise ValueError(f"count must be a positive integer, got {count!r}")
    positions = np.arange(len(df.columns)) + 1  # 1-based column positions
    return df.loc[:, positions % count == 0]

# Separate the feature columns from the "label" target column of each split.
X_train = train_df.drop(columns=["label"])
X_test = test_df.drop(columns=["label"])
y_train = train_df["label"]
y_test = test_df["label"]

# Column-subsampling experiments, all disabled. Each tuple comment records the
# experiment name and the score noted for it (presumably test-set accuracy --
# confirm before relying on the numbers). Uncomment a pair of lines to apply
# the same column filter to both X_train and X_test.

# ('leave_every_nth_column_5', 0.6002509760178472)
# X_train = leave_every_nth_column(X_train, 5)
# X_test = leave_every_nth_column(X_test, 5)

# ('remove_every_nth_column_15', 0.6167038482989403)
# X_train = remove_every_nth_column(X_train, 15)
# X_test = remove_every_nth_column(X_test, 15)

# ('remove_every_nth_column_14', 0.6179587283881762)
# X_train = remove_every_nth_column(X_train, 14)
# X_test = remove_every_nth_column(X_test, 14)

# ('remove_every_nth_column_11', 0.6183770217512549)
# X_train = remove_every_nth_column(X_train, 11)
# X_test = remove_every_nth_column(X_test, 11)

# Fit a decision tree with cost-complexity pruning and print how long it took.
# perf_counter() is used instead of time() because it is a monotonic,
# high-resolution clock meant for measuring elapsed intervals.
start = perf_counter()
ccp_alpha = 0.0009  # pruning strength handed to DecisionTreeClassifier
print("Training", ccp_alpha)
# NOTE(review): random_state is not set, so repeated runs may yield slightly
# different trees and scores -- confirm whether reproducibility is needed.
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
decision_tree.fit(X_train, y_train)
print(perf_counter() - start)  # elapsed seconds

# Optional experiment: plot test-set accuracy against ccp_alpha.
# Disabled by default; set the flag to True to run it.
PLOT_ALPHA_SWEEP = False
if PLOT_ALPHA_SWEEP:
    print("Viz")
    # The candidate alphas were previously read from an undefined name
    # (ccp_alphas). Take them from the cost-complexity pruning path computed
    # on the training data.
    pruning_path = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, y_train)
    sweep_alphas = pruning_path.ccp_alphas
    # One tree is fitted per candidate alpha, so this can take a long time.
    # Local names are distinct from the module-level ccp_alpha / accuracy so
    # the sweep does not overwrite them.
    sweep_accuracies = [
        DecisionTreeClassifier(ccp_alpha=alpha).fit(X_train, y_train).score(X_test, y_test)
        for alpha in sweep_alphas
    ]

    plt.plot(np.asarray(sweep_alphas), np.asarray(sweep_accuracies))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()

# Evaluate the fitted tree on the test split and print its accuracy in percent.
pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
# The tree is fitted with a non-zero ccp_alpha, so the former "Unprunned:"
# label was misleading; report the pruning strength actually used instead.
print(f"Decision tree (ccp_alpha={decision_tree.ccp_alpha}):")
print(f"Accuracy: {accuracy:.2f}%")
# Disabled: 10-fold cross-validation of the same estimator on the test split.
# print("10-fold cross-validation score: ", cross_val_score(decision_tree, X_test, y_test, cv=10).mean())

# Optional: draw the fitted tree with matplotlib.
# Disabled by default; set the flag to True to show the figure.
PLOT_TREE = False
if PLOT_TREE:
    plt.figure(figsize=(12, 6))
    plot_tree(decision_tree, filled=True, feature_names=X_test.columns.tolist())
    plt.show()

# Optional experiment: fit a more strongly pruned tree and report its scores.
# Disabled by default; set the flag to True to run it.
COMPARE_PRUNED_TREE = False
if COMPARE_PRUNED_TREE:
    # Pruning
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    print("Prunned:")
    # Score against the true labels. The previous code compared with the other
    # model's predictions, which measures agreement between the two trees
    # rather than accuracy.
    pruned_pred = decision_tree_prunned.predict(X_test)
    print(f"Accuracy: {accuracy_score(y_test, pruned_pred) * 100:.2f}%")
    # NOTE(review): cross_val_score re-fits clones of the estimator on folds of
    # the *test* split here -- confirm that this is intended rather than
    # cross-validating on the training split.
    print("10-fold cross-validation score: ", cross_val_score(decision_tree_prunned, X_test, y_test, cv=10).mean())

# Report the fitted tree's feature importances: first the raw values, then the
# same values laid out as a square pixel grid, printed as comma-separated
# rows (one image row per line).
IMAGE_SIDE = 28  # the grid is IMAGE_SIDE x IMAGE_SIDE pixels

importances = decision_tree.feature_importances_
feat_importances = pd.DataFrame(importances, index=X_train.columns, columns=["Importance"])
print(importances)
print(len(importances))
print(X_train.columns)

# Every cell starts as the integer 0; cells whose pixel column is not present
# in X_train (e.g. after column filtering) keep that value.
image = [[0] * IMAGE_SIDE for _ in range(IMAGE_SIDE)]

for column, importance in zip(X_train.columns, importances):
    # Column names are expected to look like "pixel<k>" with a 1-based k;
    # convert k to a 0-based index and then to (row, column) in the grid.
    pixel_idx = int(column.removeprefix("pixel")) - 1
    pixel_y, pixel_x = divmod(pixel_idx, IMAGE_SIDE)
    image[pixel_y][pixel_x] = importance

for row in image:
    print(",".join(str(value) for value in row))

# Disabled alternative view: bar chart of the importances, largest first.
# feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
# feat_importances.plot(kind='bar', figsize=(8,6))
# plt.show()