"""Train and inspect a cost-complexity-pruned decision tree on pixel data.

Reads ``sign_mnist_train.csv`` and ``sign_mnist_test.csv`` from the working
directory, fits a ``DecisionTreeClassifier`` on the training rows, prints the
test-set accuracy and then prints the per-pixel feature importances laid out
as a 28x28 comma-separated grid.

Optional diagnostics (alpha sweep plot, tree plot, second pruned model) are
disabled by default and can be switched on with the flags defined below.
"""

from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree

# Side length of the square grid the importances are printed into.
IMAGE_SIDE = 28

# Toggles for the optional parts of the script.
RUN_ALPHA_SWEEP = False  # plot test accuracy against ccp_alpha
SHOW_TREE_PLOT = False  # draw the fitted tree with matplotlib
RUN_PRUNED_COMPARISON = False  # fit a second tree with ccp_alpha=0.01
PRINT_FEATURE_IMPORTANCES = True  # dump importances as a 28x28 grid


def remove_every_nth_column(df: pd.DataFrame, count: int) -> pd.DataFrame:
    """Return *df* without the columns whose 1-based position is a multiple of *count*."""
    return df.loc[:, (np.arange(len(df.columns)) + 1) % count != 0]


def leave_every_nth_column(df: pd.DataFrame, count: int) -> pd.DataFrame:
    """Return only the columns of *df* whose 1-based position is a multiple of *count*."""
    return df.loc[:, (np.arange(len(df.columns)) + 1) % count == 0]


print("Loading data")
train_df = pd.read_csv("sign_mnist_train.csv")
test_df = pd.read_csv("sign_mnist_test.csv")

# Everything except the "label" column is used as a feature.
X_train, X_test = train_df.drop(columns=["label"]), test_df.drop(columns=["label"])
y_train, y_test = train_df["label"], test_df["label"]

# Column-subsampling experiments, kept for reference.  The tuples look like
# (experiment name, accuracy) recorded from earlier runs -- TODO confirm.
#
# # ('leave_every_nth_column_5', 0.6002509760178472)
# X_train = leave_every_nth_column(X_train, 5)
# X_test = leave_every_nth_column(X_test, 5)
#
# # ('remove_every_nth_column_15', 0.6167038482989403)
# X_train = remove_every_nth_column(X_train, 15)
# X_test = remove_every_nth_column(X_test, 15)
#
# # ('remove_every_nth_column_14', 0.6179587283881762)
# X_train = remove_every_nth_column(X_train, 14)
# X_test = remove_every_nth_column(X_test, 14)
#
# # ('remove_every_nth_column_11', 0.6183770217512549)
# X_train = remove_every_nth_column(X_train, 11)
# X_test = remove_every_nth_column(X_test, 11)

start = time()
ccp_alpha = 0.0009
print("Training", ccp_alpha)
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
decision_tree.fit(X_train, y_train)
print(time() - start)  # elapsed training time in seconds

if RUN_ALPHA_SWEEP:
    print("Viz")
    # The candidate alphas were never defined in the original script; derive
    # them from the cost-complexity pruning path of an unpruned tree.
    pruning_path = DecisionTreeClassifier().cost_complexity_pruning_path(
        X_train, y_train
    )
    ccp_alphas = pruning_path.ccp_alphas

    ccp_alphas_collect = []
    accuracy_collect = []
    # Loop variable is deliberately not called `ccp_alpha` so the value used
    # for the main model above is not overwritten.
    for alpha in ccp_alphas:
        tree = DecisionTreeClassifier(ccp_alpha=alpha)
        tree.fit(X_train, y_train)
        ccp_alphas_collect.append(alpha)
        accuracy_collect.append(tree.score(X_test, y_test))

    plt.plot(np.array(ccp_alphas_collect), np.array(accuracy_collect))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()

pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
print("Unprunned:")
print(f"Accuracy: {accuracy:.2f}%")
# print("10-fold cross-validation score: ", cross_val_score(decision_tree, X_test, y_test, cv=10).mean())

if SHOW_TREE_PLOT:
    plt.figure(figsize=(12, 6))
    plot_tree(decision_tree, filled=True, feature_names=X_test.columns.tolist())
    plt.show()

if RUN_PRUNED_COMPARISON:
    # Pruning
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    # Score against the true labels.  The original compared against `pred`
    # (the first tree's predictions), which measures agreement between the
    # two models rather than accuracy.
    prunned_accuracy = accuracy_score(y_test, decision_tree_prunned.predict(X_test)) * 100
    print("Prunned:")
    print(f"Accuracy: {prunned_accuracy:.2f}%")
    # NOTE(review): this cross-validates on the *test* split, i.e. the model is
    # re-fitted on folds of the test data -- confirm this is intended.
    print("10-fold cross-validation score: ", cross_val_score(decision_tree_prunned, X_test, y_test, cv=10).mean())

if PRINT_FEATURE_IMPORTANCES:
    importances = decision_tree.feature_importances_
    print(importances)
    print(len(importances))
    print(X_train.columns)

    # Cells start as integer 0 so pixels without a matching column (e.g. after
    # one of the column-removal experiments above) print as "0".
    image = [[0] * IMAGE_SIDE for _ in range(IMAGE_SIDE)]
    for column, importance in zip(X_train.columns, importances):
        # Column names are expected to look like "pixel<N>" with N starting
        # at 1; convert to a 0-based index and then to (row, column).
        pixel_idx = int(column.removeprefix("pixel")) - 1
        pixel_y, pixel_x = divmod(pixel_idx, IMAGE_SIDE)
        image[pixel_y][pixel_x] = importance

    for row in image:
        print(",".join(str(value) for value in row))

    # feat_importances = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
    # feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
    # feat_importances.plot(kind='bar', figsize=(8,6))
    # plt.show()