"""Sign-language MNIST classification with LDA / QDA.

Loads the Kaggle "Sign Language MNIST" train/test CSVs (one 28x28 grayscale
image per row, first column = class label) and runs a series of exploratory
experiments:

1. visualize a gallery of images for one class,
2. LDA baseline with per-class accuracies,
3. greedy column (pixel) reduction for LDA,
4. QDA baseline with per-class accuracies,
5. greedy column reduction for QDA (the only section enabled by default).
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # imported for bar plotting (kept from original; currently unused)
from sklearn.model_selection import train_test_split  # kept from original; currently unused
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.metrics import accuracy_score, confusion_matrix

# Images are 28x28 pixels; CSV columns are named pixel1..pixel784.
IMG_SIDE = 28

# Section toggles (replace the original `if False:` / `if True:` dead-code guards).
# Only the QDA feature-selection section (5) runs by default, as in the original.
RUN_VISUALIZATION = False          # 1
RUN_LDA = False                    # 2
RUN_LDA_FEATURE_SELECTION = False  # 3
RUN_QDA = False                    # 4
RUN_QDA_FEATURE_SELECTION = True   # 5


def show_label_grid(df, label, grid=4):
    """Render a `grid` x `grid` gallery of images for one class label.

    Args:
        df: dataframe whose first column is "label" and remaining columns are
            the flattened pixel values.
        label: class label to display.
        grid: gallery side length (needs at least grid*grid rows of `label`).
    """
    fig, axes = plt.subplots(grid, grid, figsize=(8, 8))
    plt.subplots_adjust(wspace=0, hspace=0)  # butt the tiles together
    samples = df[df["label"] == label][: grid * grid]
    for i, ax in enumerate(axes.ravel()):
        img = np.array(samples.iloc[i, 1:]).reshape(IMG_SIDE, IMG_SIDE)
        ax.imshow(img, cmap="gray", aspect="auto")
        ax.axis("off")  # hide axis labels
    plt.show()


def report_classifier(tag, classifier, X_train, X_test, y_train, y_test):
    """Fit `classifier`, then print overall and per-class accuracy.

    Args:
        tag: prefix for the printed report lines (e.g. "LDA", "QDA").
        classifier: an unfitted sklearn estimator.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    overall = accuracy_score(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)
    # confusion_matrix rows/cols follow the *sorted union* of labels present in
    # y_test and predictions — the original indexed rows with
    # `set(predictions)`, which is neither sorted nor guaranteed complete and
    # could pair accuracies with the wrong class labels.
    labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(predictions)]))
    print(f"{tag} Overall Accuracy: {overall * 100:.2f}%")
    print(f"{tag} Class-Specific Accuracies:")
    for i, lbl in enumerate(labels):
        row_total = cm[i, :].sum()
        # A label predicted but absent from y_test yields an all-zero row;
        # report NaN instead of dividing by zero.
        class_acc = cm[i, i] / row_total if row_total else float("nan")
        print(f"Class {lbl}: {class_acc * 100:.2f}%")


def remove_every_nth_column(df, count):
    """Return `df` with every `count`-th column (1-based) dropped."""
    return df.loc[:, (np.arange(len(df.columns)) + 1) % count != 0]


def leave_every_nth_column(df, count):
    """Return `df` keeping only every `count`-th column (1-based)."""
    return df.loc[:, (np.arange(len(df.columns)) + 1) % count == 0]


def iter_operations(train_df, test_df):
    """Yield (name, reduced_train, reduced_test) for each candidate reduction.

    Candidates are remove/leave-every-nth-column for n in 2..19, applied
    identically to the train and test feature frames.
    """
    for i in range(2, 20):
        yield (
            f"remove_every_nth_column_{i}",
            remove_every_nth_column(train_df, i),
            remove_every_nth_column(test_df, i),
        )
    for i in range(2, 20):
        yield (
            f"leave_every_nth_column_{i}",
            leave_every_nth_column(train_df, i),
            leave_every_nth_column(test_df, i),
        )


def get_overall_accuracy(classifier_factory, X_train, X_test, y_train, y_test):
    """Fit a fresh classifier and return its test-set accuracy.

    Unifies the two near-identical LDA/QDA helpers from the original script.

    Args:
        classifier_factory: zero-arg callable returning an unfitted estimator
            (e.g. the class itself: `LinearDiscriminantAnalysis`).
    """
    classifier = classifier_factory()
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))


def rank_reductions(classifier_factory, X_train, X_test, y_train, y_test, top=8):
    """Score every candidate column reduction; return the `top` best.

    Replaces the commented-out greedy-search loops from the original script.
    Returns a list of (name, accuracy) sorted by accuracy, descending.
    """
    results = [
        (name, get_overall_accuracy(classifier_factory, rtr, rte, y_train, y_test))
        for name, rtr, rte in iter_operations(X_train, X_test)
    ]
    results.sort(key=lambda e: -e[1])
    return results[:top]


def print_pixel_mask(columns):
    """Print a 28x28 "1; "/"0; " grid marking which pixel columns survive.

    Column names are expected in the pixelN form, N in 1..784 (row-major).
    """
    kept = {int(c[5:]) for c in columns}  # strip the "pixel" prefix
    for y in range(IMG_SIDE):
        for x in range(IMG_SIDE):
            print("1; " if y * IMG_SIDE + x + 1 in kept else "0; ", end="")
        print("")


if __name__ == "__main__":
    # 1. Load the dataset.
    train_df = pd.read_csv("sign_mnist_train.csv")
    test_df = pd.read_csv("sign_mnist_test.csv")

    X_train = train_df.drop(columns=["label"])
    X_test = test_df.drop(columns=["label"])
    y_train, y_test = train_df["label"], test_df["label"]

    if RUN_VISUALIZATION:
        show_label_grid(test_df, label=1)

    # 2. LDA baseline.
    if RUN_LDA:
        report_classifier("LDA", LinearDiscriminantAnalysis(),
                          X_train, X_test, y_train, y_test)

    # 3. Greedy pixel-column reduction for LDA.
    if RUN_LDA_FEATURE_SELECTION:
        Xtr, Xte = X_train.copy(), X_test.copy()
        # Previously found greedy sequence (accuracy after each step):
        #   leave_every_nth_column_5   -> 0.6002509760178472
        #   remove_every_nth_column_15 -> 0.6167038482989403
        #   remove_every_nth_column_14 -> 0.6179587283881762
        #   remove_every_nth_column_11 -> 0.6183770217512549
        Xtr = leave_every_nth_column(Xtr, 5)
        Xte = leave_every_nth_column(Xte, 5)
        for n in (15, 14, 11):
            Xtr = remove_every_nth_column(Xtr, n)
            Xte = remove_every_nth_column(Xte, n)
        print(len(Xtr.columns))
        print([int(c[5:]) for c in Xtr.columns])
        print_pixel_mask(Xtr.columns)
        print(get_overall_accuracy(LinearDiscriminantAnalysis,
                                   Xtr, Xte, y_train, y_test))
        # To continue the greedy search:
        # for name, acc in rank_reductions(LinearDiscriminantAnalysis,
        #                                  Xtr, Xte, y_train, y_test):
        #     print(name, acc)

    # 4. QDA baseline.
    if RUN_QDA:
        report_classifier("QDA", QuadraticDiscriminantAnalysis(),
                          X_train, X_test, y_train, y_test)

    # 5. Greedy pixel-column reduction for QDA (enabled by default).
    if RUN_QDA_FEATURE_SELECTION:
        Xtr, Xte = X_train.copy(), X_test.copy()
        # Previously found greedy sequence (accuracy after each step):
        #   remove_every_nth_column_3  -> 0.7632459564974903
        #   remove_every_nth_column_11 -> 0.7717512548800892
        for n in (3, 11):
            Xtr = remove_every_nth_column(Xtr, n)
            Xte = remove_every_nth_column(Xte, n)
        print(len(Xtr.columns))
        print(get_overall_accuracy(QuadraticDiscriminantAnalysis,
                                   Xtr, Xte, y_train, y_test))
        # To continue the greedy search:
        # for name, acc in rank_reductions(QuadraticDiscriminantAnalysis,
        #                                  Xtr, Xte, y_train, y_test):
        #     print(name, acc)