machine-learing-methods/Lab1/4.1/main.py
2023-11-30 20:24:45 +02:00

189 lines
6.5 KiB
Python

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Import seaborn for bar plotting
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix
# 1. Load the dataset (Sign-MNIST style CSVs: a "label" column followed by
#    784 pixel columns that reshape to a 28x28 grayscale image — see the
#    preview/reshape code below).
train_df = pd.read_csv("sign_mnist_train.csv")
test_df = pd.read_csv("sign_mnist_test.csv")
# Disabled experiment: zero out pixels whose intensity is within 5 of 160.
# for df in [train_df, test_df]:
# for column in df.columns[1:]:
# df.loc[abs(df[column] - 160) <= 5, column] = 0
# Visual sanity check (disabled): render 16 test images of label 1
# in a tight 4x4 grid with no spacing between tiles.
if False:
    fig, axes = plt.subplots(4, 4, figsize=(8, 8))
    plt.subplots_adjust(wspace=0, hspace=0)  # butt the tiles against each other
    samples = test_df[test_df["label"] == 1][:16]
    for idx, ax in enumerate(axes.ravel()):
        # Skip the label column; the remaining 784 values form a 28x28 image.
        pixels = np.array(samples.iloc[idx, 1:]).reshape(28, 28)
        ax.imshow(pixels, cmap='gray', aspect='auto')
        ax.axis('off')  # no ticks/frames around each tile
    plt.show()
# Features are every pixel column; targets are the "label" column.
X_train = train_df.drop(columns=["label"])
X_test = test_df.drop(columns=["label"])
y_train = train_df["label"]
y_test = test_df["label"]
# Per-class sample counts (disabled):
# for hand_sign in range(26):
#     count = len(train_df[train_df["label"] == hand_sign])
#     print(f"[{hand_sign}] = {count}")
# 2. Baseline LDA classifier on the raw pixel data (disabled).
if False:
    lda_classifier = LinearDiscriminantAnalysis()
    lda_classifier.fit(X_train, y_train)
    lda_predictions = lda_classifier.predict(X_test)
    lda_confusion_matrix = confusion_matrix(y_test, lda_predictions)
    # Use the already-imported accuracy_score (same value as np.mean(y_test == preds)).
    lda_accuracy = accuracy_score(y_test, lda_predictions)
    # BUG FIX: confusion_matrix orders its rows by sorted(unique(y_test | preds)),
    # while the old list(set(lda_predictions)) had arbitrary order and could miss
    # labels that appear only in y_test — so printed labels could be paired with
    # the wrong per-class accuracy (or raise IndexError). Use the matrix ordering.
    lda_labels = np.unique(np.concatenate([np.asarray(y_test), lda_predictions]))
    # Per-class accuracy = correct predictions on the diagonal / row total.
    lda_class_accuracies = lda_confusion_matrix.diagonal() / lda_confusion_matrix.sum(axis=1)
    print(f"LDA Overall Accuracy: {lda_accuracy*100:.2f}%")
    print("LDA Class-Specific Accuracies:")
    for label, acc in zip(lda_labels, lda_class_accuracies):
        print(f"Class {label}: {acc*100:.2f}%")
def remove_every_nth_column(df, count):
    """Drop every count-th column of df (1-based positions), keeping the rest."""
    keep = [pos for pos in range(len(df.columns)) if (pos + 1) % count != 0]
    return df.iloc[:, keep]
def leave_every_nth_column(df, count):
    """Keep only every count-th column of df (1-based positions)."""
    keep = [pos for pos in range(len(df.columns)) if (pos + 1) % count == 0]
    return df.iloc[:, keep]
def iter_operations(train_df, test_df):
    """Yield (name, reduced_train, reduced_test) for a sweep of column filters.

    Tries remove_every_nth_column for n in 2..19, then leave_every_nth_column
    for the same range, applying each filter to both frames identically.
    """
    ops = (
        ("remove_every_nth_column", remove_every_nth_column),
        ("leave_every_nth_column", leave_every_nth_column),
    )
    for op_name, op in ops:
        for step in range(2, 20):
            yield (f"{op_name}_{step}", op(train_df, step), op(test_df, step))
# 3. Feature reduction for LDA (disabled): apply the best column filters found
#    by the greedy sweep (commented out below), dump the surviving-pixel mask,
#    and report the accuracy on the reduced feature set.
if False:
    def get_overall_accuracy(X_train, X_test, y_train, y_test):
        """Fit LDA on the given feature subset and return its test accuracy."""
        model = LinearDiscriminantAnalysis()
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        return np.mean(y_test == predicted)

    init_X_train = X_train.copy()
    init_X_test = X_test.copy()
    # Filters in the order the greedy search picked them; the accuracy reached
    # after each step is noted alongside.
    reduction_steps = (
        (leave_every_nth_column, 5),    # 0.6002509760178472
        (remove_every_nth_column, 15),  # 0.6167038482989403
        (remove_every_nth_column, 14),  # 0.6179587283881762
        (remove_every_nth_column, 11),  # 0.6183770217512549
    )
    for op, n in reduction_steps:
        init_X_train = op(init_X_train, n)
        init_X_test = op(init_X_test, n)
    print(len(init_X_train.columns))
    # Column names look like "pixelN": strip the 5-char prefix to get the
    # 1-based pixel position within the flattened 28x28 image.
    indices = [int(c[5:]) for c in init_X_train.columns]
    print(indices)
    # Dump a 28x28 0/1 mask of which pixels survived the reduction.
    for y in range(28):
        print("".join("1; " if y * 28 + x + 1 in indices else "0; " for x in range(28)))
    # Greedy sweep that produced the steps above:
    # results = []
    # for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
    #     accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
    #     results.append((name, accuracy))
    print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
    # results.sort(key=lambda e: -e[1])
    # for (name, accuracy) in results[:8]:
    #     print(name, accuracy)
# 4. Baseline QDA classifier on the raw pixel data (disabled).
if False:
    qda_classifier = QuadraticDiscriminantAnalysis()
    qda_classifier.fit(X_train, y_train)
    qda_predictions = qda_classifier.predict(X_test)
    # Use the already-imported accuracy_score (same value as np.mean(y_test == preds)).
    qda_accuracy = accuracy_score(y_test, qda_predictions)
    qda_confusion_matrix = confusion_matrix(y_test, qda_predictions)
    # BUG FIX: row i of the confusion matrix corresponds to the i-th entry of
    # sorted(unique labels), NOT to label value i — the label set need not be a
    # contiguous 0..N-1 range, so "Class {i}" could mislabel classes. Report the
    # actual label values in the same sorted order confusion_matrix uses.
    qda_labels = np.unique(np.concatenate([np.asarray(y_test), qda_predictions]))
    # Per-class accuracy = correct predictions on the diagonal / row total.
    qda_class_accuracies = qda_confusion_matrix.diagonal() / qda_confusion_matrix.sum(axis=1)
    print(f"QDA Overall Accuracy: {qda_accuracy*100:.2f}%")
    print("QDA Class-Specific Accuracies:")
    for label, acc in zip(qda_labels, qda_class_accuracies):
        print(f"Class {label}: {acc*100:.2f}%")
# 5. Feature reduction for QDA (active): same procedure as step 3, but the
#    accuracy metric fits QDA instead of LDA.
if True:
    def get_overall_accuracy(X_train, X_test, y_train, y_test):
        """Fit QDA on the given feature subset and return its test accuracy."""
        model = QuadraticDiscriminantAnalysis()
        model.fit(X_train, y_train)
        predicted = model.predict(X_test)
        return np.mean(y_test == predicted)

    init_X_train = X_train.copy()
    init_X_test = X_test.copy()
    # Filters chosen by the greedy sweep; accuracy after each step noted.
    for op, n in (
        (remove_every_nth_column, 3),   # 0.7632459564974903
        (remove_every_nth_column, 11),  # 0.7717512548800892
    ):
        init_X_train = op(init_X_train, n)
        init_X_test = op(init_X_test, n)
    print(len(init_X_train.columns))
    # (Disabled) surviving-pixel mask dump, as in step 3:
    # indices = [int(c[5:]) for c in init_X_train.columns]
    # for y in range(28):
    #     print("".join("1;" if y * 28 + x + 1 in indices else "0;" for x in range(28)))
    # (Disabled) greedy sweep over further reductions:
    # results = []
    # for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
    #     print(name)
    #     accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
    #     results.append((name, accuracy))
    print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
    # results.sort(key=lambda e: -e[1])
    # for (name, accuracy) in results[:8]:
    #     print(name, accuracy)