This commit is contained in:
Rokas Puzonas 2023-11-30 20:24:45 +02:00
commit 61313c1e87
11 changed files with 46108 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
Lab1/Figures/

1561
Lab1/2.1/bike_test.csv Normal file

File diff suppressed because it is too large Load Diff

7201
Lab1/2.1/bike_train.csv Normal file

File diff suppressed because it is too large Load Diff

245
Lab1/2.1/main.py Normal file
View File

@@ -0,0 +1,245 @@
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# NOTE(review): this listing comes from a diff viewer that stripped all
# indentation; the statements following each `if False:` guard originally
# belonged inside that (disabled) block. Restore indentation before running.
train_df = pd.read_csv("bike_train.csv")
test_df = pd.read_csv("bike_test.csv")
# 1. Dataset overview: feature/row counts plus a Temperature-vs-demand
# scatter. Guarded off with `if False:` once the exploratory step was done.
if False:
print("Feature count:", len(train_df.columns))
print("Training set size: ", len(train_df.index))
print("Test set size: ", len(test_df.index))
plt.scatter(train_df['Temperature'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
plt.xlabel('Temperature')
plt.ylabel('Rented_Bike_Count')
plt.title('Temperature vs. Rented_Bike_Count')
plt.grid()
plt.show()
# 2. Same scatter inspection for Visibility and Rainfall vs. rental count.
if False:
plt.scatter(train_df['Visibility'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
plt.xlabel('Visibility')
plt.ylabel('Rented_Bike_Count')
plt.title('Visibility vs. Rented_Bike_Count')
plt.grid()
plt.show()
plt.scatter(train_df['Rainfall'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
plt.xlabel('Rainfall')
plt.ylabel('Rented_Bike_Count')
plt.title('Rainfall vs. Rented_Bike_Count')
plt.grid()
plt.show()
# Drop the raw date string; it is not used as a model feature below.
train_df = train_df.drop(columns=['Date'])
test_df = test_df.drop(columns=['Date'])
# 3. Correlation heatmap of the numeric columns, then a closer look at the
# Temperature / Dew_point_temperature pair (disabled exploratory step).
if False:
sns.heatmap(train_df.drop(columns=["Functioning_Day", "Holiday", "Seasons"]).corr(), annot=True)
plt.show()
# Fit on the correlated pair; only the scatter below is inspected here.
lm_fit = LinearRegression().fit(train_df[['Temperature', 'Dew_point_temperature']], train_df['Rented_Bike_Count'])
plt.scatter(train_df['Temperature'], train_df['Dew_point_temperature'], color='blue', alpha=0.5)
plt.xlabel('Temperature')
plt.ylabel('Dew_point_temperature')
plt.title('Temperature vs. Dew_point_temperature')
plt.grid()
plt.show()
# plt.scatter(train_df['Visibility'], train_df['Humidity'], color='blue', alpha=0.5)
# plt.xlabel('Visibility')
# plt.ylabel('Humidity')
# plt.title('Visibility vs. Humidity')
# plt.grid()
# plt.show()
# One-hot encode the categorical columns; drop_first avoids the
# dummy-variable trap for the linear models used below.
train_df = pd.get_dummies(data=train_df, drop_first=True)
test_df = pd.get_dummies(data=test_df, drop_first=True)
# 4. Baseline linear regression on all features; reports R^2 and RMSE on the
# test set and plots predicted vs. true counts (disabled).
if False:
X_train = train_df.drop(columns=['Rented_Bike_Count'])
X_test = test_df.drop(columns=['Rented_Bike_Count'])
lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
# plt.scatter(train_df['Rented_Bike_Count'], lm_fit.predict(train_df.drop(columns=['Rented_Bike_Count'])), color='blue', alpha=0.5, s=3)
# plt.xlabel('True Rented_Bike_Count values (Training data)')
# plt.ylabel('Predicted Rented_Bike_Count values')
# plt.ylim(-1000, 2000)
# plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
# plt.title('Predicted vs. True Rented_Bike_Count')
# plt.grid()
# plt.show()
plt.scatter(test_df['Rented_Bike_Count'], lm_fit.predict(X_test), color='blue', alpha=0.5, s=3)
plt.xlabel('True Rented_Bike_Count')
plt.ylabel('Predicted Rented_Bike_Count')
#plt.ylim(-1000, 2000)
# Reference diagonal: perfect predictions would fall on this line.
plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
plt.title('Predicted vs. True Rented_Bike_Count')
plt.grid()
plt.show()
y_pred = lm_fit.predict(test_df.drop(columns=['Rented_Bike_Count']))
r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
print(f'R-squared: {r_squared}')
print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))
# From here on the target is modelled on a log scale (log1p-style); this
# mutation applies to every section below, not just section 5.
train_df[f"Rented_Bike_Count"]=np.log(train_df["Rented_Bike_Count"] + 1)
test_df[f"Rented_Bike_Count"]=np.log(test_df["Rented_Bike_Count"] + 1)
# 5. Linear regression against the log-transformed target (applied above).
if False:
X_train = train_df.drop(columns=['Rented_Bike_Count'])
X_test = test_df.drop(columns=['Rented_Bike_Count'])
# NOTE(review): y_train/y_test below apply np.log a second time to the
# already-logged target and are never used -- lm_fit trains directly on
# train_df['Rented_Bike_Count']. Likely leftovers; verify before reuse.
y_train = np.log(train_df["Rented_Bike_Count"] + 1)
y_test = np.log(test_df["Rented_Bike_Count"] + 1)
lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
y_pred = lm_fit.predict(X_test)
r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
print(f'R-squared: {r_squared}')
print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))
plt.scatter(test_df['Rented_Bike_Count'], y_pred, color='blue', alpha=0.5, s=3)
plt.xlabel('True log(Rented_Bike_Count+1)')
plt.ylabel('Predicted log(Rented_Bike_Count+1)')
#plt.ylim(-2.5, 10)
plt.plot([4, 7.5], [4, 7.5], color='red', linewidth=2)
plt.title('Predicted vs. True log(Rented_Bike_Count+1)')
plt.grid()
plt.show()
def append_plus_column(dataframe: DataFrame, column_a, column_b):
    """Append a "<a>_plus_<b>" column holding the element-wise sum of
    *column_a* and *column_b*. Mutates *dataframe* in place.

    Vectorized column arithmetic replaces the original row-wise
    ``DataFrame.apply`` -- identical values, dramatically faster on large
    frames since the loop runs in C instead of Python.
    """
    dataframe[f"{column_a}_plus_{column_b}"] = dataframe[column_a] + dataframe[column_b]
def append_plus_mul_column(dataframe: DataFrame, column_a, column_b):
    """Append a "<a>_plus_mul_<b>" column holding a + b + a*b element-wise.
    Mutates *dataframe* in place.

    Vectorized arithmetic replaces the original row-wise ``DataFrame.apply``
    -- same values, far faster (no per-row Python callback).
    """
    a = dataframe[column_a]
    b = dataframe[column_b]
    dataframe[f"{column_a}_plus_mul_{column_b}"] = a + b + a * b
def append_log_column(dataframe: DataFrame, column):
    """Add a "<column>_log" column with the natural log of *column* (in place)."""
    values = dataframe[column]
    dataframe[column + "_log"] = np.log(values)
def append_sqrt_column(dataframe: DataFrame, column):
    """Add a "<column>_sqrt" column with the square root of *column* (in place)."""
    values = dataframe[column]
    dataframe[column + "_sqrt"] = np.sqrt(values)
def append_square_column(dataframe: DataFrame, column):
    """Add a "<column>_square" column with *column* squared (in place)."""
    values = dataframe[column]
    dataframe[column + "_square"] = np.square(values)
def iter_transformations(dataframe: DataFrame):
    """Yield (name, transform) pairs for every applicable single-column
    transformation (log / sqrt / square) of every column in *dataframe*.

    Each transform is a callable that takes a DataFrame and appends the
    transformed column in place. log is skipped for columns containing 0,
    sqrt for columns containing negatives; square is always offered.
    """
    for name in dataframe.columns:
        column = dataframe[name]
        contains_zero = bool((column == 0).any())
        contains_negative = bool((column < 0).any())
        # Bug fix: the original lambdas captured the loop variable `name`
        # late, so any callable invoked after the generator advanced would
        # have operated on a *later* column. Binding `name` as a default
        # argument freezes the current value for each yielded lambda.
        # NOTE(review): the log guard only excludes zeros; negative values
        # would still produce NaN under np.log -- confirm intended.
        if not contains_zero:
            yield (f"{name}_log", lambda df, name=name: append_log_column(df, name))
        if not contains_negative:
            yield (f"{name}_sqrt", lambda df, name=name: append_sqrt_column(df, name))
        yield (f"{name}_square", lambda df, name=name: append_square_column(df, name))
def calc_r2_squared(transform_func):
    """Fit a linear model on (optionally transformed) features and return the
    R^2 score on the test set.

    *transform_func* receives each feature frame and may append columns in
    place. The module-level train_df/test_df stay untouched because dropping
    the target column produces fresh frames first.
    """
    features_train = train_df.drop(columns=['Rented_Bike_Count'])
    features_test = test_df.drop(columns=['Rented_Bike_Count'])
    target_train = train_df['Rented_Bike_Count']
    target_test = test_df['Rented_Bike_Count']
    for frame in (features_train, features_test):
        transform_func(frame)
    model = LinearRegression().fit(features_train, target_train)
    predictions = model.predict(features_test)
    return r2_score(target_test, predictions)
# 6. Pairwise feature-combination search: for every pair of columns, compare
# the R^2 gained by appending a+b versus a+b+a*b as an extra feature.
if False:
init_X_train = train_df.drop(columns=['Rented_Bike_Count'])
init_X_test = test_df.drop(columns=['Rented_Bike_Count'])
init_y_train = train_df['Rented_Bike_Count']
init_y_test = test_df['Rented_Bike_Count']
results = []
columns = train_df.drop(columns=['Rented_Bike_Count']).columns
print(len(columns))
for column_a_idx in range(len(columns)):
column_a = columns[column_a_idx]
for column_b_idx in range(column_a_idx+1, len(columns)):
column_b = columns[column_b_idx]
# The lambdas are consumed immediately by calc_r2_squared, so capturing
# the loop variables here is safe (no late-binding hazard in this use).
r2_plus = calc_r2_squared(lambda df: append_plus_column(df, column_a, column_b))
r2_plus_mul = calc_r2_squared(lambda df: append_plus_mul_column(df, column_a, column_b))
results.append((column_a, column_b, r2_plus, r2_plus_mul))
# Ascending by (plus - plus_mul): pairs where the interaction term helps
# the most come first.
results.sort(key=lambda e: e[2]-e[3])
for (column_a, column_b, r2_plus, r2_plus_mul) in results[:10]:
print(column_a, column_b, r2_plus, r2_plus_mul, r2_plus_mul - r2_plus)
# for df in [X_train, X_test]:
#append_plus_mul_column(df, "Humidity", "Visibility")
#append_plus_column(df, "Humidity", "Visibility")
#append_plus_mul_column(df, "Dew_point_temperature", "Rainfall")
#append_plus_column(df, "Dew_point_temperature", "Rainfall")
# pass
# lm_fit = LinearRegression().fit(X_train, y_train)
# y_pred = lm_fit.predict(X_test)
# r_squared = r2_score(y_test, y_pred)
# print(f'R-squared: {r_squared}')
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
# plt.xlabel('True Rented_Bike_Count values')
# plt.ylabel('Predicted Rented_Bike_Count values')
# plt.ylim(-7.5, 10)
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
# plt.title('Predicted vs. True Rented_Bike_Count')
# plt.grid()
# plt.show()
# 7. Single-column transformation search: for each candidate transform from
# iter_transformations, compare the baseline R^2 (identity transform)
# against the R^2 with the transformed column appended.
if True:
r2_list = []
for (transform_name, transform_func) in iter_transformations(train_df.drop(columns=['Rented_Bike_Count'])):
# `lambda _: _` is a no-op transform, giving the baseline score.
r2_list.append((transform_name, calc_r2_squared(lambda _: _), calc_r2_squared(transform_func)))
# Ascending by (baseline - transformed): biggest improvements first.
r2_list.sort(key=lambda e: e[1]-e[2])
for a in r2_list[:10]:
print(f"{a[0]:30} {a[1]:.6f} {a[2]:.6f} {a[2]-a[1]:.6f}")
# X_train = train_df.drop(columns=['Rented_Bike_Count'])
# X_test = test_df.drop(columns=['Rented_Bike_Count'])
# y_train = train_df['Rented_Bike_Count']
# y_test = test_df['Rented_Bike_Count']
# for df in [X_train, X_test]:
# append_sqrt_column(df, "Rainfall")
# append_sqrt_column(df, "Humidity")
# append_square_column(df, "Visibility")
# lm_fit = LinearRegression().fit(X_train, y_train)
# y_pred = lm_fit.predict(X_test)
# r_squared = r2_score(y_test, y_pred)
# print(f'R-squared: {r_squared}')
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
# plt.xlabel('True Rented_Bike_Count values')
# plt.ylabel('Predicted Rented_Bike_Count values')
# plt.ylim(-7.5, 10)
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
# plt.title('Predicted vs. True Rented_Bike_Count')
# plt.grid()
# plt.show()

264
Lab1/3.1/main.py Normal file
View File

@@ -0,0 +1,264 @@
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np
import seaborn as sns
from scipy.stats import boxcox
def normalize_column(column):
    """Min-max scale *column* into the [0, 1] range."""
    lo, hi = min(column), max(column)
    return (column - lo) / (hi - lo)
# NOTE(review): indentation was stripped by the diff viewer; statements after
# each `if False:` / `for` originally sat inside those blocks.
# 1. Load the Spotify audio-features dataset.
main_df = pd.read_csv("music_spotify.csv")
# https://developer.spotify.com/documentation/web-api/reference/get-audio-features
#print(len(main_df.columns))
# 2. Density plots of selected features split by the binary `target` class
# (disabled exploratory step). duration_ms gets a normalized copy so its
# scale is comparable.
if False:
main_df["duration_ms_01"] = normalize_column(main_df["duration_ms"])
for column_name in ["acousticness", "danceability", "energy", "instrumentalness", "duration_ms"]:
main_df[main_df["target"] == 0][column_name].plot.density(color="green", label="0")
main_df[main_df["target"] == 1][column_name].plot.density(color="red", label="1")
plt.title(f"Density plot ({column_name} vs target)")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
plt.xlabel(column_name)
plt.grid()
plt.show()
# 3. Density plots for the remaining continuous features, split by target.
if False:
# main_df["loudness_01"] = normalize_column(main_df["loudness"])
# main_df["tempo_01"] = normalize_column(main_df["tempo"])
for column_name in ["liveness", "loudness", "speechiness", "tempo", "valence"]:
main_df[main_df["target"] == 0][column_name].plot.density(color="green", label="0")
main_df[main_df["target"] == 1][column_name].plot.density(color="red", label="1")
plt.title(f"Density plot ({column_name} vs target)")
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
plt.xlabel(column_name)
plt.grid()
plt.show()
# 4. Stacked bar charts of occurrence counts per category, split by target,
# for the discrete features.
if False:
for column_name in ["key", "mode", "time_signature"]:
df_new = main_df[[column_name, "target"]].pivot_table(columns=column_name, index="target", aggfunc=len, fill_value=0).T#.apply(lambda x: x/sum(x), axis=1)
ax = df_new.plot.bar(stacked=True)
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
plt.style.use('ggplot')
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
plt.ylabel("occurrences")
plt.title(f"Bar graph ({column_name} vs target)")
plt.show()
# 5.
# Explain parts 1-4 in report
# 6. Hold out 30% of the rows; drop identifier columns from the features.
# (Random split -- results vary run to run; no random_state is set.)
X_train, X_test, y_train, y_test = train_test_split(main_df.drop(columns=["target", "X", "song_title", "artist"]), main_df["target"], test_size=0.3)
# Baseline logistic regression on standardized features with a confusion
# matrix and the fitted coefficients (disabled).
if False:
# Initialize the scaler
scaler = StandardScaler()
# Fit the scaler and transform the features
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
# Now use the scaled features to train your logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
preds = model.predict(X_test_scaled)
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
print(classification_report(y_test, np.where(preds_probs > 0.5, 1, 0)))
conf_matrix = confusion_matrix(y_test, preds)
print("----- Confusion matrix ---------")
print("       Predicted 0, Predicted 1")
print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
print("---------- Coeffs --------------")
print(model.coef_[0])
# Use sigmoid for equation
# 7. Same logistic model, but sweep the decision threshold and report a
# classification report + confusion matrix per threshold (disabled).
if False:
# Initialize the scaler
scaler = StandardScaler()
# Fit the scaler and transform the features
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
# Now use the scaled features to train your logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
for threshold in [0.3, 0.4, 0.5]:
preds = np.where(preds_probs > threshold, 1, 0)
print(f"=========== THRESHOLD: {threshold} ===========")
print(classification_report(y_test, preds))
conf_matrix = confusion_matrix(y_test, preds)
print("----- Confusion matrix ---------")
print("       Predicted 0, Predicted 1")
print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
# 8. Feature-ablation study: over several random re-splits, measure how
# dropping each single column shifts accuracy relative to the full model,
# then report the mean shift per column (disabled).
if False:
# Train a scaled logistic model and return its test-set accuracy.
def get_accuracy(X_train, X_test, y_train, y_test):
# Initialize the scaler
scaler = StandardScaler()
# Fit the scaler and transform the features
scaler.fit(X_train)
X_train_scaled=scaler.transform(X_train)
X_test_scaled=scaler.transform(X_test)
# Now use the scaled features to train your logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
preds = model.predict(X_test_scaled)
#return accuracy_score(y_test, preds)
return np.mean(preds == y_test)
iterations = 50
diffs = {}
for column_name in X_train.columns:
diffs[column_name] = 0
for _ in range(iterations):
# Fresh random split per iteration to average out split noise.
X_train, X_test, y_train, y_test = train_test_split(main_df.drop(columns=["target", "X", "song_title", "artist"]), main_df["target"], test_size=0.3)
baseline_accuracy = get_accuracy(X_train, X_test, y_train, y_test)
for column_name in X_train.columns:
accuracy = get_accuracy(X_train.drop(columns=[column_name]), X_test.drop(columns=[column_name]), y_train, y_test)
diffs[column_name] += (accuracy - baseline_accuracy)
#diffs.append((column_name, accuracy - baseline_accuracy))
diffs_array = list(diffs.items())
# Descending: columns whose removal *helped* accuracy the most first.
diffs_array.sort(key=lambda e: -e[1])
for (column_name, diff) in diffs_array:
print(f"{column_name:20} {diff / iterations}")
# 9. Feature-transformation sweep. Each `transformations` entry maps a name
# to a (transform, applicability-check) pair: the check decides whether the
# transform is valid for a given column's values.
if True:
# plt.figure(figsize=(8, 6))
# sns.heatmap(main_df.drop(columns=["X", "song_title", "artist"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
# plt.title("Correlation Matrix")
# plt.show()
# main_df["duration_ms"] = normalize_column(main_df["duration_ms"])
# main_df["loudness"] = normalize_column(main_df["loudness"])
# main_df["tempo"] = normalize_column(main_df["tempo"])
# main_df.drop(columns=["X"]).boxplot()
# plt.show()
transformations = {
# log shifted by 0.1; only for strictly positive columns.
"log": (
lambda column: np.log(column + 0.1),
lambda column: all(v > 0 for v in column)
),
"square": (
lambda column: np.square(column),
lambda column: True
),
"sqrt": (
lambda column: np.sqrt(column),
lambda column: all(v >= 0 for v in column)
),
"sqrt3": (
lambda column: np.float_power(column, 1/3),
lambda column: all(v >= 0 for v in column)
),
# exp guarded against overflow on large-valued columns.
"exp": (
lambda column: np.exp(column),
lambda column: all(v <= 1000 for v in column)
)
}
# For every (column, transform) combination that passes its applicability
# check: append the transformed column, refit a scaled logistic model, and
# record overall/per-class accuracy at three thresholds.
results = []
for column_name in X_train.columns:
for (transform_name, (transform, criteria)) in transformations.items():
if not criteria(main_df[column_name]): continue
new_X_train = X_train.copy()
new_X_test = X_test.copy()
new_X_train[f"{column_name}_{transform_name}"] = transform(X_train[column_name])
new_X_test[f"{column_name}_{transform_name}"] = transform(X_test[column_name])
scaler = StandardScaler()
scaler.fit(new_X_train)
X_train_scaled=scaler.transform(new_X_train)
X_test_scaled=scaler.transform(new_X_test)
model = LogisticRegression()
model.fit(X_train_scaled, y_train)
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
result_entry = {
"column": column_name,
"transform": transform_name,
"thresholds": {},
}
results.append(result_entry)
for threshold in [0.3, 0.5, 0.7]:
preds = np.where(preds_probs > threshold, 1, 0)
confusion = confusion_matrix(y_test, preds)
# Per-class accuracy = diagonal / row sums of the confusion matrix.
diag_values = np.diag(confusion)
row_sums = np.sum(confusion, axis=1)
percentage_accuracy = (diag_values / row_sums)
result_entry["thresholds"][threshold] = {
"overall_accuracy": np.mean(y_test == preds),
"class0_accuracy": percentage_accuracy[0],
"class1_accuracy": percentage_accuracy[1],
}
# Rank entries by their best overall accuracy across thresholds, descending.
results.sort(key=lambda e: -max(a["overall_accuracy"] for a in e["thresholds"].values()))
best_result = results[0]
for result in results[:5]:
print("------")
print("column", result["column"], result["transform"])
for threshold in result["thresholds"].keys():
print(f'[{threshold}] class0_accuracy {result["thresholds"][threshold]["class0_accuracy"]:.6f}', )
print(f'[{threshold}] class1_accuracy {result["thresholds"][threshold]["class1_accuracy"]:.6f}')
print(f'[{threshold}] overall_accuracy {result["thresholds"][threshold]["overall_accuracy"]:.6f}')
#print("error_rate", result["error_rate"])
# print("======================")
# print("overall_accuracy", best_result["overall_accuracy"])
# print("class0_accuracy", best_result["class0_accuracy"])
# print("class1_accuracy", best_result["class1_accuracy"])
# print("error_rate", best_result["error_rate"])
# print("threshold", best_result["threshold"])
# print("column", best_result["column"], best_result["transform"])
# Calculate ROC curve and AUC
# fpr, tpr, thresholds = roc_curve(y_test, best_result["probs"])
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve
# plt.figure(figsize=(5,5))
# plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc='lower right')
# plt.show()
# # Print AUC value
# print("AUC:", roc_auc)

2018
Lab1/3.1/music_spotify.csv Normal file

File diff suppressed because it is too large Load Diff

189
Lab1/4.1/main.py Normal file
View File

@@ -0,0 +1,189 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns # Import seaborn for bar plotting
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import accuracy_score, confusion_matrix
# NOTE(review): indentation was stripped by the diff viewer; statements after
# each `if False:` / `for` originally sat inside those blocks.
# 1. Load the dataset
train_df = pd.read_csv("sign_mnist_train.csv")
test_df = pd.read_csv("sign_mnist_test.csv")
# for df in [train_df, test_df]:
# for column in df.columns[1:]:
# df.loc[abs(df[column] - 160) <= 5, column] = 0
# Show a 4x4 grid of sample 28x28 images for label 1 (disabled).
if False:
# Create a 4x4 grid of subplots
fig, axes = plt.subplots(4, 4, figsize=(8, 8))
plt.subplots_adjust(wspace=0, hspace=0)  # Adjust spacing
df_by_label = test_df[test_df["label"] == 1][:16]
indices = list(range(16))
for i, ax in zip(indices, axes.ravel()):
hand_sign = np.array(df_by_label.iloc[i, 1:])
hand_sign = hand_sign.reshape(28, 28)
ax.imshow(hand_sign, cmap='gray', aspect='auto')
ax.axis('off')  # Hide axis labels
plt.show()
# Split pixel columns from the label column for both sets.
X_train, X_test = train_df.drop(columns=["label"]), test_df.drop(columns=["label"])
y_train, y_test = train_df["label"], test_df["label"]
# for hand_sign in range(26):
# count = len(train_df[train_df["label"] == hand_sign])
# print(f"[{hand_sign}] = {count}")
# 2. LDA baseline: overall accuracy plus per-class accuracy from the
# confusion-matrix diagonal (disabled).
if False:
lda_classifier = LinearDiscriminantAnalysis()
lda_classifier.fit(X_train, y_train)
lda_predictions = lda_classifier.predict(X_test)
lda_confusion_matrix = confusion_matrix(y_test, lda_predictions)
lda_accuracy = np.mean(y_test == lda_predictions)
# NOTE(review): `set(...)` has no guaranteed order, while confusion-matrix
# rows follow the sorted label order -- the set_of_signs[i] <-> row i
# pairing printed below may mislabel classes. Verify before trusting it.
set_of_signs = list(set(lda_predictions))
lda_class_accuracies = [lda_confusion_matrix[i, i] / np.sum(lda_confusion_matrix[i, :]) for i in range(len(lda_confusion_matrix))]
print(f"LDA Overall Accuracy: {lda_accuracy*100:.2f}")
print("LDA Class-Specific Accuracies:")
for i, acc in enumerate(lda_class_accuracies):
print(f"Class {set_of_signs[i]}: {acc*100:.2f}")
def remove_every_nth_column(df, count):
    """Return *df* without every *count*-th column (positions counted from 1)."""
    positions = np.arange(len(df.columns)) + 1
    keep_mask = positions % count != 0
    return df.loc[:, keep_mask]
def leave_every_nth_column(df, count):
    """Return only every *count*-th column of *df* (positions counted from 1)."""
    positions = np.arange(len(df.columns)) + 1
    return df.loc[:, positions % count == 0]
def iter_operations(train_df, test_df):
    """Yield (name, reduced_train, reduced_test) triples for a sweep of
    column-subsampling operations: first dropping, then keeping, every
    n-th column for n in 2..19."""
    operations = (
        ("remove_every_nth_column", remove_every_nth_column),
        ("leave_every_nth_column", leave_every_nth_column),
    )
    for label, op in operations:
        for n in range(2, 20):
            yield (f"{label}_{n}", op(train_df, n), op(test_df, n))
# 3. Greedy column-reduction for LDA: apply the subsampling steps that won
# earlier sweeps, print which pixels survive as a 28x28 0/1 mask, and report
# the resulting accuracy (disabled).
if False:
# Fit LDA on the given feature subset and return test-set accuracy.
def get_overall_accuracy(X_train, X_test, y_train, y_test):
lda_classifier = LinearDiscriminantAnalysis()
lda_classifier.fit(X_train, y_train)
lda_predictions = lda_classifier.predict(X_test)
lda_accuracy = np.mean(y_test == lda_predictions)
return lda_accuracy
init_X_train = X_train.copy()
init_X_test = X_test.copy()
# ('leave_every_nth_column_5', 0.6002509760178472)
init_X_train = leave_every_nth_column(init_X_train, 5)
init_X_test = leave_every_nth_column(init_X_test, 5)
# ('remove_every_nth_column_15', 0.6167038482989403)
init_X_train = remove_every_nth_column(init_X_train, 15)
init_X_test = remove_every_nth_column(init_X_test, 15)
# ('remove_every_nth_column_14', 0.6179587283881762)
init_X_train = remove_every_nth_column(init_X_train, 14)
init_X_test = remove_every_nth_column(init_X_test, 14)
# ('remove_every_nth_column_11', 0.6183770217512549)
init_X_train = remove_every_nth_column(init_X_train, 11)
init_X_test = remove_every_nth_column(init_X_test, 11)
print(len(init_X_train.columns))
# Column names look like "pixelNNN"; c[5:] extracts the pixel number.
indices = list(int(c[5:]) for c in init_X_train.columns)
print(indices)
for y in range(28):
for x in range(28):
if y*28+x+1 in indices:
print("1; ", end='')
else:
print("0; ", end='')
print("")
# results = []
# for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
# accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
# results.append((name, accuracy))
print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
# results.sort(key=lambda e: -e[1])
# # print(results[0])
# for (name, accuracy) in results[:8]:
# print(name, accuracy)
# 4. QDA baseline on the full feature set: overall and per-class accuracy
# (disabled).
if False:
qda_classifier = QuadraticDiscriminantAnalysis()
qda_classifier.fit(X_train, y_train)
qda_predictions = qda_classifier.predict(X_test)
qda_accuracy = np.mean(y_test == qda_predictions)
qda_confusion_matrix = confusion_matrix(y_test, qda_predictions)
# Per-class accuracy = diagonal / row sums of the confusion matrix.
qda_class_accuracies = [qda_confusion_matrix[i, i] / np.sum(qda_confusion_matrix[i, :]) for i in range(len(qda_confusion_matrix))]
print(f"QDA Overall Accuracy: {qda_accuracy*100:.2f}%")
print("QDA Class-Specific Accuracies:")
for i, acc in enumerate(qda_class_accuracies):
print(f"Class {i}: {acc*100:.2f}%")
# 5. Greedy column-reduction for QDA, mirroring section 3: apply the winning
# subsampling steps and report the resulting accuracy.
if True:
# Fit QDA on the given feature subset and return test-set accuracy.
def get_overall_accuracy(X_train, X_test, y_train, y_test):
qda_classifier = QuadraticDiscriminantAnalysis()
qda_classifier.fit(X_train, y_train)
qda_predictions = qda_classifier.predict(X_test)
qda_accuracy = np.mean(y_test == qda_predictions)
return qda_accuracy
init_X_train = X_train.copy()
init_X_test = X_test.copy()
# remove_every_nth_column_3 0.7632459564974903
init_X_train = remove_every_nth_column(init_X_train, 3)
init_X_test = remove_every_nth_column(init_X_test, 3)
# remove_every_nth_column_11 0.7717512548800892
init_X_train = remove_every_nth_column(init_X_train, 11)
init_X_test = remove_every_nth_column(init_X_test, 11)
print(len(init_X_train.columns))
# indices = list(int(c[5:]) for c in init_X_train.columns)
# for y in range(28):
# for x in range(28):
# if y*28+x+1 in indices:
# print("1;", end='')
# else:
# print("0;", end='')
# print("")
# results = []
# for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
# print(name)
# accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
# results.append((name, accuracy))
print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
# results.sort(key=lambda e: -e[1])
# for (name, accuracy) in results[:8]:
# print(name, accuracy)

7173
Lab1/4.1/sign_mnist_test.csv Normal file

File diff suppressed because one or more lines are too long

27456
Lab1/4.1/sign_mnist_train.csv Normal file

File diff suppressed because one or more lines are too long

BIN
Lab1/Extra.xlsx Normal file

Binary file not shown.

Binary file not shown.