add lab1
This commit is contained in:
commit
61313c1e87
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
Lab1/Figures/
|
||||
1561
Lab1/2.1/bike_test.csv
Normal file
1561
Lab1/2.1/bike_test.csv
Normal file
File diff suppressed because it is too large
Load Diff
7201
Lab1/2.1/bike_train.csv
Normal file
7201
Lab1/2.1/bike_train.csv
Normal file
File diff suppressed because it is too large
Load Diff
245
Lab1/2.1/main.py
Normal file
245
Lab1/2.1/main.py
Normal file
@ -0,0 +1,245 @@
|
||||
import pandas as pd
|
||||
from pandas import DataFrame
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
from sklearn.linear_model import LinearRegression
|
||||
from sklearn.metrics import r2_score
|
||||
|
||||
train_df = pd.read_csv("bike_train.csv")
|
||||
test_df = pd.read_csv("bike_test.csv")
|
||||
|
||||
# 1.
|
||||
if False:
|
||||
print("Feature count:", len(train_df.columns))
|
||||
print("Training set size: ", len(train_df.index))
|
||||
print("Test set size: ", len(test_df.index))
|
||||
plt.scatter(train_df['Temperature'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
|
||||
plt.xlabel('Temperature')
|
||||
plt.ylabel('Rented_Bike_Count')
|
||||
plt.title('Temperature vs. Rented_Bike_Count')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
# 2.
|
||||
if False:
|
||||
plt.scatter(train_df['Visibility'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
|
||||
plt.xlabel('Visibility')
|
||||
plt.ylabel('Rented_Bike_Count')
|
||||
plt.title('Visibility vs. Rented_Bike_Count')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
plt.scatter(train_df['Rainfall'], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
|
||||
plt.xlabel('Rainfall')
|
||||
plt.ylabel('Rented_Bike_Count')
|
||||
plt.title('Rainfall vs. Rented_Bike_Count')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
train_df = train_df.drop(columns=['Date'])
|
||||
test_df = test_df.drop(columns=['Date'])
|
||||
|
||||
# 3.
|
||||
if False:
|
||||
sns.heatmap(train_df.drop(columns=["Functioning_Day", "Holiday", "Seasons"]).corr(), annot=True)
|
||||
plt.show()
|
||||
|
||||
lm_fit = LinearRegression().fit(train_df[['Temperature', 'Dew_point_temperature']], train_df['Rented_Bike_Count'])
|
||||
|
||||
plt.scatter(train_df['Temperature'], train_df['Dew_point_temperature'], color='blue', alpha=0.5)
|
||||
plt.xlabel('Temperature')
|
||||
plt.ylabel('Dew_point_temperature')
|
||||
plt.title('Temperature vs. Dew_point_temperature')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
# plt.scatter(train_df['Visibility'], train_df['Humidity'], color='blue', alpha=0.5)
|
||||
# plt.xlabel('Visibility')
|
||||
# plt.ylabel('Humidity')
|
||||
# plt.title('Visibility vs. Humidity')
|
||||
# plt.grid()
|
||||
# plt.show()
|
||||
|
||||
train_df = pd.get_dummies(data=train_df, drop_first=True)
|
||||
test_df = pd.get_dummies(data=test_df, drop_first=True)
|
||||
|
||||
# 4.
|
||||
if False:
|
||||
X_train = train_df.drop(columns=['Rented_Bike_Count'])
|
||||
X_test = test_df.drop(columns=['Rented_Bike_Count'])
|
||||
lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
|
||||
|
||||
# plt.scatter(train_df['Rented_Bike_Count'], lm_fit.predict(train_df.drop(columns=['Rented_Bike_Count'])), color='blue', alpha=0.5, s=3)
|
||||
# plt.xlabel('True Rented_Bike_Count values (Training data)')
|
||||
# plt.ylabel('Predicted Rented_Bike_Count values')
|
||||
# plt.ylim(-1000, 2000)
|
||||
# plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
|
||||
# plt.title('Predicted vs. True Rented_Bike_Count')
|
||||
# plt.grid()
|
||||
# plt.show()
|
||||
|
||||
plt.scatter(test_df['Rented_Bike_Count'], lm_fit.predict(X_test), color='blue', alpha=0.5, s=3)
|
||||
plt.xlabel('True Rented_Bike_Count')
|
||||
plt.ylabel('Predicted Rented_Bike_Count')
|
||||
#plt.ylim(-1000, 2000)
|
||||
plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
|
||||
plt.title('Predicted vs. True Rented_Bike_Count')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
y_pred = lm_fit.predict(test_df.drop(columns=['Rented_Bike_Count']))
|
||||
r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
|
||||
print(f'R-squared: {r_squared}')
|
||||
print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))
|
||||
|
||||
|
||||
train_df[f"Rented_Bike_Count"]=np.log(train_df["Rented_Bike_Count"] + 1)
|
||||
test_df[f"Rented_Bike_Count"]=np.log(test_df["Rented_Bike_Count"] + 1)
|
||||
|
||||
# 5.
|
||||
if False:
|
||||
X_train = train_df.drop(columns=['Rented_Bike_Count'])
|
||||
X_test = test_df.drop(columns=['Rented_Bike_Count'])
|
||||
y_train = np.log(train_df["Rented_Bike_Count"] + 1)
|
||||
y_test = np.log(test_df["Rented_Bike_Count"] + 1)
|
||||
|
||||
lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
|
||||
|
||||
y_pred = lm_fit.predict(X_test)
|
||||
r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
|
||||
print(f'R-squared: {r_squared}')
|
||||
print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))
|
||||
|
||||
plt.scatter(test_df['Rented_Bike_Count'], y_pred, color='blue', alpha=0.5, s=3)
|
||||
plt.xlabel('True log(Rented_Bike_Count+1)')
|
||||
plt.ylabel('Predicted log(Rented_Bike_Count+1)')
|
||||
#plt.ylim(-2.5, 10)
|
||||
plt.plot([4, 7.5], [4, 7.5], color='red', linewidth=2)
|
||||
plt.title('Predicted vs. True log(Rented_Bike_Count+1)')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
def append_plus_column(dataframe: DataFrame, column_a, column_b):
    """Append the column "{column_a}_plus_{column_b}" holding the element-wise
    sum of the two named columns. Mutates *dataframe* in place; returns None.
    """
    # Vectorized column addition replaces the original row-wise
    # .apply(..., axis=1), which loops in the Python interpreter per row.
    dataframe[f"{column_a}_plus_{column_b}"] = dataframe[column_a] + dataframe[column_b]
|
||||
def append_plus_mul_column(dataframe: DataFrame, column_a, column_b):
    """Append the column "{column_a}_plus_mul_{column_b}" holding
    a + b + a*b for the two named columns, element-wise. Mutates
    *dataframe* in place; returns None.
    """
    # Vectorized arithmetic replaces the original row-wise .apply(..., axis=1).
    a = dataframe[column_a]
    b = dataframe[column_b]
    dataframe[f"{column_a}_plus_mul_{column_b}"] = a + b + a * b
|
||||
def append_log_column(dataframe: DataFrame, column):
    """Append "<column>_log", the natural logarithm of *column*, in place."""
    new_name = f"{column}_log"
    dataframe[new_name] = np.log(dataframe[column])
|
||||
def append_sqrt_column(dataframe: DataFrame, column):
    """Append "<column>_sqrt", the square root of *column*, in place."""
    new_name = f"{column}_sqrt"
    dataframe[new_name] = np.sqrt(dataframe[column])
|
||||
def append_square_column(dataframe: DataFrame, column):
    """Append "<column>_square", the element-wise square of *column*, in place."""
    new_name = f"{column}_square"
    dataframe[new_name] = np.square(dataframe[column])
|
||||
def iter_transformations(dataframe: DataFrame):
    """Yield (new_column_name, transform) pairs for every applicable numeric
    transform of each column in *dataframe*.

    Each transform is a callable that takes a DataFrame and appends the named
    column to it in place. The log transform is skipped for columns containing
    a zero (log(0) is -inf) and sqrt is skipped for columns containing a
    negative value; square is always applicable.
    """
    for name in dataframe.columns:
        contains_zero = (dataframe[name] == 0).any()
        contains_negative = (dataframe[name] < 0).any()

        # Bind `name` as a lambda default argument: a plain closure would
        # late-bind, making every yielded lambda use the loop's final column
        # when the caller collects the pairs before invoking them.
        if not contains_zero:
            yield (f"{name}_log", lambda df, name=name: append_log_column(df, name))
        if not contains_negative:
            yield (f"{name}_sqrt", lambda df, name=name: append_sqrt_column(df, name))
        yield (f"{name}_square", lambda df, name=name: append_square_column(df, name))
|
||||
def calc_r2_squared(transform_func):
    """Fit a linear regression on the (transformed) training features and
    return the R^2 score on the test set.

    *transform_func* receives a feature DataFrame and mutates it in place.
    Reads the module-level train_df / test_df; the drops below produce
    copies, so the globals themselves are never modified.
    """
    features_train = train_df.drop(columns=['Rented_Bike_Count'])
    features_test = test_df.drop(columns=['Rented_Bike_Count'])
    target_train = train_df['Rented_Bike_Count']
    target_test = test_df['Rented_Bike_Count']

    # Apply the candidate transformation to both feature sets.
    transform_func(features_train)
    transform_func(features_test)

    model = LinearRegression().fit(features_train, target_train)
    predictions = model.predict(features_test)
    return r2_score(target_test, predictions)
|
||||
# 6.
|
||||
if False:
|
||||
init_X_train = train_df.drop(columns=['Rented_Bike_Count'])
|
||||
init_X_test = test_df.drop(columns=['Rented_Bike_Count'])
|
||||
init_y_train = train_df['Rented_Bike_Count']
|
||||
init_y_test = test_df['Rented_Bike_Count']
|
||||
|
||||
results = []
|
||||
columns = train_df.drop(columns=['Rented_Bike_Count']).columns
|
||||
print(len(columns))
|
||||
for column_a_idx in range(len(columns)):
|
||||
column_a = columns[column_a_idx]
|
||||
for column_b_idx in range(column_a_idx+1, len(columns)):
|
||||
column_b = columns[column_b_idx]
|
||||
r2_plus = calc_r2_squared(lambda df: append_plus_column(df, column_a, column_b))
|
||||
r2_plus_mul = calc_r2_squared(lambda df: append_plus_mul_column(df, column_a, column_b))
|
||||
results.append((column_a, column_b, r2_plus, r2_plus_mul))
|
||||
|
||||
results.sort(key=lambda e: e[2]-e[3])
|
||||
for (column_a, column_b, r2_plus, r2_plus_mul) in results[:10]:
|
||||
print(column_a, column_b, r2_plus, r2_plus_mul, r2_plus_mul - r2_plus)
|
||||
|
||||
# for df in [X_train, X_test]:
|
||||
#append_plus_mul_column(df, "Humidity", "Visibility")
|
||||
#append_plus_column(df, "Humidity", "Visibility")
|
||||
|
||||
#append_plus_mul_column(df, "Dew_point_temperature", "Rainfall")
|
||||
#append_plus_column(df, "Dew_point_temperature", "Rainfall")
|
||||
# pass
|
||||
|
||||
# lm_fit = LinearRegression().fit(X_train, y_train)
|
||||
# y_pred = lm_fit.predict(X_test)
|
||||
# r_squared = r2_score(y_test, y_pred)
|
||||
# print(f'R-squared: {r_squared}')
|
||||
|
||||
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
|
||||
# plt.xlabel('True Rented_Bike_Count values')
|
||||
# plt.ylabel('Predicted Rented_Bike_Count values')
|
||||
# plt.ylim(-7.5, 10)
|
||||
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
|
||||
# plt.title('Predicted vs. True Rented_Bike_Count')
|
||||
# plt.grid()
|
||||
# plt.show()
|
||||
|
||||
# 7.
|
||||
if True:
|
||||
r2_list = []
|
||||
for (transform_name, transform_func) in iter_transformations(train_df.drop(columns=['Rented_Bike_Count'])):
|
||||
r2_list.append((transform_name, calc_r2_squared(lambda _: _), calc_r2_squared(transform_func)))
|
||||
|
||||
r2_list.sort(key=lambda e: e[1]-e[2])
|
||||
for a in r2_list[:10]:
|
||||
print(f"{a[0]:30} {a[1]:.6f} {a[2]:.6f} {a[2]-a[1]:.6f}")
|
||||
|
||||
# X_train = train_df.drop(columns=['Rented_Bike_Count'])
|
||||
# X_test = test_df.drop(columns=['Rented_Bike_Count'])
|
||||
# y_train = train_df['Rented_Bike_Count']
|
||||
# y_test = test_df['Rented_Bike_Count']
|
||||
|
||||
# for df in [X_train, X_test]:
|
||||
# append_sqrt_column(df, "Rainfall")
|
||||
# append_sqrt_column(df, "Humidity")
|
||||
# append_square_column(df, "Visibility")
|
||||
|
||||
# lm_fit = LinearRegression().fit(X_train, y_train)
|
||||
# y_pred = lm_fit.predict(X_test)
|
||||
# r_squared = r2_score(y_test, y_pred)
|
||||
# print(f'R-squared: {r_squared}')
|
||||
|
||||
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
|
||||
# plt.xlabel('True Rented_Bike_Count values')
|
||||
# plt.ylabel('Predicted Rented_Bike_Count values')
|
||||
# plt.ylim(-7.5, 10)
|
||||
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
|
||||
# plt.title('Predicted vs. True Rented_Bike_Count')
|
||||
# plt.grid()
|
||||
# plt.show()
|
||||
264
Lab1/3.1/main.py
Normal file
264
Lab1/3.1/main.py
Normal file
@ -0,0 +1,264 @@
|
||||
import pandas as pd
|
||||
from sklearn.preprocessing import StandardScaler
|
||||
from sklearn.linear_model import LogisticRegression
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import confusion_matrix, accuracy_score
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.metrics import classification_report
|
||||
from sklearn.metrics import roc_curve, auc, roc_auc_score
|
||||
import numpy as np
|
||||
import seaborn as sns
|
||||
from scipy.stats import boxcox
|
||||
|
||||
def normalize_column(column):
    """Min-max scale *column* linearly onto the [0, 1] range."""
    lowest = min(column)
    highest = max(column)
    return (column - lowest) / (highest - lowest)
|
||||
# 1.
|
||||
main_df = pd.read_csv("music_spotify.csv")
|
||||
# https://developer.spotify.com/documentation/web-api/reference/get-audio-features
|
||||
|
||||
#print(len(main_df.columns))
|
||||
|
||||
# 2.
|
||||
if False:
|
||||
main_df["duration_ms_01"] = normalize_column(main_df["duration_ms"])
|
||||
for column_name in ["acousticness", "danceability", "energy", "instrumentalness", "duration_ms"]:
|
||||
main_df[main_df["target"] == 0][column_name].plot.density(color="green", label="0")
|
||||
main_df[main_df["target"] == 1][column_name].plot.density(color="red", label="1")
|
||||
plt.title(f"Density plot ({column_name} vs target)")
|
||||
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
|
||||
plt.xlabel(column_name)
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
# 3.
|
||||
if False:
|
||||
# main_df["loudness_01"] = normalize_column(main_df["loudness"])
|
||||
# main_df["tempo_01"] = normalize_column(main_df["tempo"])
|
||||
for column_name in ["liveness", "loudness", "speechiness", "tempo", "valence"]:
|
||||
main_df[main_df["target"] == 0][column_name].plot.density(color="green", label="0")
|
||||
main_df[main_df["target"] == 1][column_name].plot.density(color="red", label="1")
|
||||
plt.title(f"Density plot ({column_name} vs target)")
|
||||
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
|
||||
plt.xlabel(column_name)
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
# 4.
|
||||
if False:
|
||||
for column_name in ["key", "mode", "time_signature"]:
|
||||
df_new = main_df[[column_name, "target"]].pivot_table(columns=column_name, index="target", aggfunc=len, fill_value=0).T#.apply(lambda x: x/sum(x), axis=1)
|
||||
ax = df_new.plot.bar(stacked=True)
|
||||
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
|
||||
plt.style.use('ggplot')
|
||||
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0);
|
||||
plt.ylabel("occurrences")
|
||||
plt.title(f"Bar graph ({column_name} vs target)")
|
||||
plt.show()
|
||||
|
||||
# 5.
|
||||
# Explain parts 1-4 in report
|
||||
|
||||
# 6.
|
||||
X_train, X_test, y_train, y_test = train_test_split(main_df.drop(columns=["target", "X", "song_title", "artist"]), main_df["target"], test_size=0.3)
|
||||
if False:
|
||||
# Initialize the scaler
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler and transform the features
|
||||
scaler.fit(X_train)
|
||||
X_train_scaled=scaler.transform(X_train)
|
||||
X_test_scaled=scaler.transform(X_test)
|
||||
# Now use the scaled features to train your logistic regression model
|
||||
model = LogisticRegression()
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
preds = model.predict(X_test_scaled)
|
||||
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
|
||||
print(classification_report(y_test, np.where(preds_probs > 0.5, 1, 0)))
|
||||
conf_matrix = confusion_matrix(y_test, preds)
|
||||
print("----- Confusion matrix ---------")
|
||||
print(" Predicted 0, Predicted 1")
|
||||
print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
|
||||
print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
|
||||
print("---------- Coeffs --------------")
|
||||
print(model.coef_[0])
|
||||
|
||||
# Use sigmoid for equation
|
||||
|
||||
# 7.
|
||||
if False:
|
||||
# Initialize the scaler
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler and transform the features
|
||||
scaler.fit(X_train)
|
||||
X_train_scaled=scaler.transform(X_train)
|
||||
X_test_scaled=scaler.transform(X_test)
|
||||
# Now use the scaled features to train your logistic regression model
|
||||
model = LogisticRegression()
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
|
||||
for threshold in [0.3, 0.4, 0.5]:
|
||||
preds = np.where(preds_probs > threshold, 1, 0)
|
||||
print(f"=========== THRESHOLD: {threshold} ===========")
|
||||
print(classification_report(y_test, preds))
|
||||
conf_matrix = confusion_matrix(y_test, preds)
|
||||
print("----- Confusion matrix ---------")
|
||||
print(" Predicted 0, Predicted 1")
|
||||
print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
|
||||
print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
|
||||
|
||||
# 8.
|
||||
if False:
|
||||
def get_accuracy(X_train, X_test, y_train, y_test):
|
||||
# Initialize the scaler
|
||||
scaler = StandardScaler()
|
||||
|
||||
# Fit the scaler and transform the features
|
||||
scaler.fit(X_train)
|
||||
X_train_scaled=scaler.transform(X_train)
|
||||
X_test_scaled=scaler.transform(X_test)
|
||||
# Now use the scaled features to train your logistic regression model
|
||||
model = LogisticRegression()
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
preds = model.predict(X_test_scaled)
|
||||
|
||||
#return accuracy_score(y_test, preds)
|
||||
return np.mean(preds == y_test)
|
||||
|
||||
iterations = 50
|
||||
diffs = {}
|
||||
for column_name in X_train.columns:
|
||||
diffs[column_name] = 0
|
||||
|
||||
for _ in range(iterations):
|
||||
X_train, X_test, y_train, y_test = train_test_split(main_df.drop(columns=["target", "X", "song_title", "artist"]), main_df["target"], test_size=0.3)
|
||||
baseline_accuracy = get_accuracy(X_train, X_test, y_train, y_test)
|
||||
|
||||
for column_name in X_train.columns:
|
||||
accuracy = get_accuracy(X_train.drop(columns=[column_name]), X_test.drop(columns=[column_name]), y_train, y_test)
|
||||
diffs[column_name] += (accuracy - baseline_accuracy)
|
||||
#diffs.append((column_name, accuracy - baseline_accuracy))
|
||||
|
||||
diffs_array = list(diffs.items())
|
||||
diffs_array.sort(key=lambda e: -e[1])
|
||||
for (column_name, diff) in diffs_array:
|
||||
print(f"{column_name:20} {diff / iterations}")
|
||||
|
||||
# 9.
|
||||
if True:
|
||||
# plt.figure(figsize=(8, 6))
|
||||
# sns.heatmap(main_df.drop(columns=["X", "song_title", "artist"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
|
||||
# plt.title("Correlation Matrix")
|
||||
# plt.show()
|
||||
|
||||
# main_df["duration_ms"] = normalize_column(main_df["duration_ms"])
|
||||
# main_df["loudness"] = normalize_column(main_df["loudness"])
|
||||
# main_df["tempo"] = normalize_column(main_df["tempo"])
|
||||
# main_df.drop(columns=["X"]).boxplot()
|
||||
# plt.show()
|
||||
|
||||
transformations = {
|
||||
"log": (
|
||||
lambda column: np.log(column + 0.1),
|
||||
lambda column: all(v > 0 for v in column)
|
||||
),
|
||||
"square": (
|
||||
lambda column: np.square(column),
|
||||
lambda column: True
|
||||
),
|
||||
"sqrt": (
|
||||
lambda column: np.sqrt(column),
|
||||
lambda column: all(v >= 0 for v in column)
|
||||
),
|
||||
"sqrt3": (
|
||||
lambda column: np.float_power(column, 1/3),
|
||||
lambda column: all(v >= 0 for v in column)
|
||||
),
|
||||
"exp": (
|
||||
lambda column: np.exp(column),
|
||||
lambda column: all(v <= 1000 for v in column)
|
||||
)
|
||||
}
|
||||
|
||||
results = []
|
||||
for column_name in X_train.columns:
|
||||
for (transform_name, (transform, criteria)) in transformations.items():
|
||||
if not criteria(main_df[column_name]): continue
|
||||
new_X_train = X_train.copy()
|
||||
new_X_test = X_test.copy()
|
||||
|
||||
new_X_train[f"{column_name}_{transform_name}"] = transform(X_train[column_name])
|
||||
new_X_test[f"{column_name}_{transform_name}"] = transform(X_test[column_name])
|
||||
|
||||
scaler = StandardScaler()
|
||||
scaler.fit(new_X_train)
|
||||
X_train_scaled=scaler.transform(new_X_train)
|
||||
X_test_scaled=scaler.transform(new_X_test)
|
||||
|
||||
model = LogisticRegression()
|
||||
model.fit(X_train_scaled, y_train)
|
||||
|
||||
preds_probs = model.predict_proba(X_test_scaled)[:, 1]
|
||||
result_entry = {
|
||||
"column": column_name,
|
||||
"transform": transform_name,
|
||||
"thresholds": {},
|
||||
}
|
||||
results.append(result_entry)
|
||||
for threshold in [0.3, 0.5, 0.7]:
|
||||
preds = np.where(preds_probs > threshold, 1, 0)
|
||||
confusion = confusion_matrix(y_test, preds)
|
||||
diag_values = np.diag(confusion)
|
||||
row_sums = np.sum(confusion, axis=1)
|
||||
percentage_accuracy = (diag_values / row_sums)
|
||||
result_entry["thresholds"][threshold] = {
|
||||
"overall_accuracy": np.mean(y_test == preds),
|
||||
"class0_accuracy": percentage_accuracy[0],
|
||||
"class1_accuracy": percentage_accuracy[1],
|
||||
}
|
||||
|
||||
results.sort(key=lambda e: -max(a["overall_accuracy"] for a in e["thresholds"].values()))
|
||||
best_result = results[0]
|
||||
for result in results[:5]:
|
||||
print("------")
|
||||
print("column", result["column"], result["transform"])
|
||||
for threshold in result["thresholds"].keys():
|
||||
print(f'[{threshold}] class0_accuracy {result["thresholds"][threshold]["class0_accuracy"]:.6f}', )
|
||||
print(f'[{threshold}] class1_accuracy {result["thresholds"][threshold]["class1_accuracy"]:.6f}')
|
||||
print(f'[{threshold}] overall_accuracy {result["thresholds"][threshold]["overall_accuracy"]:.6f}')
|
||||
#print("error_rate", result["error_rate"])
|
||||
|
||||
# print("======================")
|
||||
# print("overall_accuracy", best_result["overall_accuracy"])
|
||||
# print("class0_accuracy", best_result["class0_accuracy"])
|
||||
# print("class1_accuracy", best_result["class1_accuracy"])
|
||||
# print("error_rate", best_result["error_rate"])
|
||||
# print("threshold", best_result["threshold"])
|
||||
# print("column", best_result["column"], best_result["transform"])
|
||||
|
||||
# Calculate ROC curve and AUC
|
||||
# fpr, tpr, thresholds = roc_curve(y_test, best_result["probs"])
|
||||
# roc_auc = auc(fpr, tpr)
|
||||
|
||||
# # Plot ROC curve
|
||||
# plt.figure(figsize=(5,5))
|
||||
# plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
|
||||
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
||||
# plt.xlim([0.0, 1.0])
|
||||
# plt.ylim([0.0, 1.05])
|
||||
# plt.xlabel('False Positive Rate')
|
||||
# plt.ylabel('True Positive Rate')
|
||||
# plt.title('Receiver Operating Characteristic (ROC) Curve')
|
||||
# plt.legend(loc='lower right')
|
||||
# plt.show()
|
||||
|
||||
# # Print AUC value
|
||||
# print("AUC:", roc_auc)
|
||||
|
||||
|
||||
2018
Lab1/3.1/music_spotify.csv
Normal file
2018
Lab1/3.1/music_spotify.csv
Normal file
File diff suppressed because it is too large
Load Diff
189
Lab1/4.1/main.py
Normal file
189
Lab1/4.1/main.py
Normal file
@ -0,0 +1,189 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns # Import seaborn for bar plotting
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
|
||||
from sklearn.metrics import accuracy_score, confusion_matrix
|
||||
|
||||
# 1. Load the dataset
|
||||
train_df = pd.read_csv("sign_mnist_train.csv")
|
||||
test_df = pd.read_csv("sign_mnist_test.csv")
|
||||
|
||||
# for df in [train_df, test_df]:
|
||||
# for column in df.columns[1:]:
|
||||
# df.loc[abs(df[column] - 160) <= 5, column] = 0
|
||||
|
||||
if False:
|
||||
# Create a 4x4 grid of subplots
|
||||
fig, axes = plt.subplots(4, 4, figsize=(8, 8))
|
||||
plt.subplots_adjust(wspace=0, hspace=0) # Adjust spacing
|
||||
|
||||
df_by_label = test_df[test_df["label"] == 1][:16]
|
||||
|
||||
indices = list(range(16))
|
||||
for i, ax in zip(indices, axes.ravel()):
|
||||
hand_sign = np.array(df_by_label.iloc[i, 1:])
|
||||
hand_sign = hand_sign.reshape(28, 28)
|
||||
|
||||
ax.imshow(hand_sign, cmap='gray', aspect='auto')
|
||||
ax.axis('off') # Hide axis labels
|
||||
|
||||
plt.show()
|
||||
|
||||
X_train, X_test = train_df.drop(columns=["label"]), test_df.drop(columns=["label"])
|
||||
y_train, y_test = train_df["label"], test_df["label"]
|
||||
|
||||
# for hand_sign in range(26):
|
||||
# count = len(train_df[train_df["label"] == hand_sign])
|
||||
# print(f"[{hand_sign}] = {count}")
|
||||
|
||||
# 2.
|
||||
if False:
|
||||
lda_classifier = LinearDiscriminantAnalysis()
|
||||
lda_classifier.fit(X_train, y_train)
|
||||
|
||||
lda_predictions = lda_classifier.predict(X_test)
|
||||
lda_confusion_matrix = confusion_matrix(y_test, lda_predictions)
|
||||
|
||||
lda_accuracy = np.mean(y_test == lda_predictions)
|
||||
|
||||
set_of_signs = list(set(lda_predictions))
|
||||
lda_class_accuracies = [lda_confusion_matrix[i, i] / np.sum(lda_confusion_matrix[i, :]) for i in range(len(lda_confusion_matrix))]
|
||||
print(f"LDA Overall Accuracy: {lda_accuracy*100:.2f}")
|
||||
print("LDA Class-Specific Accuracies:")
|
||||
for i, acc in enumerate(lda_class_accuracies):
|
||||
print(f"Class {set_of_signs[i]}: {acc*100:.2f}")
|
||||
|
||||
def remove_every_nth_column(df, count):
    """Return *df* with every count-th column (1-based position) dropped."""
    keep = [i for i in range(len(df.columns)) if (i + 1) % count != 0]
    return df.iloc[:, keep]
|
||||
def leave_every_nth_column(df, count):
    """Return *df* keeping only every count-th column (1-based position)."""
    keep = [i for i in range(len(df.columns)) if (i + 1) % count == 0]
    return df.iloc[:, keep]
|
||||
def iter_operations(train_df, test_df):
    """Yield (name, reduced_train_df, reduced_test_df) triples for each
    column-thinning operation, applied identically to both frames.

    First all "remove every n-th column" variants for n in 2..19, then all
    "leave only every n-th column" variants for the same range.
    """
    operations = (
        ("remove_every_nth_column", remove_every_nth_column),
        ("leave_every_nth_column", leave_every_nth_column),
    )
    for op_name, op in operations:
        for n in range(2, 20):
            yield (f"{op_name}_{n}", op(train_df, n), op(test_df, n))
|
||||
# 3
|
||||
if False:
|
||||
def get_overall_accuracy(X_train, X_test, y_train, y_test):
|
||||
lda_classifier = LinearDiscriminantAnalysis()
|
||||
lda_classifier.fit(X_train, y_train)
|
||||
|
||||
lda_predictions = lda_classifier.predict(X_test)
|
||||
lda_accuracy = np.mean(y_test == lda_predictions)
|
||||
return lda_accuracy
|
||||
|
||||
init_X_train = X_train.copy()
|
||||
init_X_test = X_test.copy()
|
||||
|
||||
# ('leave_every_nth_column_5', 0.6002509760178472)
|
||||
init_X_train = leave_every_nth_column(init_X_train, 5)
|
||||
init_X_test = leave_every_nth_column(init_X_test, 5)
|
||||
|
||||
# ('remove_every_nth_column_15', 0.6167038482989403)
|
||||
init_X_train = remove_every_nth_column(init_X_train, 15)
|
||||
init_X_test = remove_every_nth_column(init_X_test, 15)
|
||||
|
||||
# ('remove_every_nth_column_14', 0.6179587283881762)
|
||||
init_X_train = remove_every_nth_column(init_X_train, 14)
|
||||
init_X_test = remove_every_nth_column(init_X_test, 14)
|
||||
|
||||
# ('remove_every_nth_column_11', 0.6183770217512549)
|
||||
init_X_train = remove_every_nth_column(init_X_train, 11)
|
||||
init_X_test = remove_every_nth_column(init_X_test, 11)
|
||||
|
||||
print(len(init_X_train.columns))
|
||||
indices = list(int(c[5:]) for c in init_X_train.columns)
|
||||
print(indices)
|
||||
|
||||
for y in range(28):
|
||||
for x in range(28):
|
||||
if y*28+x+1 in indices:
|
||||
print("1; ", end='')
|
||||
else:
|
||||
print("0; ", end='')
|
||||
|
||||
print("")
|
||||
|
||||
# results = []
|
||||
# for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
|
||||
# accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
|
||||
# results.append((name, accuracy))
|
||||
|
||||
print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
|
||||
# results.sort(key=lambda e: -e[1])
|
||||
# # print(results[0])
|
||||
# for (name, accuracy) in results[:8]:
|
||||
# print(name, accuracy)
|
||||
|
||||
# 4
|
||||
if False:
|
||||
qda_classifier = QuadraticDiscriminantAnalysis()
|
||||
qda_classifier.fit(X_train, y_train)
|
||||
|
||||
qda_predictions = qda_classifier.predict(X_test)
|
||||
qda_accuracy = np.mean(y_test == qda_predictions)
|
||||
qda_confusion_matrix = confusion_matrix(y_test, qda_predictions)
|
||||
|
||||
qda_class_accuracies = [qda_confusion_matrix[i, i] / np.sum(qda_confusion_matrix[i, :]) for i in range(len(qda_confusion_matrix))]
|
||||
print(f"QDA Overall Accuracy: {qda_accuracy*100:.2f}%")
|
||||
print("QDA Class-Specific Accuracies:")
|
||||
for i, acc in enumerate(qda_class_accuracies):
|
||||
print(f"Class {i}: {acc*100:.2f}%")
|
||||
|
||||
# 5
|
||||
if True:
|
||||
def get_overall_accuracy(X_train, X_test, y_train, y_test):
|
||||
qda_classifier = QuadraticDiscriminantAnalysis()
|
||||
qda_classifier.fit(X_train, y_train)
|
||||
|
||||
qda_predictions = qda_classifier.predict(X_test)
|
||||
qda_accuracy = np.mean(y_test == qda_predictions)
|
||||
return qda_accuracy
|
||||
|
||||
init_X_train = X_train.copy()
|
||||
init_X_test = X_test.copy()
|
||||
|
||||
# remove_every_nth_column_3 0.7632459564974903
|
||||
init_X_train = remove_every_nth_column(init_X_train, 3)
|
||||
init_X_test = remove_every_nth_column(init_X_test, 3)
|
||||
|
||||
# remove_every_nth_column_11 0.7717512548800892
|
||||
init_X_train = remove_every_nth_column(init_X_train, 11)
|
||||
init_X_test = remove_every_nth_column(init_X_test, 11)
|
||||
|
||||
print(len(init_X_train.columns))
|
||||
# indices = list(int(c[5:]) for c in init_X_train.columns)
|
||||
|
||||
# for y in range(28):
|
||||
# for x in range(28):
|
||||
# if y*28+x+1 in indices:
|
||||
# print("1;", end='')
|
||||
# else:
|
||||
# print("0;", end='')
|
||||
|
||||
# print("")
|
||||
|
||||
# results = []
|
||||
# for (name, reduced_X_train, reduced_X_test) in iter_operations(init_X_train, init_X_test):
|
||||
# print(name)
|
||||
# accuracy = get_overall_accuracy(reduced_X_train, reduced_X_test, y_train, y_test)
|
||||
# results.append((name, accuracy))
|
||||
|
||||
print(get_overall_accuracy(init_X_train, init_X_test, y_train, y_test))
|
||||
# results.sort(key=lambda e: -e[1])
|
||||
# for (name, accuracy) in results[:8]:
|
||||
# print(name, accuracy)
|
||||
7173
Lab1/4.1/sign_mnist_test.csv
Normal file
7173
Lab1/4.1/sign_mnist_test.csv
Normal file
File diff suppressed because one or more lines are too long
27456
Lab1/4.1/sign_mnist_train.csv
Normal file
27456
Lab1/4.1/sign_mnist_train.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
Lab1/Extra.xlsx
Normal file
BIN
Lab1/Extra.xlsx
Normal file
Binary file not shown.
BIN
Lab1/Lab1_Rokas-Puzonas_IF-1-1.pdf
Normal file
BIN
Lab1/Lab1_Rokas-Puzonas_IF-1-1.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user