machine-learing-methods/Lab1/3.1/main.py
2023-11-30 20:24:45 +02:00

265 lines
10 KiB
Python

import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score
import numpy as np
import seaborn as sns
from scipy.stats import boxcox
def normalize_column(column):
    """Min-max scale *column* so its values span the interval [0, 1].

    Args:
        column: A pandas Series or NumPy array (anything that provides
            ``.min()``, ``.max()`` and element-wise arithmetic).

    Returns:
        An object of the same kind as *column* holding
        ``(column - min) / (max - min)``.  When every value is identical
        the range is zero, so an all-zero float result is returned instead
        of dividing by zero.
    """
    min_value = column.min()
    max_value = column.max()
    shifted = column - min_value
    value_range = max_value - min_value
    if value_range == 0:
        # Constant column: 0/0 would produce NaN, so map everything to 0.0.
        return shifted * 0.0
    return shifted / value_range
# 1. Load the dataset that every later section reads from.
# Column meanings presumably follow the Spotify audio-features reference:
# https://developer.spotify.com/documentation/web-api/reference/get-audio-features
dataset_path = "music_spotify.csv"
main_df = pd.read_csv(dataset_path)
# 2. Density plots of the first feature group, one curve per target class.
#    Disabled by default; flip the condition to regenerate the figures.
if False:
    # NOTE: this adds a normalised copy of duration_ms to main_df, but the
    # loop below still plots the raw "duration_ms" column.
    main_df["duration_ms_01"] = normalize_column(main_df["duration_ms"])
    class_colors = {0: "green", 1: "red"}
    for feature in ("acousticness", "danceability", "energy", "instrumentalness", "duration_ms"):
        # Overlay the distribution of this feature for each target value.
        for target_value, line_color in class_colors.items():
            class_rows = main_df[main_df["target"] == target_value]
            class_rows[feature].plot.density(color=line_color, label=str(target_value))
        plt.title(f"Density plot ({feature} vs target)")
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.xlabel(feature)
        plt.grid()
        plt.show()
# 3. Density plots of the second feature group, one curve per target class.
#    Disabled by default; flip the condition to regenerate the figures.
if False:
    # Optional normalised copies, left switched off:
    # main_df["loudness_01"] = normalize_column(main_df["loudness"])
    # main_df["tempo_01"] = normalize_column(main_df["tempo"])
    curve_specs = (("0", 0, "green"), ("1", 1, "red"))
    for feature in ("liveness", "loudness", "speechiness", "tempo", "valence"):
        # Draw both class curves on the same axes before decorating the figure.
        for curve_label, target_value, curve_color in curve_specs:
            is_class = main_df["target"] == target_value
            main_df[is_class][feature].plot.density(color=curve_color, label=curve_label)
        plt.title(f"Density plot ({feature} vs target)")
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.xlabel(feature)
        plt.grid()
        plt.show()
# 4. Stacked bar charts of the categorical features against the target.
#    Disabled by default; flip the condition to regenerate the figures.
if False:
    # Select the style before any figure exists.  Calling it inside the loop
    # after plotting meant the first chart was created before the style took
    # effect and could look different from the later ones.
    plt.style.use('ggplot')
    for column_name in ["key", "mode", "time_signature"]:
        # Count rows per (target, category) pair, then transpose so that each
        # category becomes one bar and the target values become the stacked
        # segments.
        # Optional per-row normalisation: .apply(lambda x: x/sum(x), axis=1)
        df_new = main_df[[column_name, "target"]].pivot_table(columns=column_name, index="target", aggfunc=len, fill_value=0).T
        ax = df_new.plot.bar(stacked=True)
        # Keep the category labels horizontal.
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.ylabel("occurrences")
        plt.title(f"Bar graph ({column_name} vs target)")
        plt.show()
# 5.
# Explain parts 1-4 in report
# 6. Train/test split (30% held out) used by the modelling sections, followed
#    by an optional baseline logistic regression report.
feature_frame = main_df.drop(columns=["target", "X", "song_title", "artist"])
X_train, X_test, y_train, y_test = train_test_split(
    feature_frame,
    main_df["target"],
    test_size=0.3,
)
if False:
    # Standardise with statistics learned from the training split only, then
    # apply the same scaling to the test split.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Fit the classifier on the scaled training features.
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    preds = model.predict(X_test_scaled)
    # Column 1 of predict_proba holds the probability of the second class.
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]
    thresholded_preds = np.where(preds_probs > 0.5, 1, 0)
    print(classification_report(y_test, thresholded_preds))

    conf_matrix = confusion_matrix(y_test, preds)
    report_lines = (
        "----- Confusion matrix ---------",
        " Predicted 0, Predicted 1",
        f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}",
        f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}",
        "---------- Coeffs --------------",
    )
    for report_line in report_lines:
        print(report_line)
    print(model.coef_[0])
    # Use sigmoid for equation
# 7. Report how the classification metrics change with the decision
#    threshold applied to the predicted probabilities.
if False:
    # Scale using the training split's statistics, then fit the model once.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Probability of the second class for every test row.
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]
    for threshold in (0.3, 0.4, 0.5):
        # A row is labelled 1 only when its probability exceeds the threshold.
        preds = np.where(preds_probs > threshold, 1, 0)
        print(f"=========== THRESHOLD: {threshold} ===========")
        print(classification_report(y_test, preds))
        conf_matrix = confusion_matrix(y_test, preds)
        matrix_lines = (
            "----- Confusion matrix ---------",
            " Predicted 0, Predicted 1",
            f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}",
            f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}",
        )
        for matrix_line in matrix_lines:
            print(matrix_line)
# 8. Estimate each feature's contribution by dropping it and measuring the
#    change in test accuracy, averaged over repeated random splits.
if False:
    def get_accuracy(X_train, X_test, y_train, y_test):
        """Return the test accuracy of a scaled logistic regression.

        The scaler is fitted on X_train only and then applied to both
        splits; the result is the fraction of test rows whose predicted
        label equals y_test.
        """
        scaler = StandardScaler()
        train_scaled = scaler.fit_transform(X_train)
        test_scaled = scaler.transform(X_test)
        classifier = LogisticRegression()
        classifier.fit(train_scaled, y_train)
        predicted = classifier.predict(test_scaled)
        return np.mean(predicted == y_test)

    iterations = 50
    # Running sum of (accuracy without the column - baseline accuracy).
    diffs = dict.fromkeys(X_train.columns, 0)
    all_features = main_df.drop(columns=["target", "X", "song_title", "artist"])
    for _ in range(iterations):
        # Fresh random split each round; this rebinds the module-level split.
        X_train, X_test, y_train, y_test = train_test_split(all_features, main_df["target"], test_size=0.3)
        baseline_accuracy = get_accuracy(X_train, X_test, y_train, y_test)
        for column_name in X_train.columns:
            reduced_train = X_train.drop(columns=[column_name])
            reduced_test = X_test.drop(columns=[column_name])
            accuracy = get_accuracy(reduced_train, reduced_test, y_train, y_test)
            diffs[column_name] += (accuracy - baseline_accuracy)

    # Largest accumulated difference first.
    diffs_array = sorted(diffs.items(), key=lambda item: -item[1])
    for feature_name, total_diff in diffs_array:
        print(f"{feature_name:20} {total_diff / iterations}")
# 9. For every feature, try adding one transformed copy of it, refit the
#    model, and rank the (feature, transform) pairs by their best overall
#    accuracy across the tested thresholds.
if True:
    # Optional exploratory plots, left switched off:
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(main_df.drop(columns=["X", "song_title", "artist"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
    # plt.title("Correlation Matrix")
    # plt.show()
    # main_df["duration_ms"] = normalize_column(main_df["duration_ms"])
    # main_df["loudness"] = normalize_column(main_df["loudness"])
    # main_df["tempo"] = normalize_column(main_df["tempo"])
    # main_df.drop(columns=["X"]).boxplot()
    # plt.show()

    # np.exp overflows float64 to inf for inputs above roughly 709.78, so
    # larger values must not be exponentiated.  NOTE(review): values close to
    # this bound may still be too large for the downstream scaling - confirm
    # if such columns ever appear.
    max_exp_input = 709.0

    # name -> (transform, applicability check).  The check is evaluated on
    # the full column of main_df and decides whether the transform is tried.
    transformations = {
        "log": (
            lambda column: np.log(column + 0.1),
            lambda column: (column > 0).all(),
        ),
        "square": (
            lambda column: np.square(column),
            lambda column: True,
        ),
        "sqrt": (
            lambda column: np.sqrt(column),
            lambda column: (column >= 0).all(),
        ),
        "sqrt3": (
            # Cube root, restricted to non-negative input.
            lambda column: np.float_power(column, 1/3),
            lambda column: (column >= 0).all(),
        ),
        "exp": (
            lambda column: np.exp(column),
            lambda column: (column <= max_exp_input).all(),
        ),
    }

    results = []
    for column_name in X_train.columns:
        for transform_name, (transform, criteria) in transformations.items():
            if not criteria(main_df[column_name]):
                continue

            # Add the transformed feature next to the original columns.
            new_feature = f"{column_name}_{transform_name}"
            new_X_train = X_train.copy()
            new_X_test = X_test.copy()
            new_X_train[new_feature] = transform(X_train[column_name])
            new_X_test[new_feature] = transform(X_test[column_name])

            # Scale with training statistics only, then fit and score.
            scaler = StandardScaler()
            scaler.fit(new_X_train)
            X_train_scaled = scaler.transform(new_X_train)
            X_test_scaled = scaler.transform(new_X_test)
            model = LogisticRegression()
            model.fit(X_train_scaled, y_train)
            preds_probs = model.predict_proba(X_test_scaled)[:, 1]

            result_entry = {
                "column": column_name,
                "transform": transform_name,
                "thresholds": {},
            }
            results.append(result_entry)
            for threshold in [0.3, 0.5, 0.7]:
                preds = np.where(preds_probs > threshold, 1, 0)
                confusion = confusion_matrix(y_test, preds)
                # Per-class accuracy: correct predictions on the diagonal
                # divided by the number of true members of each class.
                percentage_accuracy = np.diag(confusion) / np.sum(confusion, axis=1)
                result_entry["thresholds"][threshold] = {
                    "overall_accuracy": np.mean(y_test == preds),
                    "class0_accuracy": percentage_accuracy[0],
                    "class1_accuracy": percentage_accuracy[1],
                }

    # Best candidates first, ranked by their highest overall accuracy.
    results.sort(key=lambda e: -max(a["overall_accuracy"] for a in e["thresholds"].values()))
    # Kept for follow-up analysis; None when no transform was applicable.
    best_result = results[0] if results else None
    for result in results[:5]:
        print("------")
        print("column", result["column"], result["transform"])
        for threshold, scores in result["thresholds"].items():
            print(f'[{threshold}] class0_accuracy {scores["class0_accuracy"]:.6f}')
            print(f'[{threshold}] class1_accuracy {scores["class1_accuracy"]:.6f}')
            print(f'[{threshold}] overall_accuracy {scores["overall_accuracy"]:.6f}')
#print("error_rate", result["error_rate"])
# print("======================")
# print("overall_accuracy", best_result["overall_accuracy"])
# print("class0_accuracy", best_result["class0_accuracy"])
# print("class1_accuracy", best_result["class1_accuracy"])
# print("error_rate", best_result["error_rate"])
# print("threshold", best_result["threshold"])
# print("column", best_result["column"], best_result["transform"])
# Calculate ROC curve and AUC
# fpr, tpr, thresholds = roc_curve(y_test, best_result["probs"])
# roc_auc = auc(fpr, tpr)
# # Plot ROC curve
# plt.figure(figsize=(5,5))
# plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# plt.xlim([0.0, 1.0])
# plt.ylim([0.0, 1.05])
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.title('Receiver Operating Characteristic (ROC) Curve')
# plt.legend(loc='lower right')
# plt.show()
# # Print AUC value
# print("AUC:", roc_auc)