# File-viewer metadata (not part of the program): 265 lines, 10 KiB, Python.
import pandas as pd
|
|
from sklearn.preprocessing import StandardScaler
|
|
from sklearn.linear_model import LogisticRegression
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import confusion_matrix, accuracy_score
|
|
import matplotlib.pyplot as plt
|
|
from sklearn.metrics import classification_report
|
|
from sklearn.metrics import roc_curve, auc, roc_auc_score
|
|
import numpy as np
|
|
import seaborn as sns
|
|
from scipy.stats import boxcox
|
|
|
|
def normalize_column(column):
    """Min-max scale ``column`` so its smallest value maps to 0 and its largest to 1.

    Args:
        column: A pandas Series or NumPy array of numeric values (anything
            that provides ``.min()``, ``.max()`` and element-wise arithmetic).

    Returns:
        An object of the same kind as ``column`` holding
        ``(column - min) / (max - min)``. When every value is identical the
        range is zero, so a float column of zeros is returned instead of
        dividing by zero.
    """
    # Use the container's own reductions rather than the builtin min()/max(),
    # which iterate element by element in Python.
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value

    shifted = column - min_value
    if value_range == 0:
        # Constant column: avoid 0/0 and report "no spread" as all zeros.
        return shifted.astype(float)
    return shifted / value_range
|
|
|
|
# 1. Load the track dataset into `main_df`, the frame the sections below read.
# Column reference (Spotify audio features):
# https://developer.spotify.com/documentation/web-api/reference/get-audio-features
main_df = pd.read_csv(filepath_or_buffer="music_spotify.csv")

# Debug aid (disabled): show how many columns were loaded.
#print(len(main_df.columns))
|
|
|
|
# 2. Per-class density plots for the first group of continuous features.
# The guard is False, so nothing in this section runs unless it is flipped.
if False:
    # Adds a min-max scaled copy of duration_ms to main_df; the loop below
    # still plots the raw "duration_ms" column, not this scaled copy.
    main_df["duration_ms_01"] = normalize_column(main_df["duration_ms"])

    for column_name in ["acousticness", "danceability", "energy", "instrumentalness", "duration_ms"]:
        # Draw class 0 first (green), then class 1 (red), on the same axes.
        for target_value, curve_color in ((0, "green"), (1, "red")):
            class_rows = main_df[main_df["target"] == target_value]
            class_rows[column_name].plot.density(color=curve_color, label=str(target_value))

        plt.title(f"Density plot ({column_name} vs target)")
        # Legend is anchored outside the axes, to the right of the plot.
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.xlabel(column_name)
        plt.grid()
        plt.show()
|
|
|
|
# 3. Per-class density plots for the second group of continuous features.
# The guard is False, so nothing in this section runs unless it is flipped.
if False:
    # Optional min-max scaled copies, left disabled:
    # main_df["loudness_01"] = normalize_column(main_df["loudness"])
    # main_df["tempo_01"] = normalize_column(main_df["tempo"])

    for column_name in ["liveness", "loudness", "speechiness", "tempo", "valence"]:
        # Draw class 0 first (green), then class 1 (red), on the same axes.
        for target_value, curve_color in ((0, "green"), (1, "red")):
            class_rows = main_df[main_df["target"] == target_value]
            class_rows[column_name].plot.density(color=curve_color, label=str(target_value))

        plt.title(f"Density plot ({column_name} vs target)")
        # Legend is anchored outside the axes, to the right of the plot.
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.xlabel(column_name)
        plt.grid()
        plt.show()
|
|
|
|
# 4. Stacked bar charts of the categorical features against target.
# The guard is False, so nothing in this section runs unless it is flipped.
if False:
    for column_name in ["key", "mode", "time_signature"]:
        feature_and_target = main_df[[column_name, "target"]]
        # aggfunc=len tallies rows per (target, feature value); missing
        # combinations are filled with 0.
        tally = feature_and_target.pivot_table(
            columns=column_name,
            index="target",
            aggfunc=len,
            fill_value=0,
        )
        # Transpose so the feature values run along the x axis.
        # Optional row normalisation, left disabled:
        #   .apply(lambda x: x/sum(x), axis=1)
        df_new = tally.T

        ax = df_new.plot.bar(stacked=True)
        # Keep the tick labels horizontal.
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
        # NOTE(review): the style is selected after the bars are drawn;
        # presumably it only affects figures created later - confirm.
        plt.style.use('ggplot')
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.ylabel("occurrences")
        plt.title(f"Bar graph ({column_name} vs target)")
        plt.show()
|
|
|
|
# 5.
|
|
# Explain parts 1-4 in report
|
|
|
|
# 6. Train/test split (30% held out) and a baseline logistic regression.
# The split always runs; the model fit and report sit behind a False guard.
# Features are every column except the label and the "X", "song_title" and
# "artist" columns.
X_train, X_test, y_train, y_test = train_test_split(
    main_df.drop(columns=["target", "X", "song_title", "artist"]),
    main_df["target"],
    test_size=0.3,
)
if False:
    # Standardise using statistics learned from the training rows only, then
    # apply the same transformation to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the logistic regression on the scaled features.
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    preds = model.predict(X_test_scaled)
    # Second probability column; presumably P(target == 1) given 0/1 labels.
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]

    # The report uses labels obtained by cutting the probabilities at 0.5.
    labels_at_half = np.where(preds_probs > 0.5, 1, 0)
    print(classification_report(y_test, labels_at_half))

    # The confusion matrix uses the labels from model.predict().
    conf_matrix = confusion_matrix(y_test, preds)
    print("----- Confusion matrix ---------")
    print(" Predicted 0, Predicted 1")
    print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
    print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")

    # Coefficients are for the standardised features.
    print("---------- Coeffs --------------")
    print(model.coef_[0])

# Use sigmoid for equation
|
|
|
|
# 7. Refit the baseline model and compare decision thresholds 0.3, 0.4, 0.5.
# The guard is False, so nothing in this section runs unless it is flipped.
if False:
    # Standardise using statistics learned from the training rows only.
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train the logistic regression on the scaled features.
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)

    # Second probability column; presumably P(target == 1) given 0/1 labels.
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]

    for threshold in (0.3, 0.4, 0.5):
        # Rows whose probability exceeds the threshold are labelled 1.
        preds = np.where(preds_probs > threshold, 1, 0)

        print(f"=========== THRESHOLD: {threshold} ===========")
        print(classification_report(y_test, preds))

        conf_matrix = confusion_matrix(y_test, preds)
        print("----- Confusion matrix ---------")
        print(" Predicted 0, Predicted 1")
        print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
        print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
|
|
|
|
# 8. Drop-one-column study: how does test accuracy change when a single
# feature is removed, averaged over repeated random splits?
# The guard is False, so nothing in this section runs unless it is flipped.
if False:
    def get_accuracy(X_train, X_test, y_train, y_test):
        """Fit a scaled logistic regression and return its test accuracy.

        The scaler is fitted on ``X_train`` only and applied to both feature
        sets. The return value is the fraction of ``X_test`` rows whose
        predicted label equals ``y_test``.
        """
        scaler = StandardScaler().fit(X_train)

        model = LogisticRegression()
        model.fit(scaler.transform(X_train), y_train)

        preds = model.predict(scaler.transform(X_test))

        # Same quantity as accuracy_score(y_test, preds).
        return np.mean(preds == y_test)

    iterations = 50

    # Running sum, per feature, of (accuracy without it - baseline accuracy).
    diffs = dict.fromkeys(X_train.columns, 0)

    for _ in range(iterations):
        # A fresh random split each round; this rebinds the module-level
        # X_train / X_test / y_train / y_test names.
        X_train, X_test, y_train, y_test = train_test_split(
            main_df.drop(columns=["target", "X", "song_title", "artist"]),
            main_df["target"],
            test_size=0.3,
        )
        baseline_accuracy = get_accuracy(X_train, X_test, y_train, y_test)

        for column_name in X_train.columns:
            accuracy = get_accuracy(
                X_train.drop(columns=[column_name]),
                X_test.drop(columns=[column_name]),
                y_train,
                y_test,
            )
            diffs[column_name] += (accuracy - baseline_accuracy)

    # Largest accumulated change first; print the per-iteration average.
    diffs_array = sorted(diffs.items(), key=lambda e: -e[1])
    for column_name, diff in diffs_array:
        print(f"{column_name:20} {diff / iterations}")
|
|
|
|
# 9. Single-feature transformation search.
# For every feature and every applicable transformation, add the transformed
# copy as one extra column, refit the scaled logistic regression, and record
# overall and per-class accuracy at several decision thresholds. The five
# candidates with the best overall accuracy (at any threshold) are printed.
if True:
    # Earlier exploration, left disabled:
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(main_df.drop(columns=["X", "song_title", "artist"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
    # plt.title("Correlation Matrix")
    # plt.show()

    # main_df["duration_ms"] = normalize_column(main_df["duration_ms"])
    # main_df["loudness"] = normalize_column(main_df["loudness"])
    # main_df["tempo"] = normalize_column(main_df["tempo"])
    # main_df.drop(columns=["X"]).boxplot()
    # plt.show()

    # Largest input for which np.exp() still returns a finite float64
    # (about 709.78). Anything above it becomes inf, which would then be fed
    # to the scaler, so such columns must be excluded from "exp".
    max_exp_input = np.log(np.finfo(np.float64).max)

    # name -> (transform, criteria). `criteria` receives the full column from
    # main_df and says whether the transform is defined for all its values.
    transformations = {
        "log": (
            # Offset of 0.1 is applied before taking the log.
            lambda column: np.log(column + 0.1),
            lambda column: bool((column > 0).all()),
        ),
        "square": (
            lambda column: np.square(column),
            lambda column: True,
        ),
        "sqrt": (
            lambda column: np.sqrt(column),
            lambda column: bool((column >= 0).all()),
        ),
        # Despite the key, this is the cube root (power 1/3).
        "sqrt3": (
            lambda column: np.float_power(column, 1/3),
            lambda column: bool((column >= 0).all()),
        ),
        "exp": (
            lambda column: np.exp(column),
            lambda column: bool((column <= max_exp_input).all()),
        ),
    }

    thresholds = (0.3, 0.5, 0.7)

    results = []
    for column_name in X_train.columns:
        for transform_name, (transform, criteria) in transformations.items():
            if not criteria(main_df[column_name]):
                continue

            # Work on copies so every candidate starts from the same features.
            new_feature = f"{column_name}_{transform_name}"
            new_X_train = X_train.copy()
            new_X_test = X_test.copy()
            new_X_train[new_feature] = transform(X_train[column_name])
            new_X_test[new_feature] = transform(X_test[column_name])

            # Scale with statistics from the training rows only.
            scaler = StandardScaler().fit(new_X_train)
            X_train_scaled = scaler.transform(new_X_train)
            X_test_scaled = scaler.transform(new_X_test)

            model = LogisticRegression()
            model.fit(X_train_scaled, y_train)

            # Second probability column; presumably P(target == 1).
            preds_probs = model.predict_proba(X_test_scaled)[:, 1]

            result_entry = {
                "column": column_name,
                "transform": transform_name,
                "thresholds": {},
            }
            results.append(result_entry)

            for threshold in thresholds:
                preds = np.where(preds_probs > threshold, 1, 0)
                confusion = confusion_matrix(y_test, preds)
                # Diagonal / row total = fraction of each true class that was
                # predicted correctly.
                percentage_accuracy = np.diag(confusion) / np.sum(confusion, axis=1)
                result_entry["thresholds"][threshold] = {
                    "overall_accuracy": np.mean(y_test == preds),
                    "class0_accuracy": percentage_accuracy[0],
                    "class1_accuracy": percentage_accuracy[1],
                }

    # Rank candidates by their best overall accuracy across the thresholds.
    results.sort(key=lambda e: -max(a["overall_accuracy"] for a in e["thresholds"].values()))

    # None when no transformation qualified for any column (previously this
    # line raised IndexError on an empty list).
    best_result = results[0] if results else None

    for result in results[:5]:
        print("------")
        print("column", result["column"], result["transform"])
        for threshold, scores in result["thresholds"].items():
            print(f'[{threshold}] class0_accuracy {scores["class0_accuracy"]:.6f}')
            print(f'[{threshold}] class1_accuracy {scores["class1_accuracy"]:.6f}')
            print(f'[{threshold}] overall_accuracy {scores["overall_accuracy"]:.6f}')
        # Disabled: no "error_rate" entry is computed above.
        #print("error_rate", result["error_rate"])
|
|
|
|
# print("======================")
|
|
# print("overall_accuracy", best_result["overall_accuracy"])
|
|
# print("class0_accuracy", best_result["class0_accuracy"])
|
|
# print("class1_accuracy", best_result["class1_accuracy"])
|
|
# print("error_rate", best_result["error_rate"])
|
|
# print("threshold", best_result["threshold"])
|
|
# print("column", best_result["column"], best_result["transform"])
|
|
|
|
# Calculate ROC curve and AUC
|
|
# fpr, tpr, thresholds = roc_curve(y_test, best_result["probs"])
|
|
# roc_auc = auc(fpr, tpr)
|
|
|
|
# # Plot ROC curve
|
|
# plt.figure(figsize=(5,5))
|
|
# plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
|
|
# plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
|
|
# plt.xlim([0.0, 1.0])
|
|
# plt.ylim([0.0, 1.05])
|
|
# plt.xlabel('False Positive Rate')
|
|
# plt.ylabel('True Positive Rate')
|
|
# plt.title('Receiver Operating Characteristic (ROC) Curve')
|
|
# plt.legend(loc='lower right')
|
|
# plt.show()
|
|
|
|
# # Print AUC value
|
|
# print("AUC:", roc_auc)
|
|
|
|
|