import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import boxcox
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


def normalize_column(column):
    """Rescale *column* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        column: A numeric pandas Series or NumPy array (anything exposing
            ``.min()``/``.max()`` and element-wise arithmetic).

    Returns:
        ``(column - min) / (max - min)`` computed element-wise. When every
        value is identical (``max == min``) the result is all zeros instead
        of the NaN/inf a 0/0 division would produce.
    """
    # Use the container's own vectorised reductions rather than the Python
    # builtins, which loop element by element.
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value
    if value_range == 0:
        # Constant column: dividing by 1 keeps the float result type and
        # yields 0 for every element.
        value_range = 1
    return (column - min_value) / value_range


def _plot_density_by_target(column_names):
    """Show one density plot per column, with target 0 and target 1 overlaid.

    Reads the module-level ``main_df`` at call time; each figure is displayed
    with ``plt.show()`` before the next one is drawn.
    """
    for column_name in column_names:
        main_df[main_df["target"] == 0][column_name].plot.density(color="green", label="0")
        main_df[main_df["target"] == 1][column_name].plot.density(color="red", label="1")
        plt.title(f"Density plot ({column_name} vs target)")
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.xlabel(column_name)
        plt.grid()
        plt.show()


# 1. Load the dataset.
main_df = pd.read_csv("music_spotify.csv")
# https://developer.spotify.com/documentation/web-api/reference/get-audio-features
#print(len(main_df.columns))

# 2. Density plots per target class (disabled).
if False:
    # NOTE(review): this adds a normalised "duration_ms_01" column, but the
    # plot below still uses the raw "duration_ms"; the extra column would also
    # remain in main_df for every later step. Confirm whether it is needed.
    main_df["duration_ms_01"] = normalize_column(main_df["duration_ms"])
    _plot_density_by_target(
        ["acousticness", "danceability", "energy", "instrumentalness", "duration_ms"]
    )

# 3. Density plots per target class for the remaining features (disabled).
if False:
    # main_df["loudness_01"] = normalize_column(main_df["loudness"])
    # main_df["tempo_01"] = normalize_column(main_df["tempo"])
    _plot_density_by_target(["liveness", "loudness", "speechiness", "tempo", "valence"])

# 4.
if False:
    # Select the style BEFORE creating any figure: matplotlib applies style
    # settings to figures created after the call, so doing this inside the
    # loop left the first bar graph in a different style from the others.
    plt.style.use('ggplot')
    for column_name in ["key", "mode", "time_signature"]:
        # Rows: values of the categorical column; columns: target class;
        # cells: number of rows with that combination.
        df_new = main_df[[column_name, "target"]].pivot_table(
            columns=column_name, index="target", aggfunc=len, fill_value=0
        ).T  #.apply(lambda x: x/sum(x), axis=1)
        ax = df_new.plot.bar(stacked=True)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=0)
        plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), title="target", framealpha=0)
        plt.ylabel("occurrences")
        plt.title(f"Bar graph ({column_name} vs target)")
        plt.show()

# 5.
# Explain parts 1-4 in report

# 6. Train/test split (always executed) and baseline logistic regression (disabled).
# No random_state is passed, so the split differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    main_df.drop(columns=["target", "X", "song_title", "artist"]),
    main_df["target"],
    test_size=0.3,
)
if False:
    # Fit the scaler on the training split only, then apply it to both splits.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train logistic regression on the scaled features.
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    # Column 1 of predict_proba is the probability of class 1.
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]

    print(classification_report(y_test, np.where(preds_probs > 0.5, 1, 0)))
    conf_matrix = confusion_matrix(y_test, preds)
    print("----- Confusion matrix ---------")
    print(" Predicted 0, Predicted 1")
    print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
    print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")
    print("---------- Coeffs --------------")
    print(model.coef_[0])
    # Use sigmoid for equation

# 7.
if False:
    # Threshold sweep (disabled): fit once, then report metrics at several
    # decision thresholds on the class-1 probability.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)
    preds_probs = model.predict_proba(X_test_scaled)[:, 1]

    for threshold in [0.3, 0.4, 0.5]:
        preds = np.where(preds_probs > threshold, 1, 0)
        print(f"=========== THRESHOLD: {threshold} ===========")
        print(classification_report(y_test, preds))
        conf_matrix = confusion_matrix(y_test, preds)
        print("----- Confusion matrix ---------")
        print(" Predicted 0, Predicted 1")
        print(f"True 0 {conf_matrix[0,0]:12d}, {conf_matrix[0,1]:12d}")
        print(f"True 1 {conf_matrix[1,0]:12d}, {conf_matrix[1,1]:12d}")

# 8. Leave-one-feature-out accuracy study (disabled).
if False:
    def get_accuracy(X_train, X_test, y_train, y_test):
        """Return the test accuracy of a logistic regression on scaled features.

        The scaler is fitted on ``X_train`` only and applied to both splits;
        the returned value is the fraction of ``y_test`` entries that equal
        the model's predictions.
        """
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train_scaled = scaler.transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        model = LogisticRegression()
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
        #return accuracy_score(y_test, preds)
        return np.mean(preds == y_test)

    iterations = 50
    # Accumulated (accuracy without the feature - baseline accuracy) per feature.
    diffs = {column_name: 0 for column_name in X_train.columns}
    for _ in range(iterations):
        # NOTE: this rebinds the module-level split, so any later step runs on
        # the last split drawn here whenever this section is enabled.
        X_train, X_test, y_train, y_test = train_test_split(
            main_df.drop(columns=["target", "X", "song_title", "artist"]),
            main_df["target"],
            test_size=0.3,
        )
        baseline_accuracy = get_accuracy(X_train, X_test, y_train, y_test)
        for column_name in X_train.columns:
            accuracy = get_accuracy(
                X_train.drop(columns=[column_name]),
                X_test.drop(columns=[column_name]),
                y_train,
                y_test,
            )
            diffs[column_name] += (accuracy - baseline_accuracy)
            #diffs.append((column_name, accuracy - baseline_accuracy))

    # Largest mean change first (stable for ties).
    diffs_array = sorted(diffs.items(), key=lambda item: item[1], reverse=True)
    for (column_name, diff) in diffs_array:
        print(f"{column_name:20} {diff / iterations}")

# 9. Search for a single extra transformed feature that improves accuracy.
if True:
    # plt.figure(figsize=(8, 6))
    # sns.heatmap(main_df.drop(columns=["X", "song_title", "artist"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
    # plt.title("Correlation Matrix")
    # plt.show()

    # main_df["duration_ms"] = normalize_column(main_df["duration_ms"])
    # main_df["loudness"] = normalize_column(main_df["loudness"])
    # main_df["tempo"] = normalize_column(main_df["tempo"])
    # main_df.drop(columns=["X"]).boxplot()
    # plt.show()

    # Largest input for which np.exp is still finite in float64 (about 709.78).
    # The previous hard-coded limit of 1000 let np.exp overflow to inf, which
    # StandardScaler then rejects.
    max_exp_input = np.log(np.finfo(np.float64).max)

    # name -> (transform applied to a column, eligibility check on the full column)
    transformations = {
        "log": (
            # NOTE(review): the 0.1 offset is applied even though eligibility
            # already requires strictly positive values - confirm intent.
            lambda column: np.log(column + 0.1),
            lambda column: (column > 0).all(),
        ),
        "square": (
            lambda column: np.square(column),
            lambda column: True,
        ),
        "sqrt": (
            lambda column: np.sqrt(column),
            lambda column: (column >= 0).all(),
        ),
        # Cube root despite the "sqrt3" key.
        "sqrt3": (
            lambda column: np.float_power(column, 1/3),
            lambda column: (column >= 0).all(),
        ),
        "exp": (
            lambda column: np.exp(column),
            lambda column: (column <= max_exp_input).all(),
        ),
    }

    results = []
    for column_name in X_train.columns:
        for (transform_name, (transform, criteria)) in transformations.items():
            # Eligibility is judged on the whole dataset so train and test
            # splits are treated consistently.
            if not criteria(main_df[column_name]):
                continue

            # Add the transformed feature alongside the original ones.
            new_feature = f"{column_name}_{transform_name}"
            new_X_train = X_train.copy()
            new_X_test = X_test.copy()
            new_X_train[new_feature] = transform(X_train[column_name])
            new_X_test[new_feature] = transform(X_test[column_name])

            scaler = StandardScaler()
            scaler.fit(new_X_train)
            X_train_scaled = scaler.transform(new_X_train)
            X_test_scaled = scaler.transform(new_X_test)

            model = LogisticRegression()
            model.fit(X_train_scaled, y_train)
            preds_probs = model.predict_proba(X_test_scaled)[:, 1]

            result_entry = {
                "column": column_name,
                "transform": transform_name,
                "thresholds": {},
            }
            results.append(result_entry)

            for threshold in [0.3, 0.5, 0.7]:
                preds = np.where(preds_probs > threshold, 1, 0)
                confusion = confusion_matrix(y_test, preds)
                # Per true class: correctly predicted count / number of rows
                # of that class (diagonal over row sums).
                percentage_accuracy = np.diag(confusion) / np.sum(confusion, axis=1)
                result_entry["thresholds"][threshold] = {
                    "overall_accuracy": np.mean(y_test == preds),
                    "class0_accuracy": percentage_accuracy[0],
                    "class1_accuracy": percentage_accuracy[1],
                }

    # Rank candidates by their best overall accuracy across thresholds
    # (descending, stable for ties).
    results.sort(
        key=lambda entry: max(m["overall_accuracy"] for m in entry["thresholds"].values()),
        reverse=True,
    )
    # Guarded so an empty candidate list does not raise IndexError.
    best_result = results[0] if results else None

    for result in results[:5]:
        print("------")
        print("column", result["column"], result["transform"])
        for threshold, metrics in result["thresholds"].items():
            print(f'[{threshold}] class0_accuracy {metrics["class0_accuracy"]:.6f}')
            print(f'[{threshold}] class1_accuracy {metrics["class1_accuracy"]:.6f}')
            print(f'[{threshold}] overall_accuracy {metrics["overall_accuracy"]:.6f}')
            #print("error_rate", result["error_rate"])

    # NOTE: the disabled code below reads keys ("overall_accuracy",
    # "error_rate", "threshold", "probs") that result entries do not carry at
    # the top level; it needs updating before it can be re-enabled.
    # print("======================")
    # print("overall_accuracy", best_result["overall_accuracy"])
    # print("class0_accuracy", best_result["class0_accuracy"])
    # print("class1_accuracy", best_result["class1_accuracy"])
    # print("error_rate", best_result["error_rate"])
    # print("threshold", best_result["threshold"])
    # print("column", best_result["column"], best_result["transform"])

    # Calculate ROC curve and AUC
    # fpr, tpr, thresholds = roc_curve(y_test, best_result["probs"])
    # roc_auc = auc(fpr, tpr)

    # # Plot ROC curve
    # plt.figure(figsize=(5,5))
    # plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
    # plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # plt.xlim([0.0, 1.0])
    # plt.ylim([0.0, 1.05])
    # plt.xlabel('False Positive Rate')
    # plt.ylabel('True Positive Rate')
    # plt.title('Receiver Operating Characteristic (ROC) Curve')
    # plt.legend(loc='lower right')
    # plt.show()

    # # Print AUC value
    # print("AUC:", roc_auc)