"""Lab2/5.2/part1.py -- decision-tree classification of the Spotify data set.

Fits an unpruned and a cost-complexity-pruned ``DecisionTreeClassifier`` to
``music_spotify.csv`` and, for each, prints the hold-out accuracy, the mean
10-fold cross-validation score and the confusion matrix.  Optional diagnostic
plots are enabled through the ``SHOW_*`` flags below.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

# Plot switches; the defaults reproduce what the script displayed originally.
SHOW_UNPRUNED_TREE_WITH_CLASSES = False
SHOW_IMPURITY_VS_ALPHA = False
SHOW_CV_ERROR_VS_DEPTH = False
SHOW_ACCURACY_VS_ALPHA = False
SHOW_UNPRUNED_TREE = False
SHOW_PRUNED_TREE = False
SHOW_FEATURE_IMPORTANCES = True

RANDOM_STATE = 20     # seed shared by every tree so repeated runs agree
PRUNING_ALPHA = 0.006  # ccp_alpha of the pruned tree

# Load the Spotify data; 'X', 'artist' and 'song_title' are not used as features.
MusicSpotify = pd.read_csv('music_spotify.csv')
MusicSpotify = MusicSpotify.drop(columns=['X', 'artist', 'song_title'])

# Encode the class label as integers.  The result is assigned back: the
# previous code discarded it, which made the encoding a no-op.
label_encoder = LabelEncoder()
MusicSpotify['target'] = label_encoder.fit_transform(MusicSpotify['target'])

# Data division: every remaining column except the label is a feature.
X = MusicSpotify.drop(columns=['target'])
y = MusicSpotify['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


def report(title, model):
    """Print hold-out accuracy, 10-fold CV score and confusion matrix of *model*.

    *model* must already be fitted on ``X_train``; ``cross_val_score`` refits
    clones of it on folds of the full data set ``X``/``y``.
    """
    pred = model.predict(X_test)
    print(title)
    print(f"Accuracy: {accuracy_score(y_test, pred) * 100:.2f}%")
    print("10-fold cross-validation score: ", cross_val_score(model, X, y, cv=10).mean())
    # scikit-learn's signature is (y_true, y_pred): rows are the true classes.
    print("Confusion Matrix:\n", confusion_matrix(y_test, pred))


# Tree training (no cost complexity pruning)
tree_music_spotify = DecisionTreeClassifier(
    ccp_alpha=0.00, min_samples_split=2, random_state=RANDOM_STATE
)
tree_music_spotify.fit(X_train, y_train)

if SHOW_UNPRUNED_TREE_WITH_CLASSES:
    plt.figure(figsize=(12, 8))
    plot_tree(
        tree_music_spotify,
        filled=True,
        feature_names=X_train.columns.tolist(),
        # classes_ is in the order plot_tree expects; unique() is not sorted.
        class_names=tree_music_spotify.classes_.astype(str).tolist(),
        rounded=True,
        fontsize=10,
    )
    plt.show()

# Effective alphas of the minimal cost-complexity pruning path.
path = tree_music_spotify.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas, impurities = path.ccp_alphas, path.impurities

if SHOW_IMPURITY_VS_ALPHA:
    fig, ax = plt.subplots()
    # The last alpha is left out of the plot, as in the original script.
    ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
    ax.set_xlabel("effective alpha")
    ax.set_ylabel("total impurity of leaves")
    ax.set_title("Total Impurity vs effective alpha for training set")
    plt.grid()
    plt.show()

if SHOW_CV_ERROR_VS_DEPTH:
    # Vary max_depth and record the mean 10-fold cross-validated accuracy.
    depth_values = range(1, 25)
    cv_scores = [
        np.mean(
            cross_val_score(
                DecisionTreeClassifier(max_depth=depth, random_state=RANDOM_STATE),
                X, y, cv=10, scoring='accuracy',
            )
        )
        for depth in depth_values
    ]

    plt.plot(depth_values, 1 - np.array(cv_scores), marker='o')
    plt.xlabel('Tree depth')
    plt.ylabel('Cross-validated error rate')
    plt.title('Cross-validated error vs. tree depth')
    plt.grid(True)
    plt.show()

if SHOW_ACCURACY_VS_ALPHA:
    # Hold-out accuracy of one tree per effective alpha on the pruning path.
    accuracy_collect = [
        DecisionTreeClassifier(ccp_alpha=alpha, random_state=RANDOM_STATE)
        .fit(X_train, y_train)
        .score(X_test, y_test)
        for alpha in ccp_alphas
    ]

    plt.plot(np.array(ccp_alphas), np.array(accuracy_collect))
    plt.grid()
    plt.xlim(0, 0.03)
    plt.xlabel('CP alpha')
    plt.ylabel('Accuracy')
    plt.show()

if SHOW_UNPRUNED_TREE:
    plt.figure(figsize=(12, 6))
    plot_tree(tree_music_spotify, filled=True, feature_names=X.columns.tolist())
    plt.show()

# Prediction and accuracy
report("Unprunned:", tree_music_spotify)

# Pruning
music_spotify_prunned = DecisionTreeClassifier(ccp_alpha=PRUNING_ALPHA, random_state=RANDOM_STATE)
music_spotify_prunned.fit(X_train, y_train)
report("Prunned:", music_spotify_prunned)

if SHOW_PRUNED_TREE:
    plt.figure(figsize=(12, 6))
    plot_tree(music_spotify_prunned, filled=True, feature_names=X.columns.tolist())
    plt.show()

if SHOW_FEATURE_IMPORTANCES:
    # Importances of the unpruned tree, largest first.
    feat_importances = pd.DataFrame(
        tree_music_spotify.feature_importances_,
        index=X_train.columns,
        columns=["Importance"],
    )
    feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
    feat_importances.plot(kind='bar', figsize=(8, 6))
    plt.show()
"""Lab2/5.2/part2.py -- decision tree on the Sign-Language-MNIST pixel tables.

Trains a ``DecisionTreeClassifier`` on ``sign_mnist_train.csv``, reports its
accuracy on ``sign_mnist_test.csv`` and prints the feature importances laid out
as a 28x28 grid (one comma-separated row per line).  Optional experiments are
enabled through the flags below.
"""
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from time import time

SHOW_ACCURACY_VS_ALPHA = False
SHOW_TREE = False
RUN_STRONGER_PRUNING = False
PRINT_IMPORTANCE_GRID = True

IMAGE_SIDE = 28  # the grid printed at the end is IMAGE_SIDE x IMAGE_SIDE


def _every_nth_mask(df, count):
    """Return a boolean mask that is True on every ``count``-th column (1-based).

    Raises:
        ValueError: if *count* is smaller than 1 (the modulo would be undefined).
    """
    if count < 1:
        raise ValueError(f"count must be a positive integer, got {count!r}")
    return (np.arange(len(df.columns)) + 1) % count == 0


def remove_every_nth_column(df, count):
    """Return *df* without its ``count``-th, ``2*count``-th, ... columns."""
    return df.loc[:, ~_every_nth_mask(df, count)]


def leave_every_nth_column(df, count):
    """Return only the ``count``-th, ``2*count``-th, ... columns of *df*."""
    return df.loc[:, _every_nth_mask(df, count)]


print("Loading data")
train_df = pd.read_csv("sign_mnist_train.csv")
test_df = pd.read_csv("sign_mnist_test.csv")

X_train, X_test = train_df.drop(columns=["label"]), test_df.drop(columns=["label"])
y_train, y_test = train_df["label"], test_df["label"]

# Results recorded by the author for column-subsampling experiments
# (apply the same helper to both X_train and X_test to repeat one):
#   leave_every_nth_column(.., 5)   -> 0.6002509760178472
#   remove_every_nth_column(.., 15) -> 0.6167038482989403
#   remove_every_nth_column(.., 14) -> 0.6179587283881762
#   remove_every_nth_column(.., 11) -> 0.6183770217512549

start = time()
ccp_alpha = 0.0009
print("Training", ccp_alpha)
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
decision_tree.fit(X_train, y_train)
print(time() - start)

if SHOW_ACCURACY_VS_ALPHA:
    print("Viz")
    # The sweep used to read an undefined ``ccp_alphas``; take the effective
    # alphas from the pruning path of the training data instead.
    ccp_alphas = DecisionTreeClassifier().cost_complexity_pruning_path(X_train, y_train).ccp_alphas
    # ``alpha`` (not ``ccp_alpha``) so the value used for the main tree survives.
    accuracy_collect = [
        DecisionTreeClassifier(ccp_alpha=alpha).fit(X_train, y_train).score(X_test, y_test)
        for alpha in ccp_alphas
    ]

    plt.plot(np.array(ccp_alphas), np.array(accuracy_collect))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()

pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
# NOTE(review): the label says "Unprunned" although the tree above is trained
# with ccp_alpha=0.0009; the printed text is kept unchanged.
print("Unprunned:")
print(f"Accuracy: {accuracy:.2f}%")

if SHOW_TREE:
    plt.figure(figsize=(12, 6))
    plot_tree(decision_tree, filled=True, feature_names=X_test.columns.tolist())
    plt.show()

if RUN_STRONGER_PRUNING:
    # Pruning
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    pruned_pred = decision_tree_prunned.predict(X_test)
    print("Prunned:")
    # Score against the true labels; the old code compared the pruned tree's
    # predictions with the other tree's predictions.
    print(f"Accuracy: {accuracy_score(y_test, pruned_pred) * 100:.2f}%")
    # Cross-validate on the training data so the test set stays unseen.
    print(
        "10-fold cross-validation score: ",
        cross_val_score(decision_tree_prunned, X_train, y_train, cv=10).mean(),
    )

if PRINT_IMPORTANCE_GRID:
    importances = decision_tree.feature_importances_
    print(importances)
    print(len(importances))
    print(X_train.columns)

    # Cells keep the integer 0 unless a column named "pixel<k>" maps onto them.
    image = [[0] * IMAGE_SIDE for _ in range(IMAGE_SIDE)]
    for column, importance in zip(X_train.columns, importances):
        # Column "pixel<k>" is 1-based and fills the grid row by row.
        pixel_y, pixel_x = divmod(int(column.removeprefix("pixel")) - 1, IMAGE_SIDE)
        image[pixel_y][pixel_x] = importance

    for row in image:
        print(",".join(str(cell) for cell in row))
"""Lab2/6.1/example.py -- support-vector-machine walkthrough on synthetic data.

Covers a linear SVC, the effect of ``C``, grid-searching ``C``, an RBF kernel
on data that is not linearly separable, and a multiclass fit.  The numbered
comments follow the steps of the lab sheet.  Optional figures are enabled
through the ``SHOW_*`` flags.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix

SHOW_LINEAR_DATA = False
SHOW_LINEAR_FIT = False
SHOW_HIGH_C_FIT = False
SHOW_NONLINEAR_DATA = False
SHOW_RBF_FIT = False
RUN_RBF_TUNING = False
SHOW_MULTICLASS_DATA = False
SHOW_MULTICLASS_FIT = True

FEATURES = ['x1', 'x2']


def make_grid(x_range, y_range, steps=100):
    """Return ``(xx, yy, frame)`` for a regular grid over the two ranges.

    *frame* holds the grid points under the column names in ``FEATURES`` so it
    can be passed to models that were fitted on a DataFrame.
    """
    xx, yy = np.meshgrid(np.linspace(*x_range, steps), np.linspace(*y_range, steps))
    frame = pd.DataFrame({FEATURES[0]: xx.ravel(), FEATURES[1]: yy.ravel()})
    return xx, yy, frame


def scatter_points(x1, x2, labels):
    """Scatter the points, coloured by ``3 - label`` as in the lab sheet."""
    plt.scatter(x1, x2, c=(3 - labels), marker='o', edgecolors='k')


def plot_binary_margin(model, x1, x2, labels, x_range, y_range, xlim=None, ylim=None):
    """Draw the -1/0/+1 decision-function bands of a binary *model* with the points."""
    xx, yy, grid = make_grid(x_range, y_range)
    Z = model.decision_function(grid).reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    scatter_points(x1, x2, labels)
    if xlim is not None:
        plt.xlim(*xlim)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.grid(True)
    plt.show()


# 1. Linear SVC: two Gaussian clouds, the +1 class shifted by one unit.
np.random.seed(1)
x = np.random.normal(size=(20, 2))
y = np.concatenate([-np.ones(10), np.ones(10)])
x[y == 1, :] = x[y == 1, :] + 1
if SHOW_LINEAR_DATA:
    scatter_points(x[:, 0], x[:, 1], y)
    plt.grid(True)
    plt.show()

# 2. Linear SVC fitting
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
svm_model = svm.SVC(kernel='linear', C=10)
svm_model.fit(data[FEATURES], data['y'])

# 3. Visualizing
if SHOW_LINEAR_FIT:
    plot_binary_margin(
        svm_model, x[:, 0], x[:, 1], y,
        x_range=(-2.1, 2.5), y_range=(-1.3, 2.6),
        xlim=(-2.1, 2.5), ylim=(-1.3, 2.6),
    )

# 4. Indices of Support vectors
print("Indices of Support Vectors:", svm_model.support_)

# 6. Controlling the cost=1/C hyperparameter (here C is raised from 10 to 100).
svm_model_low_cost = svm.SVC(kernel='linear', C=100)
svm_model_low_cost.fit(data[FEATURES], data['y'])

if SHOW_HIGH_C_FIT:
    plot_binary_margin(
        svm_model_low_cost, x[:, 0], x[:, 1], y,
        x_range=(-5, 5), y_range=(-5, 5),
        xlim=(-2.1, 2.5), ylim=(-1.3, 2.6),
    )

print("Indices of Support Vectors (Low Cost):", svm_model_low_cost.support_)
print("Model Summary (Low Cost):\n", svm_model_low_cost)

# 7. Which cost value is better?  5-fold grid search over C.
np.random.seed(1)
tune_params = {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1, 2, 3, 4]}
svm_tune = GridSearchCV(svm.SVC(kernel='linear'), tune_params, cv=5)
svm_tune.fit(data[FEATURES], data['y'])

# 8. Using the best model
best_model = svm_tune.best_estimator_
print("Best Model:\n", best_model)

# 9. Testing your model on freshly drawn points with random labels.
x_test = np.random.normal(size=(20, 2))
y_test = np.random.choice([-1, 1], size=20, replace=True)
x_test[y_test == 1, :] = x_test[y_test == 1, :] + 1
test_data = pd.DataFrame({'x1': x_test[:, 0], 'x2': x_test[:, 1], 'y': y_test})

y_pred = best_model.predict(test_data[FEATURES])
# scikit-learn's signature is (y_true, y_pred): rows are the true classes.
print("Confusion Matrix:\n", confusion_matrix(test_data['y'], y_pred))

# 10. Is linear boundary always the best choice?
# Class -1 forms two clouds (shifted by +2 and -2); class +1 sits between them.
np.random.seed(1)
x = np.random.normal(size=(200, 2))
x[:100, :] = x[:100, :] + 2
x[100:150, :] = x[100:150, :] - 2
y = np.concatenate([-np.ones(150), np.ones(50)])
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
if SHOW_NONLINEAR_DATA:
    scatter_points(x[:, 0], x[:, 1], y)
    plt.grid(True)
    plt.show()

# 11. Splitting and fitting
train, test = train_test_split(data, test_size=0.2, random_state=1)
svm_fit = svm.SVC(kernel='rbf', gamma=0.05, C=100)
svm_fit.fit(train[FEATURES], train['y'])

if SHOW_RBF_FIT:
    plot_binary_margin(
        svm_fit, train['x1'], train['x2'], train['y'],
        x_range=(-5, 5), y_range=(-5, 5),
    )

print("Model Summary:\n", svm_fit)

# Tuning hyperparameters
if RUN_RBF_TUNING:
    tune_params_rbf = {'C': np.arange(0.1, 3.1, 0.2), 'gamma': np.arange(0.1, 5.1, 0.1)}
    svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
    svm_tune_rbf.fit(train[FEATURES], train['y'])

    print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
    print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)

    # 12. Prediction
    pred_rbf = svm_tune_rbf.predict(test[FEATURES])
    print("Confusion Matrix (RBF Kernel):\n", confusion_matrix(test['y'], pred_rbf))

# 13. Multiclass: add 50 points of class 0 (shifted up by 2) and a single
# point of class 2 at the origin to the data from step 10.
np.random.seed(1)
x = np.vstack([x, np.random.normal(size=(50, 2))])
y = np.concatenate([y, np.zeros(50)])
x = np.vstack([x, [[0, 0]]])
y = np.concatenate([y, [2]])

x[y == 0, 1] = x[y == 0, 1] + 2
data_multiclass = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})

if SHOW_MULTICLASS_DATA:
    scatter_points(x[:, 0], x[:, 1], y)
    plt.grid(True)
    plt.show()

svm_fit_multiclass = svm.SVC(kernel='linear')
svm_fit_multiclass.fit(data_multiclass[FEATURES], data_multiclass['y'])

if SHOW_MULTICLASS_FIT:
    # Show the multiclass model itself (the old code drew the binary RBF model
    # from step 11).  Its decision function has one column per class, so the
    # regions are coloured by predicted label instead.
    xx, yy, grid = make_grid((-5, 5), (-5, 5))
    Z = svm_fit_multiclass.predict(grid).reshape(xx.shape)
    # One band per label value -1, 0, 1, 2.
    plt.contourf(xx, yy, Z, levels=[-1.5, -0.5, 0.5, 1.5, 2.5], cmap=plt.cm.terrain, alpha=0.5)
    scatter_points(x[:, 0], x[:, 1], y)
    plt.xlim(-5, 5)
    plt.ylim(-5, 5)
    plt.grid(True)
    plt.show()
"""Lab2/6.1/main.py -- SVM classification of the satellite data set.

Fits an RBF-kernel ``SVC`` on ``satellite_train.csv`` (label column ``V37``),
prints accuracy and the confusion matrix on ``satellite_test.csv`` and a
10-fold cross-validation score.  Optional plots and grid searches are enabled
through the flags below.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
from sklearn.decomposition import PCA

SHOW_CLASS_DISTRIBUTION = False
SHOW_TWO_FEATURE_BOUNDARY = False
SHOW_PCA_PROJECTION = False
RUN_LINEAR_GRID_SEARCH = False
RUN_RBF_GRID_SEARCH = False

TARGET = "V37"
PLOT_FEATURES = ["V1", "V2"]  # the two features used for every 2-D figure
CLASS_LABELS = ["red soil", "cotton crop", "grey soil", "damp grey soil",
                "soil with vegetation", "very damp grey soil"]

train_df = pd.read_csv("satellite_train.csv")
test_df = pd.read_csv("satellite_test.csv")

X_train, X_test = train_df.drop(columns=[TARGET]), test_df.drop(columns=[TARGET])
y_train, y_test = train_df[TARGET], test_df[TARGET]


def plot_two_feature_model(params, *, show_points=False, show_contour=True,
                           show_support_vectors=False, title=None):
    """Fit an ``SVC(**params)`` on ``PLOT_FEATURES`` only and draw it.

    The models used for scoring are fitted on every feature and therefore
    cannot predict on a two-column grid, so a separate two-feature model with
    the same hyper-parameters is fitted just for the figure.
    """
    model = svm.SVC(**params)
    model.fit(X_train[PLOT_FEATURES], y_train)
    first, second = PLOT_FEATURES

    plt.figure(figsize=(8, 6))
    if show_points:
        plt.scatter(X_train[first], X_train[second], c=y_train,
                    cmap=plt.cm.Paired, marker='.', s=20)

    if show_contour:
        x_min, x_max = X_train[first].min(), X_train[first].max()
        y_min, y_max = X_train[second].min(), X_train[second].max()
        xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
        grid = pd.DataFrame({first: xx.ravel(), second: yy.ravel()})
        # contour() needs a 2-D array shaped like the mesh.
        Z = model.predict(grid).reshape(xx.shape)
        plt.contour(xx, yy, Z, colors='k', alpha=0.5)

    if show_support_vectors:
        vectors = model.support_vectors_
        plt.scatter(vectors[:, 0], vectors[:, 1], s=20, linewidth=1, facecolors='none',
                    edgecolors='k', marker='o', label='Support Vectors')
        plt.legend()

    if title is not None:
        plt.title(title)
    plt.xlabel('Feature V1')
    plt.ylabel('Feature V2')
    plt.show()


if SHOW_CLASS_DISTRIBUTION:
    value_counts = y_train.value_counts(sort=False)
    asc_index = sorted(value_counts.index)
    asc_values = [value_counts[idx] for idx in asc_index]

    plt.bar(asc_index, asc_values, alpha=0.7)
    plt.xticks(asc_index, labels=CLASS_LABELS)
    plt.xlabel("Values")
    plt.ylabel("Frequency")
    plt.show()

# Hyper-parameters of the final model.  Values the author tried in a grid search:
# C in [0.17, 0.18, 0.19, 0.2], gamma in [0.00023 .. 0.00027], cv=5.
TUNED_PARAMS = {'kernel': 'rbf', 'C': 0.19, 'gamma': 0.00024}
clf_tuned = svm.SVC(**TUNED_PARAMS)
clf_tuned.fit(X_train, y_train)
pred = clf_tuned.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
print(f"Accuracy: {accuracy:.2f}%")
# scikit-learn's signature is (y_true, y_pred): rows are the true classes.
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
# Cross-validate on the training data; doing it on the test set would refit
# the model on test samples and say nothing about the model evaluated above.
print("10-fold cross-validation score: ", cross_val_score(clf_tuned, X_train, y_train, cv=10).mean())

if SHOW_TWO_FEATURE_BOUNDARY:
    plot_two_feature_model(TUNED_PARAMS, show_points=True)

if SHOW_PCA_PROJECTION:
    # Apply PCA to reduce the dimensionality to 2D
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X_train)

    # Plot the 2D representation of the data
    plt.figure(figsize=(8, 6))
    plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_train, cmap=plt.cm.Paired, marker='.', s=20)
    plt.title('2D Projection of High-Dimensional Data using PCA')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()

if RUN_LINEAR_GRID_SEARCH:
    tune_params_linear = {'C': np.arange(0.1, 2.1, 0.2)}
    svm_tune_linear = GridSearchCV(svm.SVC(kernel='linear'), tune_params_linear, cv=10)
    svm_tune_linear.fit(X_train, y_train)

    print("Best Model (Linear Kernel):\n", svm_tune_linear.best_estimator_)
    print("Best Parameters (Linear Kernel):\n", svm_tune_linear.best_params_)

    # The decision contour was disabled in the original figure; only the
    # support vectors of the best configuration are drawn.
    plot_two_feature_model(
        svm_tune_linear.best_estimator_.get_params(),
        show_contour=False,
        show_support_vectors=True,
        title='SVM Decision Boundary',
    )

if RUN_RBF_GRID_SEARCH:
    tune_params_rbf = {'C': np.arange(0.1, 2.1, 0.2), 'gamma': np.arange(0.1, 4.1, 0.1)}
    svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
    svm_tune_rbf.fit(X_train, y_train)

    print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
    print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)

    plot_two_feature_model(
        svm_tune_rbf.best_estimator_.get_params(),
        show_support_vectors=True,
        title='SVM Decision Boundary',
    )
"""Lab2/7.1/example.py -- H2O deep-learning example on the customer-churn data.

Trains a small (2x2) and a deeper (10x10x10) ``H2ODeepLearningEstimator`` to
predict the ``Churn`` column of ``customer_churn.csv`` and prints the hold-out
accuracy of both, plus a confusion matrix for the small network.
"""
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

SHOW_TENURE_DENSITY = False
SHOW_SHALLOW_LOSS_CURVE = False

TARGET = "Churn"
N_PREDICTORS = 19  # the original script used the first 19 columns as inputs


def build_model(hidden):
    """Return an untrained estimator; only the hidden-layer sizes differ between runs."""
    return H2ODeepLearningEstimator(
        activation="Rectifier",
        hidden=hidden,
        loss="CrossEntropy",
        score_each_iteration=True,
        epochs=10000,
        balance_classes=False,
        rate=0.01,
        adaptive_rate=False,
        stopping_rounds=0,
        classification_stop=-1,
    )


def plot_training_logloss(model):
    """Plot the training log-loss recorded in *model*'s scoring history."""
    history = model.scoring_history()
    plt.plot(history.epochs, history.training_logloss)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.grid()
    plt.show()


def predict_labels(model, frame):
    """Return ``(predicted, actual)`` pandas Series for *frame*."""
    predicted = model.predict(frame)["predict"].as_data_frame()["predict"]
    actual = frame.as_data_frame()[TARGET]
    return predicted, actual


# Initialize H2O cluster
h2o.init(nthreads=-1, max_mem_size="2G")

# Load data
data = pd.read_csv("customer_churn.csv")

tenure_churned = data[data[TARGET] == 'Yes']['tenure']
tenure_stayed = data[data[TARGET] == 'No']['tenure']

if SHOW_TENURE_DENSITY:
    plt.figure()
    sns.kdeplot(tenure_churned, color='firebrick', linewidth=3, label='Churned')
    sns.kdeplot(tenure_stayed, color='dodgerblue', linewidth=3, label='Stayed')

    plt.title('Tenure Variable')
    plt.xlabel('Number of months the customer is with the company')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

# Split data.  Splitting the whole frame keeps features and target together,
# so nothing has to be assigned back onto the split outputs afterwards.
seed = 1
train, test = train_test_split(data, test_size=0.2, random_state=seed)

h2o_train = h2o.H2OFrame(train)
h2o_test = h2o.H2OFrame(test)

# First N_PREDICTORS non-target columns; selecting by name guarantees the
# target can never end up among the inputs.
predictors = [column for column in data.columns if column != TARGET][:N_PREDICTORS]

# Training the neural network
dl_model = build_model([2, 2])
dl_model.train(x=predictors, y=TARGET, training_frame=h2o_train)

if SHOW_SHALLOW_LOSS_CURVE:
    plot_training_logloss(dl_model)

# Prediction and accuracy
predicted, actual = predict_labels(dl_model, h2o_test)
accuracy = (predicted == actual).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

conf_matrix = pd.crosstab(predicted, actual, rownames=["Predicted"], colnames=["Actual"])
# Share of each actual class that was predicted correctly.
# NOTE(review): assumes both classes are predicted at least once, so that the
# table is square.
conf_matrix_diag = conf_matrix.values.diagonal() / conf_matrix.sum(axis=0) * 100
print("Confusion Matrix:")
print(conf_matrix)
print("Diagonal Percentages:")
print(conf_matrix_diag)

print("-------------- DEEPER --------------")

# Deeper model (same settings, three hidden layers of ten units).
dl_model_balanced = build_model([10, 10, 10])
dl_model_balanced.train(x=predictors, y=TARGET, training_frame=h2o_train)

# Prediction and accuracy
predicted, actual = predict_labels(dl_model_balanced, h2o_test)
accuracy = (predicted == actual).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

# The figure was built but never displayed before; show it once the numbers
# are printed so the blocking window does not delay them.
plot_training_logloss(dl_model_balanced)
"""Lab2/7.1/main.py -- H2O deep-learning classifier for the arrhythmia data.

Reads ``arrhythmia_train.csv`` / ``arrhythmia_test.csv`` (features ``X1``..,
label column ``arrhythmia``), optionally draws exploratory plots, then trains
an ``H2ODeepLearningEstimator`` and prints accuracy and a confusion matrix.
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

SHOW_SAMPLE_SIGNALS = False      # 1.
SHOW_FEATURE_SCATTER = False     # 2.
SHOW_TRAIN_DISTRIBUTION = False  # 3.
SHOW_TEST_DISTRIBUTION = False   # 3.
RUN_DEEP_LEARNING = True         # 4.

TARGET = "arrhythmia"
CLASSES = [0, 1, 2, 4]
CLASS_LABELS = ["normal (0)", "supraventricular (1)", "ventricular (2)", "unknown (4)"]
SIGNAL_LENGTH = 187  # number of X<k> columns read per sample in plot 1

train_df = pd.read_csv("arrhythmia_train.csv")
test_df = pd.read_csv("arrhythmia_test.csv")

# The exploratory plots below need these; they used to be commented out, so
# enabling any plot raised NameError.
X_train, X_test = train_df.drop(columns=[TARGET]), test_df.drop(columns=[TARGET])
y_train, y_test = train_df[TARGET], test_df[TARGET]


def plot_class_distribution(labels, title):
    """Bar chart of how often each class occurs in *labels*."""
    value_counts = labels.value_counts(sort=False)
    asc_index = sorted(value_counts.index)
    asc_values = [value_counts[idx] for idx in asc_index]

    plt.bar(asc_index, asc_values, alpha=0.7)
    plt.xticks(asc_index, labels=CLASS_LABELS)
    plt.ylabel("Frequencies")
    plt.title(title)
    plt.show()


# 1. Plot the first few signals of one class.
if SHOW_SAMPLE_SIGNALS:
    target_class = 0
    max_count = 10
    count = 0
    cmap = plt.get_cmap('hsv', max_count)
    # idx keeps growing across samples, so the signals are drawn one after another.
    idx = 0
    for i in range(len(X_train["X1"])):
        if y_train[i] != target_class:
            continue
        x = []
        y = []
        for j in range(SIGNAL_LENGTH):
            value = X_train[f"X{j+1}"][i]
            if value == 0:  # the first zero ends the plotted part of the signal
                break
            x.append(idx)
            y.append(value)
            idx += 1

        plt.plot(x, y, c=cmap(count))

        count += 1
        if count == max_count:
            break
    plt.show()

# 2. Scatter of two features, one colour per class.
if SHOW_FEATURE_SCATTER:
    x_feature = "X20"
    y_feature = "X50"

    # Iterate the classes of the data actually plotted (training set).
    for point_class in y_train.unique():
        members = X_train[y_train == point_class]
        plt.scatter(members[x_feature], members[y_feature], s=2, label=str(point_class))

    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.show()

# 3. Class distributions.
if SHOW_TRAIN_DISTRIBUTION:
    plot_class_distribution(y_train, "Training dataset class distribution")

if SHOW_TEST_DISTRIBUTION:
    plot_class_distribution(y_test, "Test dataset class distribution")

# 4. Neural network.
if RUN_DEEP_LEARNING:
    # Initialize H2O cluster
    h2o.init(nthreads=-1, max_mem_size="6G")

    h2o_train = h2o.H2OFrame(train_df)
    h2o_test = h2o.H2OFrame(test_df)

    # The label is stored as integers; without this conversion H2O treats the
    # problem as regression and returns real-valued predictions, which the
    # old code then had to round into classes.
    h2o_train[TARGET] = h2o_train[TARGET].asfactor()
    h2o_test[TARGET] = h2o_test[TARGET].asfactor()

    # Select predictors by name instead of assuming the label is the last column.
    predictors = [column for column in h2o_train.columns if column != TARGET]

    # Training the neural network
    dl_model = H2ODeepLearningEstimator(
        activation="tanh",
        hidden=[50, 20],
        input_dropout_ratio=0.2,
        l1=1e-5,
        epochs=20,
    )

    # NOTE(review): the test frame doubles as the validation frame, so any
    # validation-driven model selection inside H2O sees the test data --
    # consider holding out part of the training set instead.
    dl_model.train(x=predictors, y=TARGET, training_frame=h2o_train, validation_frame=h2o_test)

    # Prediction
    predicted = dl_model.predict(h2o_test)["predict"].as_data_frame()["predict"].astype(int)
    expected = h2o_test.as_data_frame()[TARGET].astype(int)

    # Calculate accuracy
    accuracy = (predicted == expected).mean() * 100
    print(f"Accuracy: {accuracy:.2f}%")

    # Rows are the expected classes, columns the predicted ones; reindex so
    # every class appears even when it is never predicted.
    conf_matrix = pd.crosstab(
        expected, predicted, rownames=["Expected"], colnames=["Predicted"]
    ).reindex(index=CLASSES, columns=CLASSES, fill_value=0)
    print("Confusion Matrix:")
    print(conf_matrix)