Remove figures

This commit is contained in:
Rokas Puzonas 2023-12-05 23:50:32 +02:00
parent 61313c1e87
commit a0dd07b94a
8 changed files with 747 additions and 0 deletions

1
.gitignore vendored
View File

@ -1 +1,2 @@
Lab1/Figures/
Lab2/Figures/

118
Lab2/5.2/part1.py Normal file
View File

@ -0,0 +1,118 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
# Load the Spotify song data and drop the columns that are not used as
# features ('X' is presumably a row index; 'artist' and 'song_title' are
# presumably free text -- TODO confirm against the CSV).
MusicSpotify = pd.read_csv('music_spotify.csv')
MusicSpotify = MusicSpotify.drop(columns=['X', 'artist', 'song_title'])
#print(MusicSpotify.head())

# Encode the class label as integers. fit_transform returns a new array and
# does not modify the column in place, so the result has to be assigned back.
label_encoder = LabelEncoder()
MusicSpotify['target'] = label_encoder.fit_transform(MusicSpotify['target'])

# Data division: every remaining column except the class label is a feature.
X = MusicSpotify.drop(columns=['target'])
y = MusicSpotify['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Tree training (no cost complexity pruning). random_state is fixed, matching
# the pruned tree further down, so that repeated runs give the same tree.
tree_music_spotify = DecisionTreeClassifier(ccp_alpha=0.00, min_samples_split=2, random_state=20)
tree_music_spotify.fit(X_train, y_train)
pred_rbf = tree_music_spotify.predict(X_test)
if False:
    # Plot the unpruned decision tree with matplotlib (disabled by default).
    feature_names = X_train.columns.tolist()
    # plot_tree expects class names in ascending class order. classes_ is
    # stored in that order, whereas y_train.unique() follows order of
    # appearance and can mislabel the leaves.
    class_names = tree_music_spotify.classes_.astype(str).tolist()
    plt.figure(figsize=(12, 8))
    plot_tree(tree_music_spotify, filled=True, feature_names=feature_names, class_names=class_names, rounded=True, fontsize=10)
    plt.show()
# Cost-complexity pruning path of the unpruned tree on the training split:
# the candidate alphas and the total leaf impurity at each of them.
path = tree_music_spotify.cost_complexity_pruning_path(X_train, y_train)
ccp_alphas = path.ccp_alphas
impurities = path.impurities
if False:
    # Impurity as a step function of alpha; the last path entry is left out.
    fig, ax = plt.subplots()
    ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
    ax.set(
        xlabel="effective alpha",
        ylabel="total impurity of leaves",
        title="Total Impurity vs effective alpha for training set",
    )
    plt.grid()
    plt.show()
if False:
    # Sweep the maximum tree depth and record the mean 10-fold CV accuracy
    # on the full data set for each depth.
    depth_values = range(1, 25)
    cv_scores = [
        np.mean(
            cross_val_score(
                DecisionTreeClassifier(max_depth=depth, random_state=20),
                X, y, cv=10, scoring='accuracy',
            )
        )
        for depth in depth_values
    ]
    # Error rate is the complement of accuracy.
    cv_errors = 1 - np.array(cv_scores)
    plt.plot(depth_values, cv_errors, marker='o')
    plt.xlabel('Tree depth')
    plt.ylabel('Cross-validated error rate')
    plt.title('Cross-validated error vs. tree depth')
    plt.grid(True)
    plt.show()
if False:
    # Refit one tree per alpha on the pruning path and plot its accuracy on
    # the held-out split.
    ccp_alphas_collect = list(ccp_alphas)
    accuracy_collect = [
        DecisionTreeClassifier(ccp_alpha=alpha).fit(X_train, y_train).score(X_test, y_test)
        for alpha in ccp_alphas_collect
    ]
    plt.plot(np.array(ccp_alphas_collect), np.array(accuracy_collect))
    plt.grid()
    plt.xlim(0, 0.03)
    plt.xlabel('CP alpha')
    plt.ylabel('Accuracy')
    plt.show()
if False:
    # Draw the full (unpruned) tree; disabled by default.
    plt.figure(figsize=(12, 6))
    plot_tree(tree_music_spotify, filled=True, feature_names=X.columns.tolist())
    plt.show()

# Prediction and accuracy of the unpruned tree on the held-out split.
pred = tree_music_spotify.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
print("Unprunned:")
print(f"Accuracy: {accuracy:.2f}%")
print("10-fold cross-validation score: ", cross_val_score(tree_music_spotify, X, y, cv=10).mean())
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones. Passing them the other way round transposes it.
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
# Pruning: refit with a non-zero cost-complexity penalty and evaluate the
# resulting smaller tree the same way as the unpruned one.
music_spotify_prunned = DecisionTreeClassifier(ccp_alpha=0.006, random_state=20)
music_spotify_prunned.fit(X_train, y_train)
pred = music_spotify_prunned.predict(X_test)
print("Prunned:")
print(f"Accuracy: {accuracy_score(y_test, pred) * 100:.2f}%")
print("10-fold cross-validation score: ", cross_val_score(music_spotify_prunned, X, y, cv=10).mean())
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones.
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
if False:
    # Draw the pruned tree; disabled by default.
    plt.figure(figsize=(12, 6))
    plot_tree(music_spotify_prunned, filled=True, feature_names=X.columns.tolist())
    plt.show()
if True:
    # Bar chart of the unpruned tree's feature importances, largest first.
    feat_importances = pd.DataFrame(
        {"Importance": tree_music_spotify.feature_importances_},
        index=X_train.columns,
    ).sort_values(by='Importance', ascending=False)
    feat_importances.plot(kind='bar', figsize=(8,6))
    plt.show()

107
Lab2/5.2/part2.py Normal file
View File

@ -0,0 +1,107 @@
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from time import time
# Read the train and test CSV files into DataFrames.
print("Loading data")
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("sign_mnist_train.csv", "sign_mnist_test.csv")
)
def remove_every_nth_column(df, count):
    """Return *df* without the columns whose 1-based position is a multiple of *count*."""
    positions = np.arange(1, df.shape[1] + 1)
    keep_mask = positions % count != 0
    return df.loc[:, keep_mask]
def leave_every_nth_column(df, count):
    """Return only the columns of *df* whose 1-based position is a multiple of *count*."""
    positions = np.arange(1, df.shape[1] + 1)
    keep_mask = positions % count == 0
    return df.loc[:, keep_mask]
# Separate the "label" column (target) from the remaining feature columns.
X_train = train_df.drop(columns=["label"])
X_test = test_df.drop(columns=["label"])
y_train = train_df["label"]
y_test = test_df["label"]

# Disabled column-subsampling experiments, kept with the value recorded next
# to each variant. To repeat one, apply the same helper to both splits.
# # ('leave_every_nth_column_5', 0.6002509760178472)
# X_train = leave_every_nth_column(X_train, 5)
# X_test = leave_every_nth_column(X_test, 5)
# # ('remove_every_nth_column_15', 0.6167038482989403)
# X_train = remove_every_nth_column(X_train, 15)
# X_test = remove_every_nth_column(X_test, 15)
# # ('remove_every_nth_column_14', 0.6179587283881762)
# X_train = remove_every_nth_column(X_train, 14)
# X_test = remove_every_nth_column(X_test, 14)
# # ('remove_every_nth_column_11', 0.6183770217512549)
# X_train = remove_every_nth_column(X_train, 11)
# X_test = remove_every_nth_column(X_test, 11)

# Fit one tree with a fixed cost-complexity penalty and print how long the
# fit took, in seconds.
start = time()
ccp_alpha = 0.0009
print("Training", ccp_alpha)
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha).fit(X_train, y_train)
elapsed = time() - start
print(elapsed)
if False:
    # Accuracy on the test split as a function of the pruning strength.
    print("Viz")
    # Candidate alphas come from the cost-complexity pruning path of the
    # training data. Without this line `ccp_alphas` is undefined in this
    # script and the loop below raises NameError.
    ccp_alphas = decision_tree.cost_complexity_pruning_path(X_train, y_train).ccp_alphas
    ccp_alphas_collect = []
    accuracy_collect = []
    # The loop variables have their own names so that they do not overwrite
    # the module-level `ccp_alpha` and `accuracy`.
    for alpha in ccp_alphas:
        candidate_tree = DecisionTreeClassifier(ccp_alpha=alpha)
        candidate_tree.fit(X_train, y_train)
        ccp_alphas_collect.append(alpha)
        accuracy_collect.append(candidate_tree.score(X_test, y_test))
    plt.plot(np.array(ccp_alphas_collect), np.array(accuracy_collect))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()
# Score the fitted tree on the test split.
# NOTE(review): the heading says "Unprunned" although decision_tree is built
# with a non-zero ccp_alpha -- confirm which wording is intended.
pred = decision_tree.predict(X_test)
accuracy = 100 * accuracy_score(y_test, pred)
print("Unprunned:")
print(f"Accuracy: {accuracy:.2f}%")
#print("10-fold cross-validation score: ", cross_val_score(decision_tree, X_test, y_test, cv=10).mean())
if False:
    # Draw the fitted tree; disabled by default.
    plt.figure(figsize=(12, 6))
    feature_labels = X_test.columns.tolist()
    plot_tree(decision_tree, filled=True, feature_names=feature_labels)
    plt.show()
if False:
    # Pruning: a more strongly penalised tree, evaluated on the test split.
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    pred_prunned = decision_tree_prunned.predict(X_test)
    print("Prunned:")
    # Accuracy is measured against the true labels. Comparing with `pred`
    # (the other tree's predictions) would only measure how often the two
    # models agree.
    print(f"Accuracy: {accuracy_score(y_test, pred_prunned) * 100:.2f}%")
    # NOTE(review): this cross-validates on the test split only -- confirm
    # that this is intended rather than the training data.
    print("10-fold cross-validation score: ", cross_val_score(decision_tree_prunned, X_test, y_test, cv=10).mean())
if True:
    # Dump the tree's per-pixel feature importances as a 28x28 CSV grid.
    feat_importances = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
    importances = decision_tree.feature_importances_
    print(importances)
    print(len(importances))
    print(X_train.columns)

    # Start from an all-zero grid; only pixels present in X_train are filled.
    image = [[0] * 28 for _ in range(28)]
    for column_name, importance in zip(X_train.columns, importances):
        # Column names look like "pixel<N>" with N starting at 1; convert to
        # a zero-based index and then to (row, column) in row-major order.
        pixel_idx = int(column_name.removeprefix("pixel")) - 1
        pixel_y, pixel_x = divmod(pixel_idx, 28)
        image[pixel_y][pixel_x] = importance

    for image_row in image:
        print(",".join(map(str, image_row)))

    # feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
    # feat_importances.plot(kind='bar', figsize=(8,6))
    # plt.show()

151
Lab2/6.1/example.py Normal file
View File

@ -0,0 +1,151 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
# 1. Linear SVC: 20 Gaussian points in 2-D, the first ten labelled -1 and the
# last ten +1, with the +1 class shifted by one unit along both axes.
np.random.seed(1)
x = np.random.normal(size=(20, 2))
y = np.repeat([-1.0, 1.0], 10)
x[y == 1] += 1
# plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
# plt.grid(True)
# plt.show()

# 2. Linear SVC fitting
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
features = data[['x1', 'x2']]
svm_model = svm.SVC(kernel='linear', C=10)
svm_model.fit(features, data['y'])
# 3. Visualizing: decision-function contours at -1, 0 and 1 with the training
# points drawn on top.
if False:
    x_lo, x_hi = -2.1, 2.5
    y_lo, y_hi = -1.3, 2.6
    xx, yy = np.meshgrid(np.linspace(x_lo, x_hi, 100), np.linspace(y_lo, y_hi, 100))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = svm_model.decision_function(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(x_lo, x_hi)
    plt.ylim(y_lo, y_hi)
    plt.grid(True)
    plt.show()

# 4. Indices of Support vectors
print("Indices of Support Vectors:", svm_model.support_)
# 6. Controlling the cost=1/C hyperparameter: refit the linear SVC with a
# different C and compare its support vectors with the first model.
svm_model_low_cost = svm.SVC(kernel='linear', C=100)
svm_model_low_cost.fit(data[['x1', 'x2']], data['y'])
if False:
    # The mesh spans [-5, 5] on both axes; the view is then cropped.
    axis = np.linspace(-5,5, 100)
    xx, yy = np.meshgrid(axis, axis)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = svm_model_low_cost.decision_function(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(-2.1, 2.5)
    plt.ylim(-1.3, 2.6)
    plt.grid(True)
    plt.show()
print("Indices of Support Vectors (Low Cost):", svm_model_low_cost.support_)
print("Model Summary (Low Cost):\n", svm_model_low_cost)
# 7. Which cost value is better? 5-fold grid search over C for a linear SVC.
np.random.seed(1)
tune_params = {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1, 2, 3, 4]}
svm_tune = GridSearchCV(svm.SVC(kernel='linear'), tune_params, cv=5)
svm_tune.fit(data[['x1', 'x2']], data['y'])

# 8. Using the best model
best_model = svm_tune.best_estimator_
print("Best Model:\n", best_model)

# 9. Testing your model on freshly generated points: labels are drawn at
# random and the +1 class is shifted the same way as in the training data.
x_test = np.random.normal(size=(20, 2))
y_test = np.random.choice([-1, 1], size=20, replace=True)
x_test[y_test == 1, :] = x_test[y_test == 1, :] + 1
test_data = pd.DataFrame({'x1': x_test[:, 0], 'x2': x_test[:, 1], 'y': y_test})
y_pred = best_model.predict(test_data[['x1', 'x2']])
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones.
print("Confusion Matrix:\n", confusion_matrix(test_data['y'], y_pred))
# 10. Is linear boundary always the best choice? Build 200 points where the
# -1 class forms two clusters (shifted by +2 and by -2) and rows 150-199,
# left unshifted, form the +1 class.
np.random.seed(1)
x = np.random.normal(size=(200, 2))
x[:100] += 2
x[100:150] -= 2
y = np.repeat([-1.0, 1.0], [150, 50])
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
if False:
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.grid(True)
    plt.show()
# 11. Splitting and fitting: 80/20 split, then an RBF-kernel SVC on the
# training part.
train, test = train_test_split(data, test_size=0.2, random_state=1)
train_features = train[['x1', 'x2']]
svm_fit = svm.SVC(kernel='rbf', gamma=0.05, C=100)
svm_fit.fit(train_features, train['y'])
if False:
    # Decision-function contours over [-5, 5] x [-5, 5] with training points.
    axis = np.linspace(-5,5, 100)
    xx, yy = np.meshgrid(axis, axis)
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = svm_fit.decision_function(grid_points).reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(train['x1'], train['x2'], c=(3 - train['y']), marker='o', edgecolors='k')
    #plt.xlim(-2.1, 2.5)
    #plt.ylim(-1.3, 2.6)
    plt.grid(True)
    plt.show()
print("Model Summary:\n", svm_fit)
# Tuning hyperparameters: 5-fold grid search over C and gamma for the RBF
# kernel (disabled by default because the grid is large).
if False:
    tune_params_rbf = {'C': np.arange(0.1, 3.1, 0.2), 'gamma': np.arange(0.1, 5.1, 0.1)}
    svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
    svm_tune_rbf.fit(train[['x1', 'x2']], train['y'])
    print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
    print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)

    # 12. Prediction. This stays inside the guard because svm_tune_rbf only
    # exists when the search above has run; outside it the script would stop
    # with NameError.
    pred_rbf = svm_tune_rbf.predict(test[['x1', 'x2']])
    # confusion_matrix takes (y_true, y_pred): rows are the true classes and
    # columns the predicted ones.
    print("Confusion Matrix (RBF Kernel):\n", confusion_matrix(test['y'], pred_rbf))
# 13. Multiclass: extend the two-class data with 50 points of class 0 and a
# single point of class 2 at the origin, then fit a linear SVC on all of it.
np.random.seed(1)
x = np.vstack([x, np.random.normal(size=(50, 2)), [[0, 0]]])
y = np.concatenate([y, np.zeros(50), [2]])
# Shift class 0 by two units along the second axis.
x[y == 0, 1] += 2
data_multiclass = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
if False:
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.grid(True)
    plt.show()
svm_fit_multiclass = svm.SVC(kernel='linear')
multiclass_features = data_multiclass[['x1', 'x2']]
svm_fit_multiclass.fit(multiclass_features, data_multiclass['y'])
if True:
    # Plot the regions predicted by the multiclass model. The binary RBF
    # model `svm_fit` was trained before the extra classes were added, so its
    # decision function says nothing about classes 0 and 2.
    xx, yy = np.meshgrid(np.linspace(-5,5, 100), np.linspace(-5,5, 100))
    # Use the training column names so that predict sees the same feature
    # names the model was fitted with.
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=['x1', 'x2'])
    # predict gives one label per grid point. decision_function would give
    # one column per class here and could not be reshaped to the grid.
    Z = svm_fit_multiclass.predict(grid)
    print("xx.shape", xx.shape)
    print(Z.shape)
    Z = Z.reshape(xx.shape)
    print(Z.shape)
    # One filled band per class: boundaries half-way between the labels,
    # which are consecutive values (-1, 0, 1, 2) in this script.
    class_labels = svm_fit_multiclass.classes_
    levels = np.concatenate([class_labels - 0.5, [class_labels[-1] + 0.5]])
    plt.contourf(xx, yy, Z, levels=levels, cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(-5,5)
    plt.ylim(-5,5)
    plt.grid(True)
    plt.show()

133
Lab2/6.1/main.py Normal file
View File

@ -0,0 +1,133 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
from sklearn.decomposition import PCA
# Load both splits; column "V37" is the class label and every other column
# is used as a feature.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("satellite_train.csv", "satellite_test.csv")
)
X_train = train_df.drop(columns=["V37"])
X_test = test_df.drop(columns=["V37"])
y_train = train_df["V37"]
y_test = test_df["V37"]
if False:
    # Bar chart of the training class frequencies, in ascending label order.
    class_counts = y_train.value_counts(sort=False).sort_index()
    asc_index = list(class_counts.index)
    asc_values = list(class_counts)
    plt.bar(asc_index, asc_values, alpha=0.7)
    plt.xticks(asc_index, labels=["red soil", "cotton crop", "grey soil","damp grey soil","soil with vegetation", "very damp grey soil"])
    plt.xlabel("Values")
    plt.ylabel("Frequency")
    plt.show()
# Earlier variants, kept for reference:
#clf = svm.SVC(kernel='rbf', C=0.7, gamma=0.1)
#clf = svm.SVC(kernel='linear')
#clf.fit(X_train[["V1", "V2"]], y_train)
#clf_tuned = GridSearchCV(svm.SVC(kernel='linear'), {'C': np.arange(0.1, 1.6, 0.2)}, cv=5)

# RBF SVC with fixed hyperparameters. The grid they were presumably chosen
# from is kept in the trailing comment.
clf_tuned = svm.SVC(kernel='rbf',C=0.19, gamma=0.00024) #GridSearchCV(svm.SVC(kernel='rbf'), {'C': [0.17, 0.18, 0.19, 0.2], 'gamma': [0.00023, 0.00024, 0.00025, 0.00026, 0.00027]}, cv=5)
clf_tuned.fit(X_train, y_train)
pred = clf_tuned.predict(X_test)
accuracy = accuracy_score(y_test, pred) * 100
#print("Best Model (Linear Kernel):\n", clf_tuned.best_estimator_)
#print("Best Parameters (Linear Kernel):\n", clf_tuned.best_params_)
print(f"Accuracy: {accuracy:.2f}%")
# confusion_matrix takes (y_true, y_pred): rows are the true classes and
# columns the predicted ones.
print("Confusion Matrix:\n", confusion_matrix(y_test, pred))
# NOTE(review): this cross-validates on the test split only -- confirm that
# this is intended rather than the training data.
print("10-fold cross-validation score: ", cross_val_score(clf_tuned, X_test, y_test, cv=10).mean())
if False:
    plt.figure(figsize=(8, 6))
    # Plot the training points
    plt.scatter(X_train["V1"], X_train["V2"], c=y_train, cmap=plt.cm.Paired, marker='.', s=20)

    # clf_tuned is fitted on every feature column, so it cannot score a
    # two-column grid. Fit a copy with the same hyperparameters on V1/V2
    # only and draw that model's boundary instead.
    clf_2d = svm.SVC(**clf_tuned.get_params())
    clf_2d.fit(X_train[["V1", "V2"]], y_train)

    x_min, x_max = X_train["V1"].min(), X_train["V1"].max()
    y_min, y_max = X_train["V2"].min(), X_train["V2"].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=["V1", "V2"])
    Z = clf_2d.predict(grid).reshape(xx.shape)

    # Plot decision boundary and margins
    plt.contour(xx, yy, Z, colors='k', alpha=0.5)
    # Highlight the support vectors
    #plt.scatter(clf_tuned.support_vectors_[:, 0], clf_tuned.support_vectors_[:, 1], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
    #plt.title('SVM Decision Boundary')
    plt.xlabel('Feature V1')
    plt.ylabel('Feature V2')
    plt.show()
if False:
    # Project the training features onto their first two principal
    # components and colour the points by class.
    pca = PCA(n_components=2)
    X_2d = pca.fit_transform(X_train)
    #print(pca.components_)
    component_1, component_2 = X_2d[:, 0], X_2d[:, 1]
    plt.figure(figsize=(8, 6))
    plt.scatter(component_1, component_2, c=y_train, cmap=plt.cm.Paired, marker='.', s=20)
    plt.title('2D Projection of High-Dimensional Data using PCA')
    plt.xlabel('Principal Component 1')
    plt.ylabel('Principal Component 2')
    plt.show()
if False:
    # 10-fold grid search over C for a linear SVC on all features.
    tune_params_linear = {'C': np.arange(0.1, 2.1, 0.2)}
    svm_tune_linear = GridSearchCV(svm.SVC(kernel='linear'), tune_params_linear, cv=10)
    svm_tune_linear.fit(X_train, y_train)
    print("Best Model (Linear Kernel):\n", svm_tune_linear.best_estimator_)
    print("Best Parameters (Linear Kernel):\n", svm_tune_linear.best_params_)

    # support_vectors_ belongs to the fitted SVC, not to the GridSearchCV
    # wrapper, so read it from the refitted best estimator.
    best_linear = svm_tune_linear.best_estimator_
    support_vectors = best_linear.support_vectors_
    # Look up the V1/V2 column positions rather than assuming they come first.
    v1_pos = X_train.columns.get_loc("V1")
    v2_pos = X_train.columns.get_loc("V2")

    plt.figure(figsize=(8, 6))
    # No decision boundary is drawn: the model is fitted on every feature
    # and cannot be evaluated on a two-column V1/V2 grid.
    # Highlight the support vectors
    plt.scatter(support_vectors[:, v1_pos], support_vectors[:, v2_pos], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
    plt.title('SVM Decision Boundary')
    plt.xlabel('Feature V1')
    plt.ylabel('Feature V2')
    plt.legend()
    plt.show()
if False:
    # 5-fold grid search over C and gamma for an RBF SVC on all features.
    tune_params_rbf = {'C': np.arange(0.1, 2.1, 0.2), 'gamma': np.arange(0.1, 4.1, 0.1)}
    svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
    svm_tune_rbf.fit(X_train, y_train)
    print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
    print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)

    best_rbf = svm_tune_rbf.best_estimator_
    # The tuned model is fitted on every feature and cannot score a
    # two-column grid. Fit a copy with the winning hyperparameters on V1/V2
    # only and use that for the boundary.
    rbf_2d = svm.SVC(**best_rbf.get_params())
    rbf_2d.fit(X_train[["V1", "V2"]], y_train)

    plt.figure(figsize=(8, 6))
    x_min, x_max = X_train["V1"].min(), X_train["V1"].max()
    y_min, y_max = X_train["V2"].min(), X_train["V2"].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
    grid = pd.DataFrame(np.c_[xx.ravel(), yy.ravel()], columns=["V1", "V2"])
    # contour needs Z in the same 2-D shape as the mesh.
    Z = rbf_2d.predict(grid).reshape(xx.shape)
    # Plot decision boundary and margins
    plt.contour(xx, yy, Z, colors='k', alpha=0.5)

    # Highlight the support vectors. They belong to the fitted SVC, not to
    # the GridSearchCV wrapper, so read them from the best estimator.
    support_vectors = best_rbf.support_vectors_
    v1_pos = X_train.columns.get_loc("V1")
    v2_pos = X_train.columns.get_loc("V2")
    plt.scatter(support_vectors[:, v1_pos], support_vectors[:, v2_pos], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
    plt.title('SVM Decision Boundary')
    plt.xlabel('Feature V1')
    plt.ylabel('Feature V2')
    plt.legend()
    plt.show()

115
Lab2/7.1/example.py Normal file
View File

@ -0,0 +1,115 @@
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Initialize H2O cluster
h2o.init(nthreads=-1, max_mem_size="2G")

# Load data
data = pd.read_csv("customer_churn.csv")
#print(data.head())

# Tenure values split by churn outcome.
churned_mask = data['Churn'] == 'Yes'
stayed_mask = data['Churn'] == 'No'
tenure_churned = data.loc[churned_mask, 'tenure']
tenure_stayed = data.loc[stayed_mask, 'tenure']
if False:
    # Overlaid density estimates of tenure for the two groups.
    plt.figure()
    for tenure_values, line_color, group_label in (
        (tenure_churned, 'firebrick', 'Churned'),
        (tenure_stayed, 'dodgerblue', 'Stayed'),
    ):
        sns.kdeplot(tenure_values, color=line_color, linewidth=3, label=group_label)
    plt.title('Tenure Variable')
    plt.xlabel('Number of months the customer is with the company')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()
# Split data 80/20 with a fixed seed.
seed = 1
features = data.drop("Churn", axis=1)
target = data["Churn"]
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=seed,
)

# Re-attach the target as the last column so that each H2O frame carries
# both the predictors and the response.
X_train['Churn'] = y_train
X_test['Churn'] = y_test

# Convert the pandas DataFrames to H2O frames.
h2o_train = h2o.H2OFrame(X_train)
h2o_test = h2o.H2OFrame(X_test)
# Training the neural network: two hidden layers of two units each, with a
# fixed learning rate and early stopping switched off.
shallow_params = dict(
    activation="Rectifier",
    hidden=[2,2],
    loss="CrossEntropy",
    score_each_iteration=True,
    epochs=10000,
    balance_classes=False,
    rate=0.01,
    adaptive_rate=False,
    stopping_rounds=0,
    classification_stop=-1,
)
dl_model = H2ODeepLearningEstimator(**shallow_params)
# The first 19 columns of the training frame are the predictors.
predictor_columns = h2o_train.columns[0:19]
dl_model.train(x=predictor_columns, y="Churn", training_frame=h2o_train)
if False:
    # Training log-loss per epoch.
    history = dl_model.scoring_history()
    plt.plot(history.epochs, history.training_logloss)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.grid()
    plt.show()
# Prediction on the test frame, pulled back into pandas for scoring.
prediction = dl_model.predict(h2o_test)
prediction = prediction["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

# Calculate accuracy
accuracy = (prediction['predict']== h2o_test_df["Churn"]).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

conf_matrix = pd.crosstab(prediction["predict"], h2o_test_df["Churn"], rownames=["Predicted"], colnames=["Actual"])
# crosstab only has rows for labels that were actually predicted. If the
# network predicts a single class the table is not square and its diagonal
# no longer lines up with the column totals, so pad it to the full label set.
all_labels = sorted(set(h2o_test_df["Churn"]) | set(prediction["predict"]))
conf_matrix = (
    conf_matrix
    .reindex(index=all_labels, columns=all_labels, fill_value=0)
    .rename_axis(index="Predicted", columns="Actual")
)
# Share of each actual class that was predicted correctly, in percent.
conf_matrix_diag = conf_matrix.values.diagonal() / conf_matrix.sum(axis=0) * 100
print("Confusion Matrix:")
print(conf_matrix)
print("Diagonal Percentages:")
print(conf_matrix_diag)
print("-------------- DEEPER --------------")
# Deeper model: same settings as the first network but with three hidden
# layers of ten units each.
# NOTE(review): the variable is called "balanced" although
# balance_classes=False -- confirm which was intended.
dl_model_balanced = H2ODeepLearningEstimator(
    activation="Rectifier",
    hidden=[10,10,10],
    loss="CrossEntropy",
    score_each_iteration=True,
    epochs=10000,
    balance_classes=False,
    rate=0.01,
    adaptive_rate=False,
    stopping_rounds=0,
    classification_stop=-1
)
dl_model_balanced.train(x=h2o_train.columns[0:19], y="Churn", training_frame=h2o_train)

# Training log-loss per epoch.
deeper_history = dl_model_balanced.scoring_history()
plt.plot(deeper_history.epochs, deeper_history.training_logloss)
plt.xlabel('Epochs')
plt.ylabel('log_loss')
plt.grid()
# Without show() the figure is built but never displayed.
plt.show()

# Prediction
prediction = dl_model_balanced.predict(h2o_test)
prediction = prediction["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

# Calculate accuracy
accuracy = (prediction['predict']== h2o_test_df["Churn"]).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

122
Lab2/7.1/main.py Normal file
View File

@ -0,0 +1,122 @@
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
# Load the train and test splits; the "arrhythmia" column is the class label.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ("arrhythmia_train.csv", "arrhythmia_test.csv")
)
# Feature/label split, currently disabled:
#X_train, X_test = train_df.drop(columns=["arrhythmia"]), test_df.drop(columns=["arrhythmia"])
#y_train, y_test = train_df["arrhythmia"], test_df["arrhythmia"]
# 1. Plot the first `max_count` training signals of one class, laid end to
# end along the x axis.
if False:
    # The module-level feature/label split is commented out, so build what
    # this plot needs here. Otherwise enabling the block raises NameError.
    X_train = train_df.drop(columns=["arrhythmia"])
    y_train = train_df["arrhythmia"]

    target_class = 0
    max_count = 10
    count = 0
    # pyplot.get_cmap replaces the deprecated matplotlib.cm.get_cmap.
    cmap = plt.get_cmap('hsv', max_count)
    idx = 0  # running x position, shared by all plotted signals
    for i in range(len(X_train)):
        if y_train[i] != target_class:
            continue
        x = []
        y = []
        # Samples are in columns X1..X187; a zero is treated as the start of
        # the padding and ends the signal.
        for j in range(187):
            sample = X_train[f"X{j+1}"][i]
            if sample == 0:
                break
            x.append(idx)
            y.append(sample)
            idx += 1
        plt.plot(x, y, c=cmap(count))
        count += 1
        if count == max_count:
            break
    plt.show()
# 2. Scatter plot of two features of the training data, one colour per class.
if False:
    # The module-level feature/label split is commented out, so build what
    # this plot needs here. Otherwise enabling the block raises NameError.
    X_train = train_df.drop(columns=["arrhythmia"])
    y_train = train_df["arrhythmia"]

    x_feature = "X20"
    y_feature = "X50"
    # Take the classes from the training labels, since training points are
    # plotted; a class missing from the test set would otherwise be skipped.
    for point_class in y_train.unique():
        class_rows = X_train[y_train == point_class]
        plt.scatter(class_rows[x_feature], class_rows[y_feature], s=2, label=str(point_class))
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.show()
# 3. Class-frequency bar charts for the training and the test labels.
if False:
    # The module-level split is commented out, so take the labels straight
    # from the frame. Otherwise enabling the block raises NameError.
    y_train = train_df["arrhythmia"]
    value_counts = y_train.value_counts(sort=False)
    asc_index = sorted(value_counts.index)
    asc_values = [value_counts[idx] for idx in asc_index]
    plt.bar(asc_index, asc_values, alpha=0.7)
    plt.xticks(asc_index, labels=["normal (0)", "supraventricular (1)", "ventricular (2)","unknown (4)"])
    plt.ylabel("Frequencies")
    plt.title("Training dataset class distribution")
    plt.show()
if False:
    # Same chart for the test labels.
    y_test = test_df["arrhythmia"]
    value_counts = y_test.value_counts(sort=False)
    asc_index = sorted(value_counts.index)
    asc_values = [value_counts[idx] for idx in asc_index]
    plt.bar(asc_index, asc_values, alpha=0.7)
    plt.xticks(asc_index, labels=["normal (0)", "supraventricular (1)", "ventricular (2)","unknown (4)"])
    plt.ylabel("Frequencies")
    plt.title("Test dataset class distribution")
    plt.show()
# 4. Train an H2O deep-learning model and score it on the test frame.
if True:
    # Initialize H2O cluster
    h2o.init(nthreads=-1, max_mem_size="6G")

    # Convert both pandas DataFrames to H2O frames.
    h2o_train = h2o.H2OFrame(train_df)
    h2o_test = h2o.H2OFrame(test_df)

    # Training the neural network; every column but the last is a predictor.
    dl_model = H2ODeepLearningEstimator(
        activation="tanh",
        hidden=[50, 20],
        input_dropout_ratio=0.2,
        l1=1e-5,
        epochs=20
    )
    predictor_columns = h2o_train.columns[:-1]
    dl_model.train(x=predictor_columns, y="arrhythmia", training_frame=h2o_train, validation_frame=h2o_test)

    # Prediction
    prediction = dl_model.predict(h2o_test)["predict"].as_data_frame()
    h2o_test_df = h2o_test.as_data_frame()

    # Confusion counts: one row per expected class, one column per predicted
    # class, plus a final column for predictions that round to no known class.
    # NOTE(review): predictions are rounded before being compared, which
    # suggests the model outputs numbers rather than class labels -- confirm
    # whether the target should be converted to a categorical column first.
    classess = [0,1,2,4]
    confusion_matrix = [[0] * 5 for _ in classess]
    for expected, raw_prediction in zip(h2o_test_df["arrhythmia"], prediction['predict']):
        actual = round(raw_prediction)
        column = classess.index(actual) if actual in classess else -1
        confusion_matrix[classess.index(expected)][column] += 1

    # A prediction counts as correct when it is within 0.5 of the true label.
    absolute_error = (prediction['predict'] - h2o_test_df["arrhythmia"]).abs()
    accuracy = absolute_error.lt(0.5).mean() * 100
    print(f"Accuracy: {accuracy:.2f}%")
    print("Confusion Matrix:")
    print("Expected | Predicted")
    print(" | 0 1 2 4 -1")
    for class_label, row in zip(classess, confusion_matrix):
        print(" " + str(class_label) + " | "+ " ".join(f"{cell:5}" for cell in row))

Binary file not shown.