machine-learing-methods/Lab2/7.1/main.py
2023-12-06 00:00:07 +02:00

123 lines
3.6 KiB
Python

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
# Load the pre-split arrhythmia data set from the working directory.
train_df = pd.read_csv("arrhythmia_train.csv")
test_df = pd.read_csv("arrhythmia_test.csv")
# Feature/label splits: the exploratory plots (sections 1-3) read X_train,
# y_train and y_test, so these names must exist before any plot toggle is
# switched on. The label column is "arrhythmia"; every other column is a feature.
X_train, X_test = train_df.drop(columns=["arrhythmia"]), test_df.drop(columns=["arrhythmia"])
y_train, y_test = train_df["arrhythmia"], test_df["arrhythmia"]
# 1. Plot the first `max_count` training signals of one class, laid end to end
#    along the x axis, each in its own colour.
if False:
    target_class = 0
    max_count = 10
    count = 0
    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which was deprecated in
    # Matplotlib 3.7 and removed in 3.9.
    cmap = plt.get_cmap('hsv', max_count)
    # Running x position shared by all signals so consecutive signals do not
    # overlap on the plot.
    idx = 0
    for i in range(len(X_train["X1"])):
        if y_train[i] != target_class:
            continue
        y = []
        x = []
        # Walk over the feature columns X1..X187 of row i.
        for j in range(187):
            value = X_train[f"X{j+1}"][i]
            # The signal is cut at its first zero sample
            # (presumably zero marks trailing padding -- TODO confirm).
            if value == 0:
                break
            x.append(idx)
            y.append(value)
            idx += 1
        plt.plot(x, y, c=cmap(count))
        count += 1
        if count == max_count:
            break
    plt.show()
# 2. Scatter plot of two training-set features, one colour per class.
if False:
    x_feature = "X20"
    y_feature = "X50"
    # Iterate over the classes of the data that is actually plotted (the
    # training labels); taking them from the test labels would skip any class
    # that is absent from the test set.
    for point_class in y_train.unique():
        class_rows = X_train[y_train == point_class]
        plt.scatter(class_rows[x_feature], class_rows[y_feature], s=2, label=str(point_class))
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.show()
# 3. Bar charts of the class distribution in the training and test sets.
def sorted_class_counts(class_labels):
    """Return the distinct class values in ascending order with their counts.

    Args:
        class_labels: pandas Series of class labels.

    Returns:
        A ``(classes, counts)`` pair of equally long lists, where
        ``counts[k]`` is the number of occurrences of ``classes[k]``.
    """
    counts = class_labels.value_counts(sort=False).sort_index()
    return counts.index.tolist(), counts.tolist()


def plot_class_distribution(class_labels, title):
    """Show a bar chart of how often each class occurs in ``class_labels``.

    Args:
        class_labels: pandas Series of class labels.
        title: text used as the chart title.
    """
    classes, counts = sorted_class_counts(class_labels)
    plt.bar(classes, counts, alpha=0.7)
    # The tick captions are fixed, so exactly four distinct classes are expected.
    plt.xticks(classes, labels=["normal (0)", "supraventricular (1)", "ventricular (2)","unknown (4)"])
    plt.ylabel("Frequencies")
    plt.title(title)
    plt.show()


if False:
    plot_class_distribution(y_train, "Training dataset class distribution")
if False:
    plot_class_distribution(y_test, "Test dataset class distribution")
# 4. Train an H2O deep-learning model and evaluate it on the test set.
def build_confusion_matrix(expected_labels, predicted_values, classes):
    """Tally rounded numeric predictions against the expected class labels.

    Args:
        expected_labels: iterable of true class labels; each must be in
            ``classes``.
        predicted_values: iterable of numeric predictions, aligned with
            ``expected_labels``.
        classes: list of valid class labels; fixes the row and column order.

    Returns:
        A list with one row per entry of ``classes``. Each row holds
        ``len(classes) + 1`` counters: column ``c`` counts predictions that
        round to ``classes[c]``; the last column counts predictions that
        round to a value outside ``classes``.

    Raises:
        ValueError: if an expected label is not one of ``classes``.
    """
    position = {label: k for k, label in enumerate(classes)}
    other_column = len(classes)
    matrix = [[0] * (other_column + 1) for _ in classes]
    for expected, predicted in zip(expected_labels, predicted_values):
        if expected not in position:
            raise ValueError(f"unexpected class label {expected!r}")
        column = position.get(round(predicted), other_column)
        matrix[position[expected]][column] += 1
    return matrix


if True:
    # Initialize H2O cluster
    h2o.init(nthreads=-1, max_mem_size="6G")
    # Convert the pandas DataFrames into H2O frames
    h2o_train = h2o.H2OFrame(train_df)
    h2o_test = h2o.H2OFrame(test_df)
    # Select the predictors by name instead of by position, so the target is
    # excluded wherever it sits in the frame.
    predictors = [column for column in h2o_train.columns if column != "arrhythmia"]
    # Training the neural network
    # NOTE(review): the target column is left numeric, so the predictions are
    # continuous values that get rounded below -- confirm whether a
    # classification model (target converted with asfactor()) was intended.
    dl_model = H2ODeepLearningEstimator(
        activation="tanh",
        hidden=[50, 20],
        input_dropout_ratio=0.2,
        l1=1e-5,
        epochs=20
    )
    dl_model.train(x=predictors, y="arrhythmia", training_frame=h2o_train, validation_frame=h2o_test)
    # Prediction
    prediction = dl_model.predict(h2o_test)
    prediction = prediction["predict"].as_data_frame()
    h2o_test_df = h2o_test.as_data_frame()
    # Confusion matrix: rows are expected classes, columns are rounded
    # predictions, plus a final column for predictions outside the class list.
    classess = [0,1,2,4]
    confusion_matrix = build_confusion_matrix(h2o_test_df["arrhythmia"], prediction['predict'], classess)
    # Accuracy is read off the matrix diagonal so that it always agrees with
    # the printed matrix (same rounding rule for both).
    correct = sum(confusion_matrix[k][k] for k in range(len(classess)))
    total = sum(sum(row) for row in confusion_matrix)
    accuracy = correct / total * 100 if total else 0.0
    print(f"Accuracy: {accuracy:.2f}%")
    print("Confusion Matrix:")
    print("Expected | Predicted")
    print(" | 0 1 2 4 -1")
    for i, row in enumerate(confusion_matrix):
        print(" " + str(classess[i]) + " | "+ " ".join(f"{cell:5}" for cell in row))