123 lines
3.6 KiB
Python
123 lines
3.6 KiB
Python
import numpy as np
|
|
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
|
|
import h2o
|
|
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
|
|
from h2o.grid.grid_search import H2OGridSearch
|
|
|
|
# Load the training and test splits; both paths are relative to the
# current working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("arrhythmia_train.csv", "arrhythmia_test.csv")
)

# Feature/label split, currently disabled:
#X_train, X_test = train_df.drop(columns=["arrhythmia"]), test_df.drop(columns=["arrhythmia"])
#y_train, y_test = train_df["arrhythmia"], test_df["arrhythmia"]
|
|
|
|
# 1. Plot up to `max_count` signals of one class, each in its own colour.
if False:
    target_class = 0
    max_count = 10
    n_samples = 187  # the signal samples are read from columns X1..X187

    # X_train / y_train are not defined at module level (their definitions
    # are commented out), so read what is needed straight from train_df.
    y_train = train_df["arrhythmia"]
    signal_columns = [f"X{j + 1}" for j in range(n_samples)]

    # plt.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap takes
    # the same (name, lut) arguments.
    cmap = plt.get_cmap("hsv", max_count)

    # Running x position. It is not reset between signals, so every signal
    # is drawn starting where the previous one ended.
    idx = 0

    selected_rows = y_train.index[y_train == target_class][:max_count]
    for count, row_label in enumerate(selected_rows):
        signal = train_df.loc[row_label, signal_columns].to_numpy()

        # The signal is cut at its first zero sample.
        # NOTE(review): presumably zeros are trailing padding -- confirm.
        zero_positions = np.flatnonzero(signal == 0)
        if zero_positions.size:
            signal = signal[:zero_positions[0]]

        x = np.arange(idx, idx + len(signal))
        plt.plot(x, signal, c=cmap(count))
        idx += len(signal)

    plt.show()
|
|
|
|
# 2. Scatter two training-set features against each other, one series
#    (and legend entry) per class.
if False:
    x_feature = "X20"
    y_feature = "X50"

    # X_train / y_train are not defined at module level (their definitions
    # are commented out), so read what is needed straight from train_df.
    y_train = train_df["arrhythmia"]

    # The plotted points are training rows, so the classes to iterate over
    # must come from the training labels as well; taking them from another
    # split could skip a class or produce an empty series.
    for point_class in y_train.unique():
        class_rows = train_df.loc[y_train == point_class]
        plt.scatter(
            class_rows[x_feature],
            class_rows[y_feature],
            s=2,
            label=str(point_class),
        )

    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.show()
|
|
|
|
# 3. Bar charts of the class distribution in the training and test sets.

# Tick label for each known class value; any other value falls back to its
# plain string form.
CLASS_TICK_LABELS = {
    0: "normal (0)",
    1: "supraventricular (1)",
    2: "ventricular (2)",
    4: "unknown (4)",
}


def class_distribution(labels):
    """Summarise how often each class occurs in *labels*.

    Args:
        labels: pandas Series of class labels.

    Returns:
        A ``(classes, frequencies, tick_labels)`` tuple of equally long
        lists, ordered by ascending class value. ``tick_labels`` is looked
        up per class, so it always matches the classes actually present.
    """
    counts = labels.value_counts(sort=False).sort_index()
    classes = counts.index.tolist()
    tick_labels = [CLASS_TICK_LABELS.get(cls, str(cls)) for cls in classes]
    return classes, counts.tolist(), tick_labels


def plot_class_distribution(labels, title):
    """Show a bar chart of the class frequencies in *labels* under *title*."""
    classes, frequencies, tick_labels = class_distribution(labels)
    plt.bar(classes, frequencies, alpha=0.7)
    plt.xticks(classes, labels=tick_labels)
    plt.ylabel("Frequencies")
    plt.title(title)
    plt.show()


# The labels are read straight from train_df / test_df because y_train and
# y_test are not defined at module level (their definitions are commented out).
if False:
    plot_class_distribution(
        train_df["arrhythmia"], "Training dataset class distribution"
    )

if False:
    plot_class_distribution(
        test_df["arrhythmia"], "Test dataset class distribution"
    )
|
|
|
|
# 4. Train an H2O deep-learning model on the training set, then report the
#    accuracy and a confusion matrix for the test set.
def build_confusion_matrix(expected_labels, predicted_values, classes):
    """Count predictions per (expected class, predicted class) pair.

    Args:
        expected_labels: iterable of true class labels.
        predicted_values: iterable of numeric predictions, aligned with
            ``expected_labels``; each one is rounded to the nearest integer
            before being matched against ``classes``.
        classes: sequence of the known class values.

    Returns:
        A list with one row per entry of ``classes``. Each row has
        ``len(classes) + 1`` counters: one per class, plus a final column
        for predictions whose rounded value is not a known class.

    Raises:
        ValueError: if an expected label is not in ``classes``.
    """
    position = {cls: k for k, cls in enumerate(classes)}
    other_column = len(classes)
    matrix = [[0] * (len(classes) + 1) for _ in classes]

    for expected, predicted in zip(expected_labels, predicted_values):
        if expected not in position:
            raise ValueError(
                f"unexpected label {expected!r}; known classes are {list(classes)}"
            )
        column = position.get(round(predicted), other_column)
        matrix[position[expected]][column] += 1

    return matrix


if True:
    # Initialize H2O cluster
    h2o.init(nthreads=-1, max_mem_size="6G")

    # Hand the pandas frames over to H2O.
    h2o_train = h2o.H2OFrame(train_df)
    h2o_test = h2o.H2OFrame(test_df)

    # Training the neural network
    dl_model = H2ODeepLearningEstimator(
        activation="tanh",
        hidden=[50, 20],
        input_dropout_ratio=0.2,
        l1=1e-5,
        epochs=20,
    )

    # Select the predictors by name, so the target is excluded wherever it
    # sits in the column order.
    feature_columns = [
        column for column in h2o_train.columns if column != "arrhythmia"
    ]
    # NOTE(review): the target is not converted with asfactor(), and the
    # predictions are rounded below, so this looks like a regression fit on
    # the class number -- confirm that is intended.
    # NOTE(review): the test frame is also used as validation frame --
    # confirm that no model selection depends on it.
    dl_model.train(
        x=feature_columns,
        y="arrhythmia",
        training_frame=h2o_train,
        validation_frame=h2o_test,
    )

    # Prediction
    prediction = dl_model.predict(h2o_test)
    prediction = prediction["predict"].as_data_frame()
    h2o_test_df = h2o_test.as_data_frame()

    # Confusion matrix: rows are expected classes; the extra last column
    # collects predictions that round to none of the known classes.
    classes = [0, 1, 2, 4]
    confusion_matrix = build_confusion_matrix(
        h2o_test_df["arrhythmia"], prediction["predict"], classes
    )

    # A prediction counts as correct when it lies within 0.5 of the label.
    accuracy = (abs(prediction['predict'] - h2o_test_df["arrhythmia"]) < 0.5).mean() * 100
    print(f"Accuracy: {accuracy:.2f}%")

    print("Confusion Matrix:")
    print("Expected | Predicted")
    print(" | 0 1 2 4 -1")
    for cls, row in zip(classes, confusion_matrix):
        print(" " + str(cls) + " | " + " ".join(f"{cell:5}" for cell in row))
|