import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch

# Name of the class-label column in both CSV files.
LABEL_COLUMN = "arrhythmia"
# Signal samples are stored in columns X1 .. X187.
N_SIGNAL_SAMPLES = 187
# Tick labels for the class-distribution plots, keyed by class value.
CLASS_NAMES = {
    0: "normal (0)",
    1: "supraventricular (1)",
    2: "ventricular (2)",
    4: "unknown (4)",
}

# Toggles for the exploratory plots below (all disabled by default).
SHOW_BEAT_TRACES = False          # 1.
SHOW_FEATURE_SCATTER = False      # 2.
SHOW_TRAIN_DISTRIBUTION = False   # 3.
SHOW_TEST_DISTRIBUTION = False    # 3.

train_df = pd.read_csv("arrhythmia_train.csv")
test_df = pd.read_csv("arrhythmia_test.csv")


def plot_beat_traces(features, labels, target_class=0, max_count=10):
    """Plot the first ``max_count`` rows of ``target_class``, one colour each.

    Each row is cut at its first zero sample (rows appear to be zero-padded
    -- TODO confirm) and the rows are drawn one after another along a shared
    x axis, so the x position keeps growing from row to row.

    Args:
        features: DataFrame holding the columns X1 .. X187.
        labels: Series of class labels sharing ``features``' index.
        target_class: Class whose rows are plotted.
        max_count: Maximum number of rows to plot.
    """
    # plt.cm.get_cmap was removed in Matplotlib 3.9; plt.get_cmap is the
    # supported way to get a colormap resampled to ``max_count`` entries.
    cmap = plt.get_cmap("hsv", max_count)
    signal_columns = [f"X{j + 1}" for j in range(N_SIGNAL_SAMPLES)]
    beats = features.loc[labels == target_class, signal_columns].head(max_count)

    offset = 0
    for count, samples in enumerate(beats.to_numpy()):
        zero_positions = np.flatnonzero(samples == 0)
        length = int(zero_positions[0]) if zero_positions.size else samples.size
        plt.plot(
            np.arange(offset, offset + length),
            samples[:length],
            c=cmap(count),
        )
        offset += length
    plt.show()


def plot_feature_scatter(features, labels, x_feature="X20", y_feature="X50"):
    """Scatter two feature columns against each other, one series per class.

    Args:
        features: DataFrame containing ``x_feature`` and ``y_feature``.
        labels: Series of class labels sharing ``features``' index.
        x_feature: Column drawn on the x axis.
        y_feature: Column drawn on the y axis.
    """
    # Iterate over the classes of the data actually being plotted.
    for point_class in labels.unique():
        members = features[labels == point_class]
        plt.scatter(
            members[x_feature],
            members[y_feature],
            s=2,
            label=str(point_class),
        )
    plt.xlabel(x_feature)
    plt.ylabel(y_feature)
    plt.legend()
    plt.show()


def plot_class_distribution(labels, title):
    """Draw a bar chart of how often each class occurs in ``labels``.

    Args:
        labels: Series of class labels.
        title: Title of the figure.
    """
    counts = labels.value_counts().sort_index()
    classes = list(counts.index)
    plt.bar(classes, counts.to_numpy(), alpha=0.7)
    # Look tick labels up by class value so a missing or extra class cannot
    # shift the names onto the wrong bars.
    plt.xticks(classes, labels=[CLASS_NAMES.get(c, str(c)) for c in classes])
    plt.ylabel("Frequencies")
    plt.title(title)
    plt.show()


# 1.
if SHOW_BEAT_TRACES:
    plot_beat_traces(
        train_df.drop(columns=[LABEL_COLUMN]),
        train_df[LABEL_COLUMN],
        target_class=0,
        max_count=10,
    )

# 2.
if SHOW_FEATURE_SCATTER:
    plot_feature_scatter(
        train_df.drop(columns=[LABEL_COLUMN]),
        train_df[LABEL_COLUMN],
        x_feature="X20",
        y_feature="X50",
    )

# 3.
if SHOW_TRAIN_DISTRIBUTION:
    plot_class_distribution(
        train_df[LABEL_COLUMN], "Training dataset class distribution"
    )

if SHOW_TEST_DISTRIBUTION:
    plot_class_distribution(
        test_df[LABEL_COLUMN], "Test dataset class distribution"
    )

# 4.
def build_confusion_matrix(expected, predicted, classes):
    """Count (expected, predicted) label pairs.

    Args:
        expected: Iterable of true labels; each must be in ``classes``.
        predicted: Iterable of predicted labels, aligned with ``expected``.
        classes: Sequence of known labels; fixes the row and column order.

    Returns:
        A list with one row per entry of ``classes``. Each row has one column
        per class plus a final overflow column that counts predictions
        matching none of the known classes.

    Raises:
        ValueError: If an expected label is not in ``classes``.
    """
    position = {label: idx for idx, label in enumerate(classes)}
    overflow = len(classes)
    matrix = [[0] * (overflow + 1) for _ in classes]
    for want, got in zip(expected, predicted):
        if want not in position:
            raise ValueError(
                f"unexpected class label {want!r}; known classes: {list(classes)}"
            )
        matrix[position[want]][position.get(got, overflow)] += 1
    return matrix


def run_deep_learning(train_df, test_df, target="arrhythmia"):
    """Train an H2O deep-learning model and print accuracy and confusion matrix.

    Args:
        train_df: pandas DataFrame with the training rows, including ``target``.
        test_df: pandas DataFrame with the test rows, including ``target``.
        target: Name of the response column.

    Returns:
        Tuple ``(accuracy, confusion_matrix)``: accuracy in percent and the
        matrix produced by ``build_confusion_matrix``.
    """
    classes = [0, 1, 2, 4]

    # Initialize H2O cluster
    h2o.init(nthreads=-1, max_mem_size="6G")
    h2o_train = h2o.H2OFrame(train_df)
    h2o_test = h2o.H2OFrame(test_df)

    # NOTE(review): the response is left numeric, so H2O presumably fits a
    # regression model (which is why predictions are rounded below). The
    # labels look categorical -- consider `asfactor()` on the response to
    # train a classifier instead; confirm before changing.
    dl_model = H2ODeepLearningEstimator(
        activation="tanh",
        hidden=[50, 20],
        input_dropout_ratio=0.2,
        l1=1e-5,
        epochs=20,
    )
    # Select predictors by name so the target does not have to be the last
    # column of the frame.
    predictors = [column for column in h2o_train.columns if column != target]
    dl_model.train(
        x=predictors,
        y=target,
        training_frame=h2o_train,
        validation_frame=h2o_test,
    )

    # Prediction; only the two columns needed for scoring are pulled back
    # from the cluster.
    predicted = dl_model.predict(h2o_test)["predict"].as_data_frame()["predict"].to_numpy()
    expected = h2o_test[target].as_data_frame()[target].to_numpy()

    # A prediction counts as correct when it lies within 0.5 of the label.
    accuracy = (np.abs(predicted - expected) < 0.5).mean() * 100
    confusion_matrix = build_confusion_matrix(
        expected.tolist(),
        [round(value) for value in predicted.tolist()],
        classes,
    )

    print(f"Accuracy: {accuracy:.2f}%")
    print("Confusion Matrix:")
    print("Expected | Predicted")
    # The trailing -1 column holds predictions that round to no known class.
    print(" | 0 1 2 4 -1")
    for label, row in zip(classes, confusion_matrix):
        print(" " + str(label) + " | " + " ".join(f"{cell:5}" for cell in row))

    return accuracy, confusion_matrix


if __name__ == "__main__":
    run_deep_learning(train_df, test_df)