# machine-learing-methods/Lab2/7.1/example.py

import h2o
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

# Connect to a local H2O cluster (starting one if none is running).
# nthreads=-1 lets H2O use every available core; the heap is capped at 2 GB.
h2o.init(
    nthreads=-1,
    max_mem_size="2G",
)

# Read the churn dataset from the current working directory.
data = pd.read_csv("customer_churn.csv")

# Tenure values split by churn outcome; consumed by the optional density
# plot below.
tenure_churned = data.loc[data['Churn'] == 'Yes', 'tenure']
tenure_stayed = data.loc[data['Churn'] == 'No', 'tenure']

# Flip to True to compare the tenure distributions of churned vs. retained
# customers.
SHOW_TENURE_PLOT = False

if SHOW_TENURE_PLOT:
    plt.figure()
    for tenure_values, line_color, legend_label in (
        (tenure_churned, 'firebrick', 'Churned'),
        (tenure_stayed, 'dodgerblue', 'Stayed'),
    ):
        sns.kdeplot(tenure_values, color=line_color, linewidth=3, label=legend_label)

    plt.title('Tenure Variable')
    plt.xlabel('Number of months the customer is with the company')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

# Split data: hold out 20% of the rows for testing. The fixed seed keeps the
# partition repeatable between runs.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    data.drop("Churn", axis=1),  # Features
    data["Churn"],  # Target variable
    test_size=0.2,
    random_state=seed
)

# Re-attach the target as the last column so each H2O frame carries both the
# predictors and the response. assign() builds a new DataFrame instead of
# writing into the objects returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning; the values are aligned on the row index either way.
X_train = X_train.assign(Churn=y_train)
X_test = X_test.assign(Churn=y_test)

# Upload the pandas training frame (features + Churn) to the H2O cluster.
h2o_train = h2o.H2OFrame(X_train)

# Upload the pandas test frame (features + Churn) to the H2O cluster.
h2o_test = h2o.H2OFrame(X_test)

# Shallow baseline network: two hidden layers of two ReLU units each, trained
# with a fixed learning rate for the full epoch budget.
shallow_net_params = dict(
    activation="Rectifier",
    hidden=[2,2],
    loss="CrossEntropy",
    score_each_iteration=True,  # record metrics every iteration for the loss curve
    epochs=10000,
    balance_classes=False,
    rate=0.01,                  # fixed learning rate ...
    adaptive_rate=False,        # ... used because the adaptive rate is switched off
    stopping_rounds=0,          # no metric-based early stopping
    classification_stop=-1,     # no stopping on training classification error
)
dl_model = H2ODeepLearningEstimator(**shallow_net_params)

# NOTE(review): the predictors are the first 19 columns of the training frame;
# this presumably matches the CSV's feature count - confirm against the data.
predictor_columns = h2o_train.columns[0:19]
dl_model.train(x=predictor_columns, y="Churn", training_frame=h2o_train)

# Flip to True to plot the shallow model's training log-loss against epochs.
SHOW_SHALLOW_LOSS_CURVE = False

if SHOW_SHALLOW_LOSS_CURVE:
    shallow_history = dl_model.scoring_history()
    plt.plot(shallow_history.epochs, shallow_history.training_logloss)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.grid()
    plt.show()

# Prediction: score the held-out frame and pull the predicted labels and the
# test frame back into pandas for evaluation.
prediction = dl_model.predict(h2o_test)
prediction = prediction["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

# Calculate accuracy: share of test rows whose predicted label equals the
# actual Churn value, as a percentage.
accuracy = (prediction['predict']== h2o_test_df["Churn"]).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

# Build the confusion matrix over the full label set. A very small network can
# predict a single class only; without the reindex the crosstab would then be
# non-square and its "diagonal" would no longer line up with the correctly
# classified counts.
class_labels = sorted(set(h2o_test_df["Churn"]) | set(prediction["predict"]))
conf_matrix = pd.crosstab(
    prediction["predict"],
    h2o_test_df["Churn"],
    rownames=["Predicted"],
    colnames=["Actual"],
).reindex(index=class_labels, columns=class_labels, fill_value=0)

# Per-class percentage of actual samples that were predicted correctly
# (diagonal count divided by the column total for that actual class).
conf_matrix_diag = conf_matrix.values.diagonal() / conf_matrix.sum(axis=0) * 100
print("Confusion Matrix:")
print(conf_matrix)
print("Diagonal Percentages:")
print(conf_matrix_diag)

print("-------------- DEEPER --------------")

# Deeper model: three hidden layers of ten ReLU units; every other setting
# matches the shallow baseline. Despite the variable name, class balancing is
# NOT enabled here (balance_classes=False).
deep_net_params = dict(
    activation="Rectifier",
    hidden=[10,10,10],
    loss="CrossEntropy",
    score_each_iteration=True,  # record metrics every iteration for the loss curve
    epochs=10000,
    balance_classes=False,
    rate=0.01,                  # fixed learning rate ...
    adaptive_rate=False,        # ... used because the adaptive rate is switched off
    stopping_rounds=0,          # no metric-based early stopping
    classification_stop=-1,     # no stopping on training classification error
)
dl_model_balanced = H2ODeepLearningEstimator(**deep_net_params)

# NOTE(review): same hard-coded 19-column predictor slice as the shallow
# model - confirm it matches the CSV's feature count.
deep_predictor_columns = h2o_train.columns[0:19]
dl_model_balanced.train(x=deep_predictor_columns, y="Churn", training_frame=h2o_train)

# Plot the deeper model's training log-loss against epochs.
deep_history = dl_model_balanced.scoring_history()
plt.plot(deep_history.epochs, deep_history.training_logloss)
plt.xlabel('Epochs')
plt.ylabel('log_loss')
plt.grid()
# Without show() the figure is built but never displayed when run as a script;
# this mirrors the other plotting sections of this file.
plt.show()

# Score the held-out frame with the deeper network and bring the predicted
# labels and the test frame back into pandas.
deep_scored = dl_model_balanced.predict(h2o_test)
prediction = deep_scored["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

# Accuracy: percentage of test rows whose predicted label equals the actual
# Churn value.
label_matches = prediction['predict'] == h2o_test_df["Churn"]
accuracy = label_matches.mean() * 100
print(f"Accuracy: {accuracy:.2f}%")