# Python script (115 lines, 3.2 KiB)
import h2o
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn.model_selection import train_test_split

# Initialize H2O cluster
# Connect to (or start) a local H2O instance. nthreads=-1 asks H2O to use
# all available cores; max_mem_size caps the cluster memory at 2 GB.
h2o.init(max_mem_size="2G", nthreads=-1)

# Load data
# The rest of the script reads the "Churn" label column (values "Yes"/"No")
# and the "tenure" column from this table.
csv_path = "customer_churn.csv"
data = pd.read_csv(csv_path)

# Tenure values split by churn outcome. They are only consumed by the
# optional exploratory plot below.
tenure_churned = data.loc[data['Churn'] == 'Yes', 'tenure']
tenure_stayed = data.loc[data['Churn'] == 'No', 'tenure']

# Exploratory KDE plot of tenure for churned vs. retained customers.
# Disabled by default; set the flag to True to display it (plt.show() blocks
# until the window is closed).
SHOW_TENURE_PLOT = False
if SHOW_TENURE_PLOT:
    plt.figure()
    sns.kdeplot(tenure_churned, color='firebrick', linewidth=3, label='Churned')
    sns.kdeplot(tenure_stayed, color='dodgerblue', linewidth=3, label='Stayed')

    plt.title('Tenure Variable')
    plt.xlabel('Number of months the customer is with the company')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.show()

# Split data
# Hold out 20% of the rows for testing; `seed` fixes the shuffle so the split
# is repeatable.
seed = 1
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns="Churn"),  # Features
    data["Churn"],  # Target variable
    test_size=0.2,
    random_state=seed,
)

# Re-attach the label as the last column of each split, because the H2O
# frames built from X_train / X_test must contain the response column.
# assign() returns new frames (aligned on the row index) instead of writing
# into the objects returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning depending on the pandas / scikit-learn versions.
X_train = X_train.assign(Churn=y_train)
X_test = X_test.assign(Churn=y_test)

# Upload both pandas splits (features plus the "Churn" column) to the H2O
# cluster as H2OFrames.
# NOTE(review): training below uses a CrossEntropy loss, so "Churn" has to be
# a categorical column on the H2O side; this relies on H2O's type inference
# for the "Yes"/"No" strings -- confirm, or convert explicitly with asfactor().
h2o_train = h2o.H2OFrame(python_obj=X_train)
h2o_test = h2o.H2OFrame(python_obj=X_test)

# Training the neural network
# Baseline network: two hidden layers of two Rectifier units each, trained
# with a fixed learning rate for a fixed number of epochs.
baseline_params = dict(
    hidden=[2, 2],
    activation="Rectifier",
    loss="CrossEntropy",
    epochs=10000,
    rate=0.01,
    adaptive_rate=False,  # use the manual `rate` above instead of the adaptive scheme
    balance_classes=False,
    stopping_rounds=0,  # 0 turns metric-based early stopping off
    classification_stop=-1,  # -1 turns the training-error stopping criterion off
    score_each_iteration=True,  # record a scoring-history entry every iteration
)
dl_model = H2ODeepLearningEstimator(**baseline_params)

# NOTE(review): predictors are picked by position (first 19 columns of the
# training frame). Verify against the CSV schema that this covers exactly
# the intended feature columns (e.g. no ID column, no feature left out).
# NOTE(review): `seed` is not passed to the estimator, so only the data split
# is seeded -- confirm whether reproducible training is wanted.
predictors = h2o_train.columns[0:19]
dl_model.train(x=predictors, y="Churn", training_frame=h2o_train)

# Optional diagnostic: training log-loss per epoch for the baseline model.
# Disabled by default; set the flag to True to display it (plt.show() blocks
# until the window is closed).
SHOW_BASELINE_LOSS_PLOT = False
if SHOW_BASELINE_LOSS_PLOT:
    # Fetch the scoring history once and reuse it for both axes.
    baseline_history = dl_model.scoring_history()
    plt.plot(baseline_history.epochs, baseline_history.training_logloss)
    plt.xlabel('Epochs')
    plt.ylabel('log_loss')
    plt.grid()
    plt.show()

# Prediction
# Score the hold-out frame with the baseline model, then bring the predicted
# labels and the test frame back into pandas for evaluation.
prediction = dl_model.predict(h2o_test)
prediction = prediction["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

predicted_labels = prediction["predict"]
actual_labels = h2o_test_df["Churn"]

# Calculate accuracy
# Percentage of test rows whose predicted label equals the actual label.
accuracy = (predicted_labels == actual_labels).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

# Confusion matrix with predicted labels as rows and actual labels as columns.
# Both axes are reindexed to the full label set: if the model never predicts
# one of the classes, the raw crosstab is missing that row, the matrix is no
# longer square, and the diagonal no longer lines up with the column totals.
class_labels = sorted(set(actual_labels.dropna()) | set(predicted_labels.dropna()))
conf_matrix = pd.crosstab(
    predicted_labels,
    actual_labels,
    rownames=["Predicted"],
    colnames=["Actual"],
).reindex(
    index=pd.Index(class_labels, name="Predicted"),
    columns=pd.Index(class_labels, name="Actual"),
    fill_value=0,
)

# For each actual class: correctly predicted rows as a percentage of all rows
# of that class (diagonal cell divided by its column total).
conf_matrix_diag = conf_matrix.values.diagonal() / conf_matrix.sum(axis=0) * 100
print("Confusion Matrix:")
print(conf_matrix)
print("Diagonal Percentages:")
print(conf_matrix_diag)

print("-------------- DEEPER --------------")

# Deeper model
# Three hidden layers of ten Rectifier units each; every other setting is the
# same as for the baseline network trained earlier in this script.
# NOTE(review): despite the `_balanced` suffix in the variable name,
# balance_classes is False here -- confirm whether balancing was intended.
deeper_params = dict(
    hidden=[10, 10, 10],
    activation="Rectifier",
    loss="CrossEntropy",
    epochs=10000,
    rate=0.01,
    adaptive_rate=False,  # use the manual `rate` above instead of the adaptive scheme
    balance_classes=False,
    stopping_rounds=0,  # 0 turns metric-based early stopping off
    classification_stop=-1,  # -1 turns the training-error stopping criterion off
    score_each_iteration=True,  # record a scoring-history entry every iteration
)
dl_model_balanced = H2ODeepLearningEstimator(**deeper_params)

# Same positional predictor selection as the baseline: the first 19 columns
# of the training frame.
dl_model_balanced.train(
    x=h2o_train.columns[0:19],
    y="Churn",
    training_frame=h2o_train,
)

# Training log-loss per epoch for the deeper model.
# Fetch the scoring history once and reuse it for both axes.
deeper_history = dl_model_balanced.scoring_history()
plt.figure()  # draw on a fresh figure rather than whatever figure is current
plt.plot(deeper_history.epochs, deeper_history.training_logloss)
plt.xlabel('Epochs')
plt.ylabel('log_loss')
plt.grid()
# Without show() the figure is built but never displayed when this file is run
# as a plain script. The call blocks until the plot window is closed.
plt.show()

# Prediction
# Score the hold-out frame with the deeper model and bring the predicted
# labels and the test frame back into pandas.
deeper_scores = dl_model_balanced.predict(h2o_test)
prediction = deeper_scores["predict"].as_data_frame()
h2o_test_df = h2o_test.as_data_frame()

# Calculate accuracy
# Percentage of test rows whose predicted label equals the actual label.
is_correct = prediction["predict"] == h2o_test_df["Churn"]
accuracy = is_correct.mean() * 100
print(f"Accuracy: {accuracy:.2f}%")