Remove figures
This commit is contained in:
parent
61313c1e87
commit
a0dd07b94a
1
.gitignore
vendored
1
.gitignore
vendored
@ -1 +1,2 @@
|
||||
Lab1/Figures/
|
||||
Lab2/Figures/
|
118
Lab2/5.2/part1.py
Normal file
118
Lab2/5.2/part1.py
Normal file
@ -0,0 +1,118 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
|
||||
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.model_selection import cross_val_score
|
||||
|
||||
# Load the Spotify music data
|
||||
MusicSpotify = pd.read_csv('music_spotify.csv')
|
||||
MusicSpotify = MusicSpotify.drop(columns=['X', 'artist', 'song_title'])
|
||||
#print(MusicSpotify.head())
|
||||
|
||||
# Encode the class labels as integers and store them back on the frame.
# The original call discarded the return value of fit_transform, so the
# 'target' column was never actually encoded.
label_encoder = LabelEncoder()
MusicSpotify['target'] = label_encoder.fit_transform(MusicSpotify['target'])
|
||||
|
||||
# Data division
|
||||
X = MusicSpotify.drop(columns=['target']) # features: every column except the class label 'target'
|
||||
y = MusicSpotify['target']
|
||||
|
||||
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
|
||||
|
||||
# Tree training (no cost complexity pruning)
|
||||
tree_music_spotify = DecisionTreeClassifier(ccp_alpha=0.00, min_samples_split=2)
|
||||
tree_music_spotify.fit(X_train, y_train)
|
||||
|
||||
pred_rbf = tree_music_spotify.predict(X_test)
|
||||
|
||||
|
||||
if False:
|
||||
# Plot the Decision Tree with matplotlib
|
||||
feature_names = X_train.columns.tolist() # Assuming X_train is a DataFrame
|
||||
class_names = y_train.unique().astype(str)
|
||||
plt.figure(figsize=(12, 8))
|
||||
plot_tree(tree_music_spotify, filled=True, feature_names=feature_names, class_names=class_names, rounded=True, fontsize=10)
|
||||
plt.show()
|
||||
|
||||
# Tree information and visualization
|
||||
path = tree_music_spotify.cost_complexity_pruning_path(X_train, y_train)
|
||||
ccp_alphas, impurities = path.ccp_alphas, path.impurities
|
||||
#path = tree_music_spotify.cost_complexity_pruning_path(X_train, y_train)
|
||||
if False:
|
||||
fig, ax = plt.subplots()
|
||||
ax.plot(ccp_alphas[:-1], impurities[:-1], marker="o", drawstyle="steps-post")
|
||||
ax.set_xlabel("effective alpha")
|
||||
ax.set_ylabel("total impurity of leaves")
|
||||
ax.set_title("Total Impurity vs effective alpha for training set")
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
# Vary the hyperparameter (e.g., max depth)
|
||||
depth_values = range(1, 25)
|
||||
cv_scores = []
|
||||
|
||||
for depth in depth_values:
|
||||
dt_classifier = DecisionTreeClassifier(max_depth=depth, random_state=20)
|
||||
scores = cross_val_score(dt_classifier, X, y, cv=10, scoring='accuracy')
|
||||
cv_scores.append(np.mean(scores))
|
||||
|
||||
# Plot the cross-validated error
|
||||
plt.plot(depth_values, 1 - np.array(cv_scores), marker='o')
|
||||
plt.xlabel('Tree depth')
|
||||
plt.ylabel('Cross-validated error rate')
|
||||
plt.title('Cross-validated error vs. tree depth')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
ccp_alphas_collect=[]
|
||||
accuracy_collect=[]
|
||||
for ccp_alpha in ccp_alphas:
|
||||
tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
|
||||
tree.fit(X_train, y_train)
|
||||
accuracy = tree.score(X_test, y_test)
|
||||
ccp_alphas_collect.append(ccp_alpha)
|
||||
accuracy_collect.append(accuracy)
|
||||
|
||||
plt.plot(np.array(ccp_alphas_collect),np.array(accuracy_collect))
|
||||
plt.grid()
|
||||
plt.xlim(0, 0.03)
|
||||
plt.xlabel('CP alpha')
|
||||
plt.ylabel('Accuracy')
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
plt.figure(figsize=(12, 6))
|
||||
plot_tree(tree_music_spotify, filled=True, feature_names=X.columns.tolist())
|
||||
plt.show()
|
||||
|
||||
# Prediction and accuracy
|
||||
pred = tree_music_spotify.predict(X_test)
|
||||
accuracy = accuracy_score(y_test, pred) * 100
|
||||
print("Unprunned:")
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
||||
print("10-fold cross-validation score: ", cross_val_score(tree_music_spotify, X, y, cv=10).mean())
|
||||
print("Confusion Matrix:\n", confusion_matrix(pred, y_test))
|
||||
|
||||
# Pruning
|
||||
music_spotify_prunned = DecisionTreeClassifier(ccp_alpha=0.006, random_state=20)
|
||||
music_spotify_prunned.fit(X_train, y_train)
|
||||
pred = music_spotify_prunned.predict(X_test)
|
||||
print("Prunned:")
|
||||
print(f"Accuracy: {accuracy_score(y_test, pred) * 100:.2f}%")
|
||||
print("10-fold cross-validation score: ", cross_val_score(music_spotify_prunned, X, y, cv=10).mean())
|
||||
print("Confusion Matrix:\n", confusion_matrix(pred, y_test))
|
||||
|
||||
if False:
|
||||
plt.figure(figsize=(12, 6))
|
||||
plot_tree(music_spotify_prunned, filled=True, feature_names=X.columns.tolist())
|
||||
plt.show()
|
||||
|
||||
if True:
|
||||
feat_importances = pd.DataFrame(tree_music_spotify.feature_importances_, index=X_train.columns, columns=["Importance"])
|
||||
feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
|
||||
feat_importances.plot(kind='bar', figsize=(8,6))
|
||||
plt.show()
|
107
Lab2/5.2/part2.py
Normal file
107
Lab2/5.2/part2.py
Normal file
@ -0,0 +1,107 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, plot_tree
|
||||
from sklearn.metrics import accuracy_score
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn.preprocessing import LabelEncoder
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from time import time
|
||||
|
||||
print("Loading data")
|
||||
train_df = pd.read_csv("sign_mnist_train.csv")
|
||||
test_df = pd.read_csv("sign_mnist_test.csv")
|
||||
|
||||
def remove_every_nth_column(df, count):
    """Return ``df`` without every ``count``-th column.

    Columns are numbered from 1 by position, so with ``count=3`` the 3rd,
    6th, 9th, ... columns are dropped and all others are kept in order.

    Args:
        df: DataFrame whose columns are filtered by position.
        count: Step between dropped columns; must be non-zero.

    Returns:
        A DataFrame holding only the columns whose 1-based position is not
        a multiple of ``count``.

    Raises:
        ValueError: If ``count`` is 0 (the position test would otherwise be
            a modulo by zero).
    """
    if count == 0:
        raise ValueError("count must be non-zero")
    positions = np.arange(len(df.columns)) + 1  # 1-based column positions
    return df.loc[:, positions % count != 0]
|
||||
|
||||
def leave_every_nth_column(df, count):
    """Return only every ``count``-th column of ``df``.

    Columns are numbered from 1 by position, so with ``count=3`` only the
    3rd, 6th, 9th, ... columns are kept, in their original order.

    Args:
        df: DataFrame whose columns are filtered by position.
        count: Step between kept columns; must be non-zero.

    Returns:
        A DataFrame holding only the columns whose 1-based position is a
        multiple of ``count``.

    Raises:
        ValueError: If ``count`` is 0 (the position test would otherwise be
            a modulo by zero).
    """
    if count == 0:
        raise ValueError("count must be non-zero")
    positions = np.arange(len(df.columns)) + 1  # 1-based column positions
    return df.loc[:, positions % count == 0]
|
||||
|
||||
X_train, X_test = train_df.drop(columns=["label"]), test_df.drop(columns=["label"])
|
||||
y_train, y_test = train_df["label"], test_df["label"]
|
||||
|
||||
# # ('leave_every_nth_column_5', 0.6002509760178472)
|
||||
# X_train = leave_every_nth_column(X_train, 5)
|
||||
# X_test = leave_every_nth_column(X_test, 5)
|
||||
|
||||
# # ('remove_every_nth_column_15', 0.6167038482989403)
|
||||
# X_train = remove_every_nth_column(X_train, 15)
|
||||
# X_test = remove_every_nth_column(X_test, 15)
|
||||
|
||||
# # ('remove_every_nth_column_14', 0.6179587283881762)
|
||||
# X_train = remove_every_nth_column(X_train, 14)
|
||||
# X_test = remove_every_nth_column(X_test, 14)
|
||||
|
||||
# # ('remove_every_nth_column_11', 0.6183770217512549)
|
||||
# X_train = remove_every_nth_column(X_train, 11)
|
||||
# X_test = remove_every_nth_column(X_test, 11)
|
||||
|
||||
start = time()
|
||||
ccp_alpha = 0.0009
|
||||
print("Training", ccp_alpha)
|
||||
decision_tree = DecisionTreeClassifier(ccp_alpha=ccp_alpha)
|
||||
decision_tree.fit(X_train, y_train)
|
||||
print(time() - start)
|
||||
|
||||
if False:
    # Optional diagnostic: test-set accuracy as a function of the
    # cost-complexity pruning strength.
    print("Viz")
    # The candidate alphas come from the pruning path computed on the
    # training data; the original loop iterated over `ccp_alphas` without
    # ever defining it in this script.
    ccp_alphas = decision_tree.cost_complexity_pruning_path(X_train, y_train).ccp_alphas
    ccp_alphas_collect = []
    accuracy_collect = []
    # `alpha` (not `ccp_alpha`) so the module-level `ccp_alpha` used for the
    # main model above is not overwritten by the sweep.
    for alpha in ccp_alphas:
        tree = DecisionTreeClassifier(ccp_alpha=alpha)
        tree.fit(X_train, y_train)
        ccp_alphas_collect.append(alpha)
        accuracy_collect.append(tree.score(X_test, y_test))

    plt.plot(np.array(ccp_alphas_collect), np.array(accuracy_collect))
    plt.grid()
    plt.xlabel('effective alpha')
    plt.ylabel('Accuracy of test set')
    plt.show()
|
||||
|
||||
pred = decision_tree.predict(X_test)
|
||||
accuracy = accuracy_score(y_test, pred) * 100
|
||||
print("Unprunned:")
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
||||
#print("10-fold cross-validation score: ", cross_val_score(decision_tree, X_test, y_test, cv=10).mean())
|
||||
|
||||
if False:
|
||||
plt.figure(figsize=(12, 6))
|
||||
plot_tree(decision_tree, filled=True, feature_names=X_test.columns.tolist())
|
||||
plt.show()
|
||||
|
||||
if False:
    # Pruning: refit the tree with a stronger cost-complexity penalty and
    # report how it performs on the test set.
    decision_tree_prunned = DecisionTreeClassifier(ccp_alpha=0.01)
    decision_tree_prunned.fit(X_train, y_train)
    print("Prunned:")
    # Score against the true labels. The original compared the pruned
    # predictions with `pred` (the other tree's predictions), which measures
    # agreement between the two models rather than accuracy.
    print(f"Accuracy: {accuracy_score(y_test, decision_tree_prunned.predict(X_test)) * 100:.2f}%")
    print("10-fold cross-validation score: ", cross_val_score(decision_tree_prunned, X_test, y_test, cv=10).mean())
|
||||
|
||||
if True:
|
||||
if True:
|
||||
feat_importances = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
|
||||
print(decision_tree.feature_importances_)
|
||||
print(len(decision_tree.feature_importances_))
|
||||
print(X_train.columns)
|
||||
|
||||
image = []
|
||||
for y in range(28):
|
||||
image.append([0]*28)
|
||||
|
||||
|
||||
for idx, importance in enumerate(decision_tree.feature_importances_):
|
||||
pixel_idx = int(X_train.columns[idx].removeprefix("pixel"))-1
|
||||
pixel_x = pixel_idx % 28
|
||||
pixel_y = pixel_idx // 28
|
||||
image[pixel_y][pixel_x] = importance
|
||||
|
||||
for y in range(28):
|
||||
print(",".join(str(a) for a in image[y]))
|
||||
|
||||
# feat_importances = pd.DataFrame(decision_tree.feature_importances_, index=X_train.columns, columns=["Importance"])
|
||||
# feat_importances.sort_values(by='Importance', ascending=False, inplace=True)
|
||||
# feat_importances.plot(kind='bar', figsize=(8,6))
|
||||
# plt.show()
|
151
Lab2/6.1/example.py
Normal file
151
Lab2/6.1/example.py
Normal file
@ -0,0 +1,151 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn import svm
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.metrics import confusion_matrix
|
||||
|
||||
# 1. Linear SVC
|
||||
np.random.seed(1)
|
||||
x = np.random.normal(size=(20, 2))
|
||||
y = np.concatenate([-np.ones(10), np.ones(10)])
|
||||
x[y == 1, :] = x[y == 1, :] + 1
|
||||
# plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
|
||||
# plt.grid(True)
|
||||
# plt.show()
|
||||
|
||||
# 2. Linear SVC fitting
|
||||
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
|
||||
svm_model = svm.SVC(kernel='linear', C=10)
|
||||
svm_model.fit(data[['x1', 'x2']], data['y'])
|
||||
|
||||
# 3. Visualizing
|
||||
if False:
|
||||
xx, yy = np.meshgrid(np.linspace(-2.1, 2.5, 100), np.linspace(-1.3, 2.6, 100))
|
||||
Z = svm_model.decision_function(np.c_[xx.ravel(), yy.ravel()])
|
||||
Z = Z.reshape(xx.shape)
|
||||
|
||||
plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
|
||||
plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
|
||||
plt.xlim(-2.1, 2.5)
|
||||
plt.ylim(-1.3, 2.6)
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
# 4. Indices of Support vectors
|
||||
print("Indices of Support Vectors:", svm_model.support_)
|
||||
|
||||
# 6. Controlling the cost=1/C hyperparameter
|
||||
svm_model_low_cost = svm.SVC(kernel='linear', C=100)
|
||||
svm_model_low_cost.fit(data[['x1', 'x2']], data['y'])
|
||||
|
||||
if False:
|
||||
xx, yy = np.meshgrid(np.linspace(-5,5, 100), np.linspace(-5,5, 100))
|
||||
Z = svm_model_low_cost.decision_function(np.c_[xx.ravel(), yy.ravel()])
|
||||
Z = Z.reshape(xx.shape)
|
||||
|
||||
plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
|
||||
plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
|
||||
plt.xlim(-2.1, 2.5)
|
||||
plt.ylim(-1.3, 2.6)
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
print("Indices of Support Vectors (Low Cost):", svm_model_low_cost.support_)
|
||||
print("Model Summary (Low Cost):\n", svm_model_low_cost)
|
||||
|
||||
# 7. Which cost value is better?
|
||||
np.random.seed(1)
|
||||
tune_params = {'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 1, 2, 3, 4]}
|
||||
svm_tune = GridSearchCV(svm.SVC(kernel='linear'), tune_params, cv=5)
|
||||
svm_tune.fit(data[['x1', 'x2']], data['y'])
|
||||
|
||||
# 8. Using the best model
|
||||
best_model = svm_tune.best_estimator_
|
||||
print("Best Model:\n", best_model)
|
||||
|
||||
# 9. Testing your model
|
||||
x_test = np.random.normal(size=(20, 2))
|
||||
y_test = np.random.choice([-1, 1], size=20, replace=True)
|
||||
x_test[y_test == 1, :] = x_test[y_test == 1, :] + 1
|
||||
test_data = pd.DataFrame({'x1': x_test[:, 0], 'x2': x_test[:, 1], 'y': y_test})
|
||||
|
||||
y_pred = best_model.predict(test_data[['x1', 'x2']])
|
||||
print("Confusion Matrix:\n", confusion_matrix(y_pred, test_data['y']))
|
||||
|
||||
# 10. Is linear boundary always the best choice?
|
||||
np.random.seed(1)
|
||||
x = np.random.normal(size=(200, 2))
|
||||
x[:100, :] = x[:100, :] + 2
|
||||
x[100:150, :] = x[100:150, :] - 2
|
||||
y = np.concatenate([-np.ones(150), np.ones(50)])
|
||||
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
|
||||
if False:
|
||||
plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
# 11. Splitting and fitting
|
||||
train, test = train_test_split(data, test_size=0.2, random_state=1)
|
||||
svm_fit = svm.SVC(kernel='rbf', gamma=0.05, C=100)
|
||||
svm_fit.fit(train[['x1', 'x2']], train['y'])
|
||||
|
||||
if False:
|
||||
xx, yy = np.meshgrid(np.linspace(-5,5, 100), np.linspace(-5,5, 100))
|
||||
Z = svm_fit.decision_function(np.c_[xx.ravel(), yy.ravel()])
|
||||
Z = Z.reshape(xx.shape)
|
||||
plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
|
||||
plt.scatter(train['x1'], train['x2'], c=(3 - train['y']), marker='o', edgecolors='k')
|
||||
#plt.xlim(-2.1, 2.5)
|
||||
#plt.ylim(-1.3, 2.6)
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
print("Model Summary:\n", svm_fit)
|
||||
|
||||
# Tuning hyperparameters
|
||||
if False:
|
||||
tune_params_rbf = {'C': np.arange(0.1, 3.1, 0.2), 'gamma': np.arange(0.1, 5.1, 0.1)}
|
||||
svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
|
||||
svm_tune_rbf.fit(train[['x1', 'x2']], train['y'])
|
||||
|
||||
print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
|
||||
print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)
|
||||
|
||||
# 12. Prediction
|
||||
pred_rbf = svm_tune_rbf.predict(test[['x1', 'x2']])
|
||||
print("Confusion Matrix (RBF Kernel):\n", confusion_matrix(pred_rbf, test['y']))
|
||||
|
||||
# 13. Multiclass
|
||||
np.random.seed(1)
|
||||
x = np.vstack([x, np.random.normal(size=(50, 2))])
|
||||
y = np.concatenate([y, np.zeros(50)])
|
||||
x = np.vstack([x, [[0, 0]]])
|
||||
y = np.concatenate([y, [2]])
|
||||
|
||||
|
||||
x[y == 0, 1] = x[y == 0, 1] + 2
|
||||
data_multiclass = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
|
||||
|
||||
if False:
|
||||
plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
svm_fit_multiclass = svm.SVC(kernel='linear')
|
||||
svm_fit_multiclass.fit(data_multiclass[['x1', 'x2']], data_multiclass['y'])
|
||||
|
||||
if True:
    # Visualize the multiclass model trained just above. The original
    # evaluated `svm_fit` (the binary RBF model from step 11), so the
    # plotted regions did not belong to `svm_fit_multiclass` at all.
    xx, yy = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
    # Build the grid with the same column names the model was fitted on.
    grid = pd.DataFrame({'x1': xx.ravel(), 'x2': yy.ravel()})
    # predict() yields one class label per grid point, so it reshapes to the
    # mesh directly (a multiclass decision_function would be 2-D).
    Z = svm_fit_multiclass.predict(grid)
    print("xx.shape", xx.shape)
    print(Z.shape)
    Z = Z.reshape(xx.shape)
    print(Z.shape)
    # One band per class label present in y (-1, 0, 1, 2).
    plt.contourf(xx, yy, Z, levels=[-1.5, -0.5, 0.5, 1.5, 2.5], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(-5, 5)
    plt.ylim(-5, 5)
    plt.grid(True)
    plt.show()
|
133
Lab2/6.1/main.py
Normal file
133
Lab2/6.1/main.py
Normal file
@ -0,0 +1,133 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
from sklearn import svm
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.model_selection import GridSearchCV
|
||||
from sklearn.model_selection import cross_val_score
|
||||
from sklearn.metrics import accuracy_score, roc_curve, confusion_matrix
|
||||
from sklearn.decomposition import PCA
|
||||
|
||||
train_df = pd.read_csv("satellite_train.csv")
|
||||
test_df = pd.read_csv("satellite_test.csv")
|
||||
|
||||
X_train, X_test = train_df.drop(columns=["V37"]), test_df.drop(columns=["V37"])
|
||||
y_train, y_test = train_df["V37"], test_df["V37"]
|
||||
|
||||
if False:
|
||||
value_counts = y_train.value_counts(sort=False)
|
||||
asc_index = sorted(value_counts.index)
|
||||
asc_values = [value_counts[idx] for idx in asc_index ]
|
||||
|
||||
plt.bar(asc_index, asc_values, alpha=0.7)
|
||||
plt.xticks(asc_index, labels=["red soil", "cotton crop", "grey soil","damp grey soil","soil with vegetation", "very damp grey soil"])
|
||||
plt.xlabel("Values")
|
||||
plt.ylabel("Frequency")
|
||||
plt.show()
|
||||
|
||||
#clf = svm.SVC(kernel='rbf', C=0.7, gamma=0.1)
|
||||
#clf = svm.SVC(kernel='linear')
|
||||
#clf.fit(X_train[["V1", "V2"]], y_train)
|
||||
|
||||
#clf_tuned = GridSearchCV(svm.SVC(kernel='linear'), {'C': np.arange(0.1, 1.6, 0.2)}, cv=5)
|
||||
clf_tuned = svm.SVC(kernel='rbf',C=0.19, gamma=0.00024) #GridSearchCV(svm.SVC(kernel='rbf'), {'C': [0.17, 0.18, 0.19, 0.2], 'gamma': [0.00023, 0.00024, 0.00025, 0.00026, 0.00027]}, cv=5)
|
||||
clf_tuned.fit(X_train, y_train)
|
||||
pred = clf_tuned.predict(X_test)
|
||||
accuracy = accuracy_score(y_test, pred) * 100
|
||||
#print("Best Model (Linear Kernel):\n", clf_tuned.best_estimator_)
|
||||
#print("Best Parameters (Linear Kernel):\n", clf_tuned.best_params_)
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
||||
print("Confusion Matrix:\n", confusion_matrix(pred, y_test))
|
||||
print("10-fold cross-validation score: ", cross_val_score(clf_tuned, X_test, y_test, cv=10).mean())
|
||||
|
||||
if False:
|
||||
plt.figure(figsize=(8, 6))
|
||||
|
||||
# Plot the training points
|
||||
plt.scatter(X_train["V1"], X_train["V2"], c=y_train, cmap=plt.cm.Paired, marker='.', s=20)
|
||||
|
||||
h = 0.2 # step size in the mesh
|
||||
x_min, x_max = X_train["V1"].min(), X_train["V1"].max()
|
||||
y_min, y_max = X_train["V2"].min(), X_train["V2"].max()
|
||||
xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
|
||||
Z = clf_tuned.predict(np.c_[xx.ravel(), yy.ravel()],)
|
||||
Z = Z.reshape(xx.shape)
|
||||
|
||||
# Plot decision boundary and margins
|
||||
plt.contour(xx, yy, Z, colors='k', alpha=0.5)
|
||||
|
||||
# Highlight the support vectors
|
||||
#plt.scatter(clf_tuned.support_vectors_[:, 0], clf_tuned.support_vectors_[:, 1], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
|
||||
|
||||
#plt.title('SVM Decision Boundary')
|
||||
plt.xlabel('Feature V1')
|
||||
plt.ylabel('Feature V2')
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
# Apply PCA to reduce the dimensionality to 2D
|
||||
pca = PCA(n_components=2)
|
||||
X_2d = pca.fit_transform(X_train)
|
||||
#print(pca.components_)
|
||||
|
||||
# Plot the 2D representation of the data
|
||||
plt.figure(figsize=(8, 6))
|
||||
plt.scatter(X_2d[:, 0], X_2d[:, 1], c=y_train, cmap=plt.cm.Paired, marker='.', s=20)
|
||||
plt.title('2D Projection of High-Dimensional Data using PCA')
|
||||
plt.xlabel('Principal Component 1')
|
||||
plt.ylabel('Principal Component 2')
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
tune_params_linear = {'C': np.arange(0.1, 2.1, 0.2)}
|
||||
svm_tune_linear = GridSearchCV(svm.SVC(kernel='linear'), tune_params_linear, cv=10)
|
||||
svm_tune_linear.fit(X_train, y_train)
|
||||
|
||||
print("Best Model (Linear Kernel):\n", svm_tune_linear.best_estimator_)
|
||||
print("Best Parameters (Linear Kernel):\n", svm_tune_linear.best_params_)
|
||||
|
||||
plt.figure(figsize=(8, 6))
|
||||
h = 0.2 # step size in the mesh
|
||||
x_min, x_max = X_train["V1"].min(), X_train["V1"].max()
|
||||
y_min, y_max = X_train["V2"].min(), X_train["V2"].max()
|
||||
xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
|
||||
Z = svm_tune_linear.predict(np.c_[xx.ravel(), yy.ravel()])
|
||||
|
||||
# Plot decision boundary and margins
|
||||
#plt.contour(xx, yy, Z, colors='k', alpha=0.5)
|
||||
|
||||
# Highlight the support vectors
|
||||
plt.scatter(svm_tune_linear.support_vectors_[:, 0], svm_tune_linear.support_vectors_[:, 1], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
|
||||
|
||||
plt.title('SVM Decision Boundary')
|
||||
plt.xlabel('Feature V1')
|
||||
plt.ylabel('Feature V2')
|
||||
plt.legend()
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
tune_params_linear = {'C': np.arange(0.1, 2.1, 0.2), 'gamma': np.arange(0.1, 4.1, 0.1)}
|
||||
svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_linear, cv=5)
|
||||
svm_tune_rbf.fit(X_train, y_train)
|
||||
|
||||
print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
|
||||
print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)
|
||||
|
||||
plt.figure(figsize=(8, 6))
|
||||
h = 0.2 # step size in the mesh
|
||||
x_min, x_max = X_train["V1"].min(), X_train["V1"].max()
|
||||
y_min, y_max = X_train["V2"].min(), X_train["V2"].max()
|
||||
xx, yy = np.meshgrid(np.arange(x_min, x_max, 1), np.arange(y_min, y_max, 1))
|
||||
Z = svm_tune_rbf.predict(np.c_[xx.ravel(), yy.ravel()],)
|
||||
|
||||
# Plot decision boundary and margins
|
||||
plt.contour(xx, yy, Z, colors='k', alpha=0.5)
|
||||
|
||||
# Highlight the support vectors
|
||||
plt.scatter(svm_tune_rbf.support_vectors_[:, 0], svm_tune_rbf.support_vectors_[:, 1], s=20, linewidth=1, facecolors='none', edgecolors='k', marker='o', label='Support Vectors')
|
||||
|
||||
plt.title('SVM Decision Boundary')
|
||||
plt.xlabel('Feature V1')
|
||||
plt.ylabel('Feature V2')
|
||||
plt.legend()
|
||||
plt.show()
|
115
Lab2/7.1/example.py
Normal file
115
Lab2/7.1/example.py
Normal file
@ -0,0 +1,115 @@
|
||||
import h2o
|
||||
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
|
||||
from h2o.grid.grid_search import H2OGridSearch
|
||||
import pandas as pd
|
||||
from sklearn.model_selection import train_test_split
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
|
||||
# Initialize H2O cluster
|
||||
h2o.init(nthreads=-1, max_mem_size="2G")
|
||||
|
||||
# Load data
|
||||
data = pd.read_csv("customer_churn.csv")
|
||||
#print(data.head())
|
||||
|
||||
tenure_churned = data[data['Churn'] == 'Yes']['tenure']
|
||||
tenure_stayed = data[data['Churn'] == 'No']['tenure']
|
||||
|
||||
if False:
|
||||
plt.figure()
|
||||
sns.kdeplot(tenure_churned, color='firebrick', linewidth=3, label='Churned')
|
||||
sns.kdeplot(tenure_stayed, color='dodgerblue', linewidth=3, label='Stayed')
|
||||
|
||||
plt.title('Tenure Variable')
|
||||
plt.xlabel('Number of months the customer is with the company')
|
||||
plt.legend(loc='upper right')
|
||||
plt.grid(True)
|
||||
plt.show()
|
||||
|
||||
# Split data
|
||||
seed = 1
|
||||
X_train, X_test, y_train, y_test = train_test_split(
|
||||
data.drop("Churn", axis=1), # Features
|
||||
data["Churn"], # Target variable
|
||||
test_size=0.2,
|
||||
random_state=seed
|
||||
)
|
||||
X_train['Churn']=y_train
|
||||
X_test['Churn']=y_test
|
||||
|
||||
# Assuming 'train' is a pandas DataFrame
|
||||
h2o_train = h2o.H2OFrame(X_train)
|
||||
|
||||
# Assuming 'test' is a pandas DataFrame
|
||||
h2o_test = h2o.H2OFrame(X_test)
|
||||
|
||||
# Training the neural network
|
||||
dl_model = H2ODeepLearningEstimator(
|
||||
activation="Rectifier",
|
||||
hidden=[2,2],
|
||||
loss="CrossEntropy",
|
||||
score_each_iteration=True,
|
||||
epochs=10000,
|
||||
balance_classes=False,
|
||||
rate=0.01,
|
||||
adaptive_rate=False,
|
||||
stopping_rounds=0,
|
||||
classification_stop=-1
|
||||
)
|
||||
|
||||
dl_model.train(x=h2o_train.columns[0:19], y="Churn", training_frame=h2o_train)
|
||||
|
||||
if False:
|
||||
plt.plot(dl_model.scoring_history().epochs,dl_model.scoring_history().training_logloss)
|
||||
plt.xlabel('Epochs')
|
||||
plt.ylabel('log_loss')
|
||||
plt.grid()
|
||||
plt.show()
|
||||
|
||||
# Prediction
|
||||
prediction = dl_model.predict(h2o_test)
|
||||
prediction = prediction["predict"].as_data_frame()
|
||||
h2o_test_df = h2o_test.as_data_frame()
|
||||
|
||||
# Calculate accuracy
|
||||
accuracy = (prediction['predict']== h2o_test_df["Churn"]).mean() * 100
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
||||
|
||||
conf_matrix = pd.crosstab(prediction["predict"], h2o_test_df["Churn"], rownames=["Predicted"], colnames=["Actual"])
|
||||
conf_matrix_diag = conf_matrix.values.diagonal() / conf_matrix.sum(axis=0) * 100
|
||||
print("Confusion Matrix:")
|
||||
print(conf_matrix)
|
||||
print("Diagonal Percentages:")
|
||||
print(conf_matrix_diag)
|
||||
|
||||
print("-------------- DEEPER --------------")
|
||||
|
||||
# Deeper model
|
||||
dl_model_balanced = H2ODeepLearningEstimator(
|
||||
activation="Rectifier",
|
||||
hidden=[10,10,10],
|
||||
loss="CrossEntropy",
|
||||
score_each_iteration=True,
|
||||
epochs=10000,
|
||||
balance_classes=False,
|
||||
rate=0.01,
|
||||
adaptive_rate=False,
|
||||
stopping_rounds=0,
|
||||
classification_stop=-1
|
||||
)
|
||||
|
||||
dl_model_balanced.train(x=h2o_train.columns[0:19], y="Churn", training_frame=h2o_train);
|
||||
|
||||
plt.plot(dl_model_balanced.scoring_history().epochs,dl_model_balanced.scoring_history().training_logloss)
|
||||
plt.xlabel('Epochs')
|
||||
plt.ylabel('log_loss')
|
||||
plt.grid()
|
||||
|
||||
# Prediction
|
||||
prediction = dl_model_balanced.predict(h2o_test)
|
||||
prediction = prediction["predict"].as_data_frame()
|
||||
h2o_test_df = h2o_test.as_data_frame()
|
||||
# Calculate accuracy
|
||||
accuracy = (prediction['predict']== h2o_test_df["Churn"]).mean() * 100
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
122
Lab2/7.1/main.py
Normal file
122
Lab2/7.1/main.py
Normal file
@ -0,0 +1,122 @@
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
import h2o
|
||||
from h2o.estimators.deeplearning import H2ODeepLearningEstimator
|
||||
from h2o.grid.grid_search import H2OGridSearch
|
||||
|
||||
train_df = pd.read_csv("arrhythmia_train.csv")
|
||||
test_df = pd.read_csv("arrhythmia_test.csv")
|
||||
|
||||
#X_train, X_test = train_df.drop(columns=["arrhythmia"]), test_df.drop(columns=["arrhythmia"])
|
||||
#y_train, y_test = train_df["arrhythmia"], test_df["arrhythmia"]
|
||||
|
||||
# 1.
|
||||
if False:
|
||||
target_class = 0
|
||||
max_count = 10
|
||||
count = 0
|
||||
cmap = plt.cm.get_cmap('hsv', max_count)
|
||||
idx = 0
|
||||
for i in range(len(X_train["X1"])):
|
||||
if y_train[i] != target_class: continue
|
||||
y = []
|
||||
x = []
|
||||
|
||||
for j in range(187):
|
||||
if X_train[f"X{j+1}"][i] == 0: break
|
||||
x.append(idx)
|
||||
y.append(X_train[f"X{j+1}"][i])
|
||||
idx += 1
|
||||
|
||||
plt.plot(x, y, c=cmap(count))
|
||||
|
||||
count += 1
|
||||
if count == max_count: break
|
||||
plt.show()
|
||||
|
||||
# 2.
|
||||
if False:
|
||||
x_feature = "X20"
|
||||
y_feature = "X50"
|
||||
|
||||
for point_class in y_test.unique():
|
||||
plt.scatter(X_train[y_train == point_class][x_feature], X_train[y_train == point_class][y_feature], s=2, label=str(point_class))
|
||||
|
||||
plt.xlabel(x_feature)
|
||||
plt.ylabel(y_feature)
|
||||
plt.legend()
|
||||
plt.show()
|
||||
|
||||
# 3.
|
||||
if False:
|
||||
value_counts = y_train.value_counts(sort=False)
|
||||
asc_index = sorted(value_counts.index)
|
||||
asc_values = [value_counts[idx] for idx in asc_index]
|
||||
|
||||
plt.bar(asc_index, asc_values, alpha=0.7)
|
||||
plt.xticks(asc_index, labels=["normal (0)", "supraventricular (1)", "ventricular (2)","unknown (4)"])
|
||||
plt.ylabel("Frequencies")
|
||||
plt.title("Training dataset class distribution")
|
||||
plt.show()
|
||||
|
||||
if False:
|
||||
value_counts = y_test.value_counts(sort=False)
|
||||
asc_index = sorted(value_counts.index)
|
||||
asc_values = [value_counts[idx] for idx in asc_index]
|
||||
|
||||
plt.bar(asc_index, asc_values, alpha=0.7)
|
||||
plt.xticks(asc_index, labels=["normal (0)", "supraventricular (1)", "ventricular (2)","unknown (4)"])
|
||||
plt.ylabel("Frequencies")
|
||||
plt.title("Test dataset class distribution")
|
||||
plt.show()
|
||||
|
||||
# 4.
|
||||
if True:
|
||||
# Initialize H2O cluster
|
||||
h2o.init(nthreads=-1, max_mem_size="6G")
|
||||
|
||||
# Assuming 'train' is a pandas DataFrame
|
||||
h2o_train = h2o.H2OFrame(train_df)
|
||||
|
||||
# Assuming 'test' is a pandas DataFrame
|
||||
h2o_test = h2o.H2OFrame(test_df)
|
||||
|
||||
# Training the neural network
|
||||
dl_model = H2ODeepLearningEstimator(
|
||||
activation="tanh",
|
||||
hidden=[50, 20],
|
||||
input_dropout_ratio=0.2,
|
||||
l1=1e-5,
|
||||
epochs=20
|
||||
)
|
||||
|
||||
dl_model.train(x=h2o_train.columns[:-1], y="arrhythmia", training_frame=h2o_train, validation_frame=h2o_test)
|
||||
|
||||
# Prediction
|
||||
prediction = dl_model.predict(h2o_test)
|
||||
prediction = prediction["predict"].as_data_frame()
|
||||
h2o_test_df = h2o_test.as_data_frame()
|
||||
|
||||
# Calculate accuracy
|
||||
classess = [0,1,2,4]
|
||||
confusion_matrix = [[0]*5, [0]*5, [0]*5, [0]*5]
|
||||
for i in range(len(h2o_test_df["arrhythmia"])):
|
||||
expected = h2o_test_df["arrhythmia"][i]
|
||||
actual = round(prediction['predict'][i])
|
||||
if actual not in classess:
|
||||
confusion_matrix[classess.index(expected)][-1] += 1
|
||||
else:
|
||||
actual_idx = classess.index(actual)
|
||||
confusion_matrix[classess.index(expected)][actual_idx] += 1
|
||||
|
||||
|
||||
accuracy = (abs(prediction['predict'] - h2o_test_df["arrhythmia"]) < 0.5).mean() * 100
|
||||
print(f"Accuracy: {accuracy:.2f}%")
|
||||
|
||||
print("Confusion Matrix:")
|
||||
print("Expected | Predicted")
|
||||
print(" | 0 1 2 4 -1")
|
||||
for i, row in enumerate(confusion_matrix):
|
||||
print(" " + str(classess[i]) + " | "+ " ".join(f"{cell:5}" for cell in row))
|
BIN
Lab2/IF-1-1_Rokas_Puzonas.pdf
Normal file
BIN
Lab2/IF-1-1_Rokas_Puzonas.pdf
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user