# Listing metadata from the source viewer: 152 lines, 5.1 KiB, Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# 1. Linear SVC
# Simulate a small two-class problem: 20 points in 2-D, the first 10 labelled -1
# and the last 10 labelled +1.
np.random.seed(1)  # fixed seed so the simulated data is reproducible
x = np.random.normal(size=(20, 2))
y = np.concatenate([-np.ones(10), np.ones(10)])
# Shift the +1 class by one unit in both coordinates so the class means differ.
x[y == 1, :] += 1

# Optional quick look at the raw data (left disabled):
# plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
# plt.grid(True)
# plt.show()

# 2. Linear SVC fitting
# Keep features and label in one frame so columns can be selected by name.
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})
# Support vector classifier with a linear kernel and penalty parameter C=10.
svm_model = svm.SVC(kernel='linear', C=10)
svm_model.fit(data[['x1', 'x2']], data['y'])

# 3. Visualizing
# Decision-function plot for svm_model. Disabled; change the condition to True
# to display it.
if False:
    xx, yy = np.meshgrid(np.linspace(-2.1, 2.5, 100), np.linspace(-1.3, 2.6, 100))
    # Evaluate on a frame that carries the training column names, since the
    # model was fitted on a DataFrame with columns x1 and x2.
    grid = pd.DataFrame({'x1': xx.ravel(), 'x2': yy.ravel()})
    Z = svm_model.decision_function(grid).reshape(xx.shape)

    # Levels -1, 0, 1 shade the two bands on either side of the decision
    # boundary (decision value 0) out to decision values -1 and +1.
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(-2.1, 2.5)
    plt.ylim(-1.3, 2.6)
    plt.grid(True)
    plt.show()

# 4. Indices of Support vectors
# support_ is the fitted model's array of support-vector indices.
print("Indices of Support Vectors:", svm_model.support_)

# 6. Controlling the cost hyperparameter
# In scikit-learn, C *is* the penalty on margin violations (regularization
# strength is inversely proportional to C). A "low cost" fit therefore needs a
# C smaller than the C=10 used for svm_model above, not a larger one.
svm_model_low_cost = svm.SVC(kernel='linear', C=0.1)
svm_model_low_cost.fit(data[['x1', 'x2']], data['y'])

# Decision-function plot for the low-cost model. Disabled; change the
# condition to True to display it.
if False:
    xx, yy = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
    # Evaluate on a frame that carries the training column names, since the
    # model was fitted on a DataFrame with columns x1 and x2.
    grid = pd.DataFrame({'x1': xx.ravel(), 'x2': yy.ravel()})
    Z = svm_model_low_cost.decision_function(grid).reshape(xx.shape)

    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    # Same viewing window as the first plot so the two fits can be compared.
    plt.xlim(-2.1, 2.5)
    plt.ylim(-1.3, 2.6)
    plt.grid(True)
    plt.show()

print("Indices of Support Vectors (Low Cost):", svm_model_low_cost.support_)
print("Model Summary (Low Cost):\n", svm_model_low_cost)

# 7. Which cost value is better?
np.random.seed(1)
# Candidate values of C, compared by 5-fold cross-validation.
tune_params = {
    'C': [
        0.01, 0.02, 0.03, 0.04, 0.05, 0.06,
        0.1, 0.2, 0.3, 0.4, 0.5, 0.6,
        1, 2, 3, 4,
    ],
}
svm_tune = GridSearchCV(svm.SVC(kernel='linear'), tune_params, cv=5)
svm_tune.fit(data[['x1', 'x2']], data['y'])

# 8. Using the best model
# best_estimator_ is the estimator GridSearchCV selected from the grid.
best_model = svm_tune.best_estimator_
print("Best Model:\n", best_model)

# 9. Testing your model
# Simulate a test set the same way as the training data: standard-normal
# points, with the +1 class shifted by one unit. Labels are drawn at random.
x_test = np.random.normal(size=(20, 2))
y_test = np.random.choice([-1, 1], size=20, replace=True)
x_test[y_test == 1, :] += 1
test_data = pd.DataFrame({'x1': x_test[:, 0], 'x2': x_test[:, 1], 'y': y_test})

y_pred = best_model.predict(test_data[['x1', 'x2']])
# confusion_matrix expects (y_true, y_pred): rows are the true labels and
# columns are the predicted labels.
print("Confusion Matrix:\n", confusion_matrix(test_data['y'], y_pred))

# 10. Is linear boundary always the best choice?
# Simulate data that a straight line cannot separate: the -1 class is split
# into two clusters (shifted +2 and -2) and the +1 class sits unshifted
# between them.
np.random.seed(1)
x = np.random.normal(size=(200, 2))
x[:100, :] += 2
x[100:150, :] -= 2
y = np.concatenate([-np.ones(150), np.ones(50)])
data = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})

# Scatter plot of the simulated data. Disabled; change the condition to True
# to display it.
if False:
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.grid(True)
    plt.show()

# 11. Splitting and fitting
# Hold out 20% of the rows for testing; random_state fixes the split.
train, test = train_test_split(data, test_size=0.2, random_state=1)
# Radial-basis-function kernel, fitted on the training rows only.
svm_fit = svm.SVC(kernel='rbf', gamma=0.05, C=100)
svm_fit.fit(train[['x1', 'x2']], train['y'])

# Decision-function plot for the RBF model over the training points.
# Disabled; change the condition to True to display it.
if False:
    xx, yy = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
    # Evaluate on a frame that carries the training column names, since the
    # model was fitted on a DataFrame with columns x1 and x2.
    grid = pd.DataFrame({'x1': xx.ravel(), 'x2': yy.ravel()})
    Z = svm_fit.decision_function(grid).reshape(xx.shape)
    plt.contourf(xx, yy, Z, levels=[-1, 0, 1], cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(train['x1'], train['x2'], c=(3 - train['y']), marker='o', edgecolors='k')
    # plt.xlim(-2.1, 2.5)
    # plt.ylim(-1.3, 2.6)
    plt.grid(True)
    plt.show()

print("Model Summary:\n", svm_fit)

# Tuning hyperparameters
# Grid search over C and gamma for the RBF kernel with 5-fold cross-validation.
# Disabled; change the condition to True to run it. Everything that uses
# svm_tune_rbf stays inside this branch, because the name only exists when the
# search has actually been run.
if False:
    tune_params_rbf = {'C': np.arange(0.1, 3.1, 0.2), 'gamma': np.arange(0.1, 5.1, 0.1)}
    svm_tune_rbf = GridSearchCV(svm.SVC(kernel='rbf'), tune_params_rbf, cv=5)
    svm_tune_rbf.fit(train[['x1', 'x2']], train['y'])

    print("Best Model (RBF Kernel):\n", svm_tune_rbf.best_estimator_)
    print("Best Parameters (RBF Kernel):\n", svm_tune_rbf.best_params_)

    # 12. Prediction
    pred_rbf = svm_tune_rbf.predict(test[['x1', 'x2']])
    # confusion_matrix expects (y_true, y_pred): rows are the true labels and
    # columns are the predicted labels.
    print("Confusion Matrix (RBF Kernel):\n", confusion_matrix(test['y'], pred_rbf))

# 13. Multiclass
np.random.seed(1)
# Extend the existing x / y with 50 standard-normal points labelled 0 ...
x = np.vstack([x, np.random.normal(size=(50, 2))])
y = np.concatenate([y, np.zeros(50)])
# ... and one extra point at the origin labelled 2.
x = np.vstack([x, [[0, 0]]])
y = np.concatenate([y, [2]])

# Move class 0 up by two units along the second coordinate.
x[y == 0, 1] += 2
data_multiclass = pd.DataFrame({'x1': x[:, 0], 'x2': x[:, 1], 'y': y})

# Scatter plot of the multiclass data. Disabled; change the condition to True
# to display it.
if False:
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.grid(True)
    plt.show()

svm_fit_multiclass = svm.SVC(kernel='linear')
svm_fit_multiclass.fit(data_multiclass[['x1', 'x2']], data_multiclass['y'])

# Plot the regions predicted by the multiclass model together with the data.
if True:
    xx, yy = np.meshgrid(np.linspace(-5, 5, 100), np.linspace(-5, 5, 100))
    # Evaluate on a frame that carries the training column names, since the
    # model was fitted on a DataFrame with columns x1 and x2.
    grid = pd.DataFrame({'x1': xx.ravel(), 'x2': yy.ravel()})
    # Use the multiclass model's predicted label per grid point. With more
    # than two classes decision_function yields several values per point, so
    # it cannot be reshaped onto the grid the way the binary plots do.
    Z = svm_fit_multiclass.predict(grid).reshape(xx.shape)
    # One filled band per class: boundaries halfway between neighbouring
    # labels (the labels here are the consecutive values -1, 0, 1, 2).
    classes = svm_fit_multiclass.classes_
    levels = np.append(classes - 0.5, classes[-1] + 0.5)
    plt.contourf(xx, yy, Z, levels=levels, cmap=plt.cm.terrain, alpha=0.5)
    plt.scatter(x[:, 0], x[:, 1], c=(3 - y), marker='o', edgecolors='k')
    plt.xlim(-5, 5)
    plt.ylim(-5, 5)
    plt.grid(True)
    plt.show()