"""Decision-tree and random-forest experiments on the apple quality dataset."""
import pandas as pd
|
|
import matplotlib.pyplot as plt
|
|
import numpy as np
|
|
from sklearn.ensemble import RandomForestClassifier
|
|
from sklearn.model_selection import train_test_split
|
|
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
|
|
from contextlib import contextmanager
|
|
import time
|
|
from sklearn.tree import export_graphviz
|
|
import graphviz
|
|
|
|
from sklearn.tree import DecisionTreeClassifier
|
|
|
|
def load_data(csv_path="apple_quality_clean.csv", target="Quality",
              test_size=0.2, random_state=42):
    """Read the apple table from CSV and split it into train/test features and labels.

    All parameters are optional; calling ``load_data()`` with no arguments
    reads ``apple_quality_clean.csv``, uses ``Quality`` as the label column
    and makes a seeded 80/20 split.

    Args:
        csv_path: Path of the CSV file handed to ``pandas.read_csv``.
        target: Name of the label column. Every other column is returned
            as a feature.
        test_size: Forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so that the
            split is reproducible between runs.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)`` where the ``*_x``
        items are the rows without the ``target`` column and the ``*_y``
        items are the ``target`` column of the same rows.

    Raises:
        KeyError: If ``target`` is not a column of the loaded table.
    """
    apples = pd.read_csv(csv_path)

    # Fail early with a readable message instead of the less specific
    # KeyError that DataFrame.drop would raise after the split.
    if target not in apples.columns:
        raise KeyError(f"column {target!r} not found in {csv_path!r}")

    train_apples, test_apples = train_test_split(
        apples, test_size=test_size, random_state=random_state
    )

    train_apples_x = train_apples.drop(columns=[target])
    train_apples_y = train_apples[target]
    test_apples_x = test_apples.drop(columns=[target])
    test_apples_y = test_apples[target]

    return train_apples_x, train_apples_y, test_apples_x, test_apples_y
|
|
|
|
def main_decision_tree():
    """Sweep decision-tree depth and plot test accuracy and fit time.

    For every ``max_depth`` from 1 to 18 a ``DecisionTreeClassifier`` is
    fitted on the training split returned by ``load_data()`` and scored on
    the test split with ``accuracy_score``. The accuracy of the last depth
    and the best accuracy (with its depth) are printed, then two line plots
    are shown side by side: accuracy vs. depth and fit duration vs. depth.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0

    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter() is the clock intended for measuring short
        # durations; time.time() follows the wall clock, which can be
        # adjusted while the fit runs and may have coarse resolution.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)

        # Strict comparison: on ties the shallowest depth is kept.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)

    # Left panel: accuracy per depth; right panel: fit duration per depth.
    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")

    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")

    plt.show()

    # Optional: confusion matrix. After the loop `tree` and `y_prediction`
    # belong to the last depth tried (18), not to the best depth.
    # cm = confusion_matrix(test_Y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Optional: render the last fitted tree with graphviz.
    # class_names = tree.classes_.astype(str).tolist()
    # dot_source = export_graphviz(tree, out_file=None,
    #                              feature_names=list(train_X.columns),
    #                              class_names=class_names,
    #                              filled=True, rounded=True,
    #                              special_characters=True)
    # graph = graphviz.Source(dot_source)
    # graph.render()
|
|
|
|
def main_random_forest():
    """Fit a five-tree random forest and report how it does on the test split.

    Prints the depth of the deepest tree in the forest, then the test
    accuracy, and finally shows the confusion matrix of the predictions.

    Returns:
        None. Results are printed and displayed with matplotlib.
    """
    x_train, y_train, x_test, y_test = load_data()

    model = RandomForestClassifier(n_estimators=5, random_state=42)
    model.fit(x_train, y_train)

    predicted = model.predict(x_test)
    score = accuracy_score(y_test, predicted)

    # Depth of the deepest individual tree in the ensemble.
    deepest = max(member.get_depth() for member in model.estimators_)
    print(deepest)
    print(score)

    matrix = confusion_matrix(y_test, predicted, labels=model.classes_)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix, display_labels=model.classes_
    ).plot()
    plt.show()

    # Optional: sweep over the number of trees and plot accuracy per size.
    # best_accuracy = 0
    # best_forest_size = 0
    #
    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    #     model = RandomForestClassifier(forest_size, random_state=42)
    #     model.fit(x_train, y_train)
    #
    #     predicted = model.predict(x_test)
    #     accuracy = accuracy_score(y_test, predicted)
    #     accuracies.append(accuracy)
    #     if accuracy > best_accuracy:
    #         best_accuracy = accuracy
    #         best_forest_size = forest_size
    #
    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()
|
|
|
|
|
|
# Run the experiment only when the file is executed as a script, so that
# importing the module does not train a model or open a plot window.
if __name__ == "__main__":
    main_random_forest()