1
0

complete lab2

This commit is contained in:
Rokas Puzonas 2024-03-24 21:13:24 +02:00
parent bd67f94727
commit c86d4ca749
7 changed files with 298 additions and 3 deletions

3
.gitignore vendored
View File

@ -1 +1,2 @@
venv
venv
__pycache__

BIN
Ataskaita lab2.docx Normal file

Binary file not shown.

BIN
Sprendimų medis.pdf Normal file

Binary file not shown.

167
decision_tree.py Normal file
View File

@ -0,0 +1,167 @@
import pandas as pd
import numpy as np
import math
from dataclasses import dataclass
from typing import Optional
from sklearn.base import BaseEstimator, ClassifierMixin
@dataclass
class Node:
    """A node of the decision tree: either a leaf or a binary branch.

    A leaf only sets ``value``. A branch leaves ``value`` as ``None`` and
    sets the split description (``feature``, ``threshold``, ``info_gain``)
    together with its two children.
    """

    # Leaf only: the label predicted for samples that reach this node.
    value: Optional[float] = None
    # Branch only: column tested by this node.
    feature: Optional[str] = None
    # Branch only: samples with ``feature <= threshold`` go to ``left``.
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    # Branch only: information gain achieved by this split.
    info_gain: Optional[float] = None

    # The two constructors below take no ``self``; ``@staticmethod`` makes
    # them safe to call on an instance as well as on the class.
    @staticmethod
    def leaf(value: float) -> "Node":
        """Create a leaf node that predicts ``value``."""
        return Node(value=value)

    @staticmethod
    def branch(
        feature: str,
        threshold: float,
        info_gain: float,
        left: "Node",
        right: "Node",
    ) -> "Node":
        """Create a branching node that tests ``feature <= threshold``."""
        return Node(
            feature=feature,
            threshold=threshold,
            left=left,
            right=right,
            info_gain=info_gain,
        )
@dataclass
class SplitResult:
    """Outcome of splitting a dataset on one feature at one threshold.

    Field order matters: instances are also built positionally.
    """

    # Name of the column the data was split on.
    feature: str
    # Value the column was compared against.
    threshold: float
    # Information gain obtained by this split.
    info_gain: float
    # Features and labels of the left partition.
    left_X: pd.DataFrame
    left_Y: pd.Series
    # Features and labels of the right partition.
    right_X: pd.DataFrame
    right_Y: pd.Series
class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary decision-tree classifier that splits on Gini information gain.

    Every branch tests ``feature <= threshold``; candidate thresholds are the
    unique values of each column.

    Parameters:
        min_samples_split: minimum number of samples a node needs before a
            split is attempted.
        max_depth: maximum depth of the tree (the root is at depth 0, so no
            leaf ends up deeper than ``max_depth``).
        max_unique_values: columns with more unique values than this are not
            considered as split candidates; ``None`` disables the limit.
    """

    root_node: Optional[Node]

    def __init__(self, min_samples_split=2, max_depth=10, max_unique_values=20) -> None:
        self.root_node = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_unique_values = max_unique_values

    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth=0):
        """Recursively build the subtree for ``X``/``Y`` and return its root.

        A leaf holding the most frequent label is returned when the node is
        too small, the depth limit is reached, or no split has positive gain.
        """
        num_samples = len(X)
        # Strict comparison: a node sitting at ``max_depth`` must not split,
        # otherwise the tree would become one level deeper than requested.
        if num_samples >= self.min_samples_split and current_depth < self.max_depth:
            best_split = self.get_best_split(X, Y)
            if best_split is not None and best_split.info_gain > 0:
                left_node = self.build_tree(
                    best_split.left_X, best_split.left_Y, current_depth + 1
                )
                right_node = self.build_tree(
                    best_split.right_X, best_split.right_Y, current_depth + 1
                )
                return Node.branch(
                    best_split.feature,
                    best_split.threshold,
                    best_split.info_gain,
                    left_node,
                    right_node,
                )
        return Node.leaf(self.calculate_leaf_value(Y))

    def get_best_split(self, X: pd.DataFrame, Y: pd.Series) -> Optional[SplitResult]:
        """Return the split with the highest information gain, or ``None``.

        ``None`` is returned when no candidate produces two non-empty sides.
        Ties keep the first candidate found.
        """
        best_gain = -math.inf
        best_feature = None
        best_threshold = None

        for column_name in X:
            column = X[column_name]
            thresholds = column.unique()
            if self.max_unique_values is not None and len(thresholds) > self.max_unique_values:
                continue
            for threshold in thresholds:
                # Evaluate the candidate on the labels only; the feature
                # frames are materialised once, for the winning split.
                left_rows = column <= threshold
                left_count = int(left_rows.sum())
                if left_count == 0 or left_count == len(column):
                    continue
                info_gain = self.gini_information_gain(Y, Y[left_rows], Y[~left_rows])
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_feature = column_name
                    best_threshold = threshold

        if best_feature is None:
            return None

        left_X, left_Y, right_X, right_Y = self.split(X, Y, best_feature, best_threshold)
        return SplitResult(
            best_feature, best_threshold, best_gain, left_X, left_Y, right_X, right_Y
        )

    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
        """Partition ``X``/``Y`` into rows with ``column <= threshold`` and the rest.

        Returns ``(left_X, left_Y, right_X, right_Y)``.
        """
        left_rows = X[column_name] <= threshold
        right_rows = ~left_rows
        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]

    def gini_information_gain(self, Y, left_Y, right_Y):
        """Return the Gini impurity of ``Y`` minus the weighted impurity of its parts."""
        weight_left = len(left_Y) / len(Y)
        weight_right = len(right_Y) / len(Y)
        return (
            self.gini_index(Y)
            - weight_left * self.gini_index(left_Y)
            - weight_right * self.gini_index(right_Y)
        )

    def gini_index(self, Y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels in ``Y``."""
        if len(Y) == 0:
            # No labels means an empty sum of squared probabilities.
            return 1
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / len(Y)
        return 1 - np.sum(probabilities * probabilities)

    def entropy(self, Y):
        """Return the Shannon entropy (base 2) of the labels in ``Y``."""
        if len(Y) == 0:
            return 0
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / len(Y)
        return -np.sum(probabilities * np.log2(probabilities))

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in ``Y`` (first one on ties)."""
        return Y.mode()[0]

    def fit(self, X, Y):
        """Build the tree from training data and return ``self``.

        Returning the estimator follows the scikit-learn ``fit`` contract, so
        calls such as ``DecisionTree().fit(X, Y).predict(X)`` work.
        """
        self.classes_ = np.unique(Y)
        self.root_node = self.build_tree(X, Y)
        return self

    def predict(self, X: pd.DataFrame):
        """Predict a label for every row of ``X``.

        Raises:
            sklearn.exceptions.NotFittedError: if ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self)
        return X.apply(lambda row: self.make_prediction(row, self.root_node), axis=1)

    def make_prediction(self, x, tree: Node):
        """Walk from ``tree`` down to a leaf for sample ``x`` and return its value."""
        node = tree
        while node.value is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def print(self, tree=None, indent=" "):
        """Print the (sub)tree rooted at ``tree``; defaults to the whole tree.

        ``indent`` is the indentation added per nesting level.

        Raises:
            sklearn.exceptions.NotFittedError: if no ``tree`` is given and
                ``fit`` has not been called.
        """
        if tree is None:
            from sklearn.utils.validation import check_is_fitted

            check_is_fitted(self)
            tree = self.root_node
        self._print_node(tree, indent, indent)

    def _print_node(self, node: Node, prefix: str, step: str):
        """Print ``node``; its children are prefixed with ``prefix``."""
        if node.value is not None:
            print(node.value)
            return
        print(str(node.feature), "<=", node.threshold, "?", node.info_gain)
        # Grow the prefix by one ``step`` per level; doubling it would make
        # the indentation exponential in the depth.
        print("%sleft: " % (prefix), end="")
        self._print_node(node.left, prefix + step, step)
        print("%sright: " % (prefix), end="")
        self._print_node(node.right, prefix + step, step)

    def __sklearn_is_fitted__(self):
        """Tell scikit-learn's ``check_is_fitted`` whether a tree has been built."""
        return self.root_node is not None

13
lab1.py
View File

@ -177,7 +177,7 @@ if False:
plt.show()
# 8.
if True:
if False:
#sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
plt.show()
@ -197,4 +197,13 @@ if False:
normalized_apples["Quality"] = apples["Quality"]
normalized_apples.to_csv()
print(normalized_apples)
print(normalized_apples)
bin_count = int(1 + 3.22 * math.log(len(apples)))
column_name = "Ripeness"
# apples[column_name][0] = 20
# apples[column_name][1] = 22
# apples[column_name][2] = 19
apples[column_name].plot.box()
plt.title(column_name)
plt.show()

118
lab2.py Normal file
View File

@ -0,0 +1,118 @@
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
from contextlib import contextmanager
import time
from sklearn.tree import export_graphviz
import graphviz
from sklearn.tree import DecisionTreeClassifier
def load_data(path="apple_quality_clean.csv", target="Quality", test_size=0.2, random_state=42):
    """Load the dataset and split it into training and test features/labels.

    Parameters:
        path: CSV file to read.
        target: name of the label column; every other column is a feature.
        test_size: share of the rows put into the test set.
        random_state: seed passed to ``train_test_split`` for a repeatable split.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``.
    """
    apples = pd.read_csv(path)
    train_apples, test_apples = train_test_split(
        apples, test_size=test_size, random_state=random_state
    )
    # print(f"Training size: {len(train_apples)}")
    # print(f"Test size: {len(test_apples)}")
    train_apples_x, train_apples_y = train_apples.drop(columns=[target]), train_apples[target]
    test_apples_x, test_apples_y = test_apples.drop(columns=[target]), test_apples[target]
    return train_apples_x, train_apples_y, test_apples_x, test_apples_y
def main_decision_tree():
    """Train decision trees of increasing depth and plot accuracy and fit time.

    For every ``max_depth`` from 1 to 18 a ``DecisionTreeClassifier`` is
    fitted on the training split and scored on the test split. The last and
    the best accuracy are printed, then accuracy and training time are
    plotted against depth.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0
    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter is monotonic and high-resolution, so the measured
        # duration is not affected by system clock adjustments.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)
        # Strict comparison keeps the shallowest depth among equal accuracies.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    # Accuracy plot
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)
    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")
    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")
    plt.show()

    # NOTE(review): the disabled experiments below reference `test_apples_y`
    # and `apples`, which are not defined in this function; they need
    # adapting before being re-enabled.

    # Confusion matrix
    # cm = confusion_matrix(test_apples_y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Visualize tree
    # class_names = apples["Quality"].unique().astype(str).tolist()
    # failas = export_graphviz(tree, out_file=None,
    # feature_names=apples.drop(columns=["Quality"]).columns,
    # class_names=class_names,
    # filled=True, rounded=True,
    # special_characters=True)
    # graph = graphviz.Source(failas)
    # graph.render()
def main_random_forest():
    """Fit a 5-tree random forest, report its depth and accuracy, and plot
    the confusion matrix on the test split."""
    train_X, train_Y, test_X, test_Y = load_data()

    model = RandomForestClassifier(n_estimators=5, random_state=42)
    model.fit(train_X, train_Y)
    predicted = model.predict(test_X)

    # Depth of the deepest tree in the ensemble, then the test accuracy.
    tree_depths = [estimator.get_depth() for estimator in model.estimators_]
    print(max(tree_depths))
    print(accuracy_score(test_Y, predicted))

    matrix = confusion_matrix(test_Y, predicted, labels=model.classes_)
    ConfusionMatrixDisplay(
        confusion_matrix=matrix,
        display_labels=model.classes_,
    ).plot()
    plt.show()

    # Disabled experiment: accuracy as a function of forest size.
    # best_accuracy = 0
    # best_forest_size = 0
    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    # forest = RandomForestClassifier(forest_size, random_state=42)
    # forest.fit(train_X, train_Y)
    # y_prediction = forest.predict(test_X)
    # accuracy = accuracy_score(test_Y, y_prediction)
    # accuracies.append(accuracy)
    # if accuracy > best_accuracy:
    # best_accuracy = accuracy
    # best_forest_size = forest_size
    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()
# Script entry point: runs the random-forest experiment when the module is
# executed. NOTE(review): this call also runs on import; consider guarding it
# with `if __name__ == "__main__":` if the module is ever imported elsewhere.
main_random_forest()

Binary file not shown.