complete lab2
This commit is contained in:
parent
bd67f94727
commit
c86d4ca749
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
||||
venv
|
||||
venv
|
||||
__pycache__
|
BIN
Ataskaita lab2.docx
Normal file
BIN
Ataskaita lab2.docx
Normal file
Binary file not shown.
BIN
Sprendimų medis.pdf
Normal file
BIN
Sprendimų medis.pdf
Normal file
Binary file not shown.
167
decision_tree.py
Normal file
167
decision_tree.py
Normal file
@ -0,0 +1,167 @@
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||
|
||||
@dataclass
class Node:
    """A single node of the decision tree.

    A node is either a leaf or a branching node:

    * leaf   -- only ``value`` is set; it is the prediction returned for
      every sample that reaches this node.
    * branch -- ``value`` stays ``None`` and ``feature``/``threshold``
      describe the test, ``left``/``right`` are the child nodes and
      ``info_gain`` records the gain of the split that created them.

    Use the ``leaf`` / ``branch`` factories instead of filling the fields
    by hand so a node never ends up half leaf, half branch.
    """

    # If leaf node
    value: Optional[float] = None

    # If branching node
    feature: Optional[str] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    info_gain: Optional[float] = None

    # The factories take no ``self``; marking them as static methods makes
    # them callable both as ``Node.leaf(...)`` and on an existing instance.
    @staticmethod
    def leaf(value: float) -> "Node":
        """Build a leaf node that predicts ``value``."""
        return Node(value=value)

    @staticmethod
    def branch(feature: str, threshold: float, info_gain: float, left: "Node", right: "Node") -> "Node":
        """Build a branching node.

        Args:
            feature: Name of the column the node tests.
            threshold: Value the column is compared against.
            info_gain: Information gain achieved by this split.
            left: Child node for the left side of the split.
            right: Child node for the right side of the split.
        """
        return Node(
            feature=feature,
            threshold=threshold,
            left=left,
            right=right,
            info_gain=info_gain,
        )
|
||||
|
||||
@dataclass
class SplitResult:
    """Outcome of partitioning a dataset on one feature at one threshold.

    Carries the split description together with both halves of the
    partitioned data so the caller can keep working on them without
    splitting again.
    """

    # --- description of the split ---
    feature: str        # name of the column that was tested
    threshold: float    # value the column was compared against
    info_gain: float    # gain obtained by this split

    # --- left partition: features and matching labels ---
    left_X: pd.DataFrame
    left_Y: pd.Series

    # --- right partition: features and matching labels ---
    right_X: pd.DataFrame
    right_Y: pd.Series
|
||||
|
||||
class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary decision-tree classifier that splits on Gini information gain.

    Every branching node tests ``feature <= threshold``; rows that satisfy
    the test go to the left child, all others to the right child. Leaves
    predict the most frequent label of the training rows that reached them.

    Args:
        min_samples_split: Minimum number of rows a node must hold to be
            considered for splitting.
        max_depth: Maximum depth of the tree (the root is at depth 0).
    """

    root_node: Optional[Node]

    # Columns with more distinct values than this are skipped when looking
    # for a split (presumably to bound the exhaustive threshold search).
    MAX_UNIQUE_THRESHOLDS = 20

    def __init__(self, min_samples_split=2, max_depth=10) -> None:
        self.root_node = None

        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth=0):
        """Recursively build the subtree for the rows in ``X``/``Y``.

        Args:
            X: Feature rows that reached this node.
            Y: Labels matching ``X``.
            current_depth: Depth of the node being built (root is 0).

        Returns:
            A branch ``Node`` when a split with positive gain exists and
            the stopping criteria allow it, otherwise a leaf ``Node``.
        """
        num_samples = np.shape(X)[0]
        # A node sitting at depth == max_depth must stay a leaf; splitting
        # it would create children at depth max_depth + 1.
        may_split = (
            num_samples >= self.min_samples_split
            and current_depth < self.max_depth
        )

        if may_split:
            best_split = self.get_best_split(X, Y)

            # A gain of zero means the split does not separate the classes
            # any better than the parent, so it is not worth a branch.
            if best_split is not None and best_split.info_gain > 0:
                left_node = self.build_tree(best_split.left_X, best_split.left_Y, current_depth + 1)
                right_node = self.build_tree(best_split.right_X, best_split.right_Y, current_depth + 1)
                return Node.branch(
                    best_split.feature,
                    best_split.threshold,
                    best_split.info_gain,
                    left_node,
                    right_node,
                )

        return Node.leaf(self.calculate_leaf_value(Y))

    def get_best_split(self, X: pd.DataFrame, Y: pd.Series):
        """Find the (feature, threshold) pair with the highest Gini gain.

        Every distinct value of every eligible column is tried as a
        threshold. Splits that leave one side empty are ignored.

        Returns:
            The best ``SplitResult``, or ``None`` when no valid split
            exists.
        """
        best_split = None
        best_gain = -math.inf

        for column_name in X:
            thresholds = X[column_name].unique()  # TODO: Should probably be cached
            if len(thresholds) > self.MAX_UNIQUE_THRESHOLDS:
                continue

            for threshold in thresholds:
                left_X, left_Y, right_X, right_Y = self.split(X, Y, column_name, threshold)
                if left_X.empty or right_X.empty:
                    continue

                info_gain = self.gini_information_gain(Y, left_Y, right_Y)
                # Strict '>' keeps the first candidate on ties.
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_split = SplitResult(
                        column_name,
                        threshold,
                        info_gain,
                        left_X,
                        left_Y,
                        right_X,
                        right_Y,
                    )

        return best_split

    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
        """Partition ``X``/``Y`` by ``X[column_name] <= threshold``.

        Returns:
            ``(left_X, left_Y, right_X, right_Y)`` where the left part
            holds the rows satisfying the test and the right part the rest.
        """
        left_rows = X[column_name] <= threshold
        right_rows = ~left_rows

        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]

    def gini_information_gain(self, Y, left_Y, right_Y):
        """Return the drop in Gini impurity achieved by a split.

        The children's impurities are weighted by their share of the
        parent's rows.
        """
        weight_left = len(left_Y) / len(Y)
        weight_right = len(right_Y) / len(Y)
        # To use entropy-based gain instead, replace gini_index with entropy.
        return (
            self.gini_index(Y)
            - weight_left * self.gini_index(left_Y)
            - weight_right * self.gini_index(right_Y)
        )

    def _class_probabilities(self, Y):
        """Return the relative frequency of each distinct label in ``Y``."""
        _, counts = np.unique(Y, return_counts=True)
        return counts / len(Y)

    def gini_index(self, Y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels."""
        probabilities = self._class_probabilities(Y)
        return 1 - np.sum(probabilities * probabilities)

    def entropy(self, Y):
        """Return the Shannon entropy (base 2) of the labels."""
        probabilities = self._class_probabilities(Y)
        # Only labels that occur are counted, so log2 never sees zero.
        return np.sum(-probabilities * np.log2(probabilities))

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in ``Y``."""
        return Y.mode()[0]

    def fit(self, X, Y):
        """Build the tree from the training data.

        Returns:
            ``self``, so that calls can be chained as scikit-learn
            estimators are expected to allow.
        """
        self.root_node = self.build_tree(X, Y)
        return self

    def predict(self, X: pd.DataFrame):
        """Predict a label for every row of ``X`` by walking the tree."""
        return X.apply(lambda x: self.make_prediction(x, self.root_node), axis=1)

    def make_prediction(self, x, tree: Node):
        """Walk from ``tree`` down to a leaf for the row ``x``.

        Returns:
            The ``value`` stored in the leaf that ``x`` ends up in.
        """
        node = tree
        # 'is None' rather than '!= None': a leaf value of 0 is still a leaf.
        while node.value is None:
            if x[node.feature] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return node.value

    def print(self, tree=None, indent=" "):
        """Print the subtree rooted at ``tree`` (default: the whole tree).

        Leaves print their value; branches print the test and gain, then
        recurse into both children with the indent doubled.
        """
        if tree is None:
            tree = self.root_node

        if tree.value is not None:
            print(tree.value)
            return

        print(str(tree.feature), "<=", tree.threshold, "?", tree.info_gain)
        print(f"{indent}left: ", end="")
        self.print(tree.left, indent + indent)
        print(f"{indent}right: ", end="")
        self.print(tree.right, indent + indent)

    def __sklearn_is_fitted__(self):
        """Report whether a tree has been built."""
        return self.root_node is not None
|
13
lab1.py
13
lab1.py
@ -177,7 +177,7 @@ if False:
|
||||
plt.show()
|
||||
|
||||
# 8.
|
||||
if True:
|
||||
if False:
|
||||
#sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
|
||||
sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
|
||||
plt.show()
|
||||
@ -197,4 +197,13 @@ if False:
|
||||
normalized_apples["Quality"] = apples["Quality"]
|
||||
|
||||
normalized_apples.to_csv()
|
||||
print(normalized_apples)
|
||||
print(normalized_apples)
|
||||
|
||||
bin_count = int(1 + 3.22 * math.log(len(apples)))
|
||||
column_name = "Ripeness"
|
||||
# apples[column_name][0] = 20
|
||||
# apples[column_name][1] = 22
|
||||
# apples[column_name][2] = 19
|
||||
apples[column_name].plot.box()
|
||||
plt.title(column_name)
|
||||
plt.show()
|
118
lab2.py
Normal file
118
lab2.py
Normal file
@ -0,0 +1,118 @@
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import train_test_split
|
||||
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
|
||||
from contextlib import contextmanager
|
||||
import time
|
||||
from sklearn.tree import export_graphviz
|
||||
import graphviz
|
||||
|
||||
from sklearn.tree import DecisionTreeClassifier
|
||||
|
||||
def load_data(path="apple_quality_clean.csv", target="Quality", test_size=0.2, random_state=42):
    """Load the dataset and split it into train/test features and labels.

    Called without arguments it behaves as before: it reads
    ``apple_quality_clean.csv``, holds out 20% of the rows for testing with
    a fixed seed and uses the ``Quality`` column as the label.

    Args:
        path: CSV file to read.
        target: Name of the label column.
        test_size: Share of rows reserved for the test set.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``.
    """
    apples = pd.read_csv(path)

    train_apples, test_apples = train_test_split(
        apples, test_size=test_size, random_state=random_state
    )

    train_apples_x, train_apples_y = train_apples.drop(columns=[target]), train_apples[target]
    test_apples_x, test_apples_y = test_apples.drop(columns=[target]), test_apples[target]

    return train_apples_x, train_apples_y, test_apples_x, test_apples_y
|
||||
|
||||
def main_decision_tree():
    """Sweep ``max_depth`` for a decision tree and plot accuracy and fit time.

    For each depth from 1 to 18 a ``DecisionTreeClassifier`` is trained on
    the training split and scored on the test split. The last and the best
    accuracy are printed, then accuracy and training time are plotted
    against the depth.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0

    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter is monotonic and high-resolution; time.time() is
        # wall-clock and may jump, which distorts short durations.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)

        # Strict '>' keeps the shallowest depth among equally good ones.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    # Accuracy plot
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)

    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")

    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")

    plt.show()

    # Confusion matrix of the last tree trained above (disabled).
    # cm = confusion_matrix(test_Y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Visualize the last tree trained above with graphviz (disabled).
    # class_names = tree.classes_.astype(str).tolist()
    # failas = export_graphviz(tree, out_file=None,
    #                          feature_names=train_X.columns,
    #                          class_names=class_names,
    #                          filled=True, rounded=True,
    #                          special_characters=True)
    # graph = graphviz.Source(failas)
    # graph.render()
|
||||
|
||||
def main_random_forest():
    """Train a five-tree random forest and report how it does on the test set.

    Prints the depth of the deepest tree in the forest and the test
    accuracy, then shows the confusion matrix of the test predictions.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    forest = RandomForestClassifier(n_estimators=5, random_state=42)
    forest.fit(train_X, train_Y)

    predicted = forest.predict(test_X)
    score = accuracy_score(test_Y, predicted)

    deepest = max(estimator.get_depth() for estimator in forest.estimators_)
    print(deepest)
    print(score)

    labels = forest.classes_
    matrix = confusion_matrix(test_Y, predicted, labels=labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot()
    plt.show()

    # Disabled experiment: sweep the forest size and plot accuracy against it.
    # best_accuracy = 0
    # best_forest_size = 0
    #
    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    #     forest = RandomForestClassifier(forest_size, random_state=42)
    #     forest.fit(train_X, train_Y)
    #
    #     y_prediction = forest.predict(test_X)
    #     accuracy = accuracy_score(test_Y, y_prediction)
    #     accuracies.append(accuracy)
    #     if accuracy > best_accuracy:
    #         best_accuracy = accuracy
    #         best_forest_size = forest_size
    #
    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()
|
||||
|
||||
|
||||
# Run the experiment only when executed as a script, so importing this
# module (e.g. to reuse load_data) does not train a model or open a plot.
if __name__ == "__main__":
    main_random_forest()
|
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
Loading…
Reference in New Issue
Block a user