complete lab2
This commit is contained in:
parent
bd67f94727
commit
c86d4ca749
3
.gitignore
vendored
3
.gitignore
vendored
@ -1 +1,2 @@
|
|||||||
venv
|
venv
|
||||||
|
__pycache__
|
BIN
Ataskaita lab2.docx
Normal file
BIN
Ataskaita lab2.docx
Normal file
Binary file not shown.
BIN
Sprendimų medis.pdf
Normal file
BIN
Sprendimų medis.pdf
Normal file
Binary file not shown.
167
decision_tree.py
Normal file
167
decision_tree.py
Normal file
@ -0,0 +1,167 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import numpy as np
|
||||||
|
import math
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
from sklearn.base import BaseEstimator, ClassifierMixin
|
||||||
|
|
||||||
|
@dataclass
class Node:
    """A single node of the decision tree.

    A node is either a leaf, in which case only ``value`` is set, or a
    branching node, in which case ``feature``, ``threshold``, ``left``,
    ``right`` and ``info_gain`` are set and ``value`` stays ``None``.
    Use :meth:`leaf` and :meth:`branch` to build the two variants.
    """

    # If leaf node: the value predicted for samples that reach this node.
    value: Optional[float] = None

    # If branching node: samples with ``sample[feature] <= threshold`` go to
    # ``left``, all others go to ``right``.
    feature: Optional[str] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    # Information gain that was achieved by this split.
    info_gain: Optional[float] = None

    # Declared as static methods so that they can be called both on the class
    # (``Node.leaf(1)``) and on an instance without ``self`` being passed in
    # as the first argument.
    @staticmethod
    def leaf(value: float) -> "Node":
        """Create a leaf node that predicts ``value``."""
        return Node(value=value)

    @staticmethod
    def branch(feature: str, threshold: float, info_gain: float, left: "Node", right: "Node") -> "Node":
        """Create a branching node that splits on ``feature <= threshold``.

        Args:
            feature: Name of the column the split tests.
            threshold: Samples with a value less than or equal to this go left.
            info_gain: Information gain of the split.
            left: Subtree for samples that satisfy the test.
            right: Subtree for samples that do not satisfy the test.
        """
        return Node(
            feature=feature,
            threshold=threshold,
            left=left,
            right=right,
            info_gain=info_gain,
        )
|
||||||
|
|
||||||
|
@dataclass
class SplitResult:
    """Outcome of splitting a data set on ``feature <= threshold``.

    Bundles the description of the split with the two partitions of the data
    that it produces, so the partitions do not have to be recomputed.
    """

    # Description of the split itself.
    feature: str
    threshold: float
    info_gain: float

    # Rows (and their labels) that satisfy ``feature <= threshold``.
    left_X: pd.DataFrame
    left_Y: pd.Series

    # Remaining rows (and their labels).
    right_X: pd.DataFrame
    right_Y: pd.Series
|
||||||
|
|
||||||
|
class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary decision tree classifier that splits on Gini information gain.

    Every branching node tests ``feature <= threshold``. Candidate thresholds
    are the unique values of each feature column.

    Args:
        min_samples_split: Minimum number of samples a node must hold to be
            considered for splitting.
        max_depth: Maximum depth of the tree; the root is at depth 0, so no
            leaf ends up deeper than ``max_depth``.
        max_thresholds: Features with more unique values than this are not
            considered for splitting (keeps the exhaustive search affordable).
            ``None`` disables the limit.
    """

    root_node: Optional[Node]

    def __init__(self, min_samples_split=2, max_depth=10, max_thresholds=20) -> None:
        self.root_node = None

        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_thresholds = max_thresholds

    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth=0):
        """Recursively build the (sub)tree for the samples ``X`` / ``Y``.

        Returns:
            A branching ``Node`` when a split with positive information gain
            exists and the stopping criteria allow it, otherwise a leaf that
            predicts the most frequent label in ``Y``.
        """
        num_samples = np.shape(X)[0]
        # Strict comparison: a node at depth ``max_depth`` must be a leaf,
        # otherwise the tree would grow one level deeper than requested.
        if num_samples >= self.min_samples_split and current_depth < self.max_depth:
            best_split = self.get_best_split(X, Y)

            if best_split is not None and best_split.info_gain > 0:
                left_node = self.build_tree(best_split.left_X, best_split.left_Y, current_depth + 1)
                right_node = self.build_tree(best_split.right_X, best_split.right_Y, current_depth + 1)
                return Node.branch(
                    best_split.feature,
                    best_split.threshold,
                    best_split.info_gain,
                    left_node,
                    right_node,
                )

        return Node.leaf(self.calculate_leaf_value(Y))

    def get_best_split(self, X: pd.DataFrame, Y: pd.Series):
        """Find the split with the highest Gini information gain.

        Returns:
            The best ``SplitResult``, or ``None`` when no candidate threshold
            produces two non-empty partitions.
        """
        best_split = None
        best_gain = -math.inf

        for column_name in X:
            thresholds = X[column_name].unique()  # TODO: Should probably be cached
            if self.max_thresholds is not None and len(thresholds) > self.max_thresholds:
                continue

            for threshold in thresholds:
                left_X, left_Y, right_X, right_Y = self.split(X, Y, column_name, threshold)
                # A split that leaves one side empty separates nothing.
                if left_X.empty or right_X.empty:
                    continue

                info_gain = self.gini_information_gain(Y, left_Y, right_Y)
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_split = SplitResult(
                        column_name,
                        threshold,
                        info_gain,
                        left_X,
                        left_Y,
                        right_X,
                        right_Y,
                    )

        return best_split

    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
        """Partition ``X`` / ``Y`` by the test ``X[column_name] <= threshold``.

        Returns:
            A tuple ``(left_X, left_Y, right_X, right_Y)`` where the left
            partition satisfies the test and the right one does not.
        """
        left_rows = X[column_name] <= threshold
        right_rows = ~left_rows

        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]

    def gini_information_gain(self, Y, left_Y, right_Y):
        """Return the Gini impurity of ``Y`` minus the weighted impurity of its two parts."""
        weight_left = len(left_Y) / len(Y)
        weight_right = len(right_Y) / len(Y)
        # Entropy-based alternative:
        # return self.entropy(Y) - weight_left*self.entropy(left_Y) - weight_right*self.entropy(right_Y)
        return (
            self.gini_index(Y)
            - weight_left * self.gini_index(left_Y)
            - weight_right * self.gini_index(right_Y)
        )

    def _class_probabilities(self, Y):
        """Return the relative frequency of every distinct label in ``Y``."""
        _, counts = np.unique(Y, return_counts=True)
        return counts / len(Y)

    def gini_index(self, Y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels ``Y``."""
        probabilities = self._class_probabilities(Y)
        return 1 - np.sum(probabilities * probabilities)

    def entropy(self, Y):
        """Return the Shannon entropy (in bits) of the labels ``Y``."""
        probabilities = self._class_probabilities(Y)
        # Subtracting from 0.0 (instead of negating) avoids returning -0.0.
        return 0.0 - np.sum(probabilities * np.log2(probabilities))

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in ``Y`` (first one on ties)."""
        return Y.mode()[0]

    def fit(self, X, Y):
        """Build the tree from training samples ``X`` and labels ``Y``.

        Returns:
            ``self``, as the scikit-learn estimator API expects, so that the
            estimator can be chained and used inside pipelines.
        """
        self.root_node = self.build_tree(X, Y)
        return self

    def predict(self, X: pd.DataFrame):
        """Predict a label for every row of ``X`` by walking the fitted tree."""
        return X.apply(lambda x: self.make_prediction(x, self.root_node), axis=1)

    def make_prediction(self, x, tree: Node):
        """Predict the label of the single sample ``x`` using the subtree ``tree``."""
        if tree.value is not None:
            return tree.value

        if x[tree.feature] <= tree.threshold:
            return self.make_prediction(x, tree.left)
        return self.make_prediction(x, tree.right)

    def print(self, tree=None, indent=" "):
        """Print the (sub)tree to standard output.

        Args:
            tree: Subtree to print; defaults to the root of the fitted tree.
            indent: Prefix for the children of ``tree``; it is doubled on
                every level of recursion.
        """
        if tree is None:
            tree = self.root_node

        if tree.value is not None:
            print(tree.value)
        else:
            print(str(tree.feature), "<=", tree.threshold, "?", tree.info_gain)
            print(f"{indent}left: ", end="")
            self.print(tree.left, indent + indent)
            print(f"{indent}right: ", end="")
            self.print(tree.right, indent + indent)

    def __sklearn_is_fitted__(self):
        """Report whether ``fit`` has produced a tree."""
        return self.root_node is not None
|
13
lab1.py
13
lab1.py
@ -177,7 +177,7 @@ if False:
|
|||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
# 8.
|
# 8.
|
||||||
if True:
|
if False:
|
||||||
#sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
|
#sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
|
||||||
sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
|
sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
|
||||||
plt.show()
|
plt.show()
|
||||||
@ -197,4 +197,13 @@ if False:
|
|||||||
normalized_apples["Quality"] = apples["Quality"]
|
normalized_apples["Quality"] = apples["Quality"]
|
||||||
|
|
||||||
normalized_apples.to_csv()
|
normalized_apples.to_csv()
|
||||||
print(normalized_apples)
|
print(normalized_apples)
|
||||||
|
|
||||||
|
# Sturges' rule for the number of histogram bins:
#   k = 1 + log2(n) = 1 + 3.322 * log10(n)
# The coefficient belongs to the base-10 logarithm, not the natural one.
# NOTE(review): bin_count is not used by the statements below.
bin_count = int(1 + 3.322 * math.log10(len(apples)))
column_name = "Ripeness"
# Disabled assignments that overwrite the first three values of the column
# (presumably to show how outliers appear in the box plot - confirm).
# apples[column_name][0] = 20
# apples[column_name][1] = 22
# apples[column_name][2] = 19
apples[column_name].plot.box()
plt.title(column_name)
plt.show()
|
118
lab2.py
Normal file
118
lab2.py
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
import pandas as pd
|
||||||
|
import matplotlib.pyplot as plt
|
||||||
|
import numpy as np
|
||||||
|
from sklearn.ensemble import RandomForestClassifier
|
||||||
|
from sklearn.model_selection import train_test_split
|
||||||
|
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
|
||||||
|
from contextlib import contextmanager
|
||||||
|
import time
|
||||||
|
from sklearn.tree import export_graphviz
|
||||||
|
import graphviz
|
||||||
|
|
||||||
|
from sklearn.tree import DecisionTreeClassifier
|
||||||
|
|
||||||
|
def load_data(path="apple_quality_clean.csv", test_size=0.2, random_state=42):
    """Load the apple data set and split it into training and test parts.

    Args:
        path: CSV file to read; it must contain a ``Quality`` column, which
            is used as the target.
        test_size: Passed to ``train_test_split`` as the share of the test set.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(train_x, train_y, test_x, test_y)`` where the ``*_x``
        values are the data without the ``Quality`` column and the ``*_y``
        values are the ``Quality`` column itself.
    """
    apples = pd.read_csv(path)

    train_apples, test_apples = train_test_split(apples, test_size=test_size, random_state=random_state)
    # print(f"Training size: {len(train_apples)}")
    # print(f"Test size: {len(test_apples)}")

    train_apples_x, train_apples_y = train_apples.drop(columns=["Quality"]), train_apples["Quality"]
    test_apples_x, test_apples_y = test_apples.drop(columns=["Quality"]), test_apples["Quality"]

    return train_apples_x, train_apples_y, test_apples_x, test_apples_y
|
||||||
|
|
||||||
|
def main_decision_tree():
    """Train decision trees of increasing maximum depth and plot the results.

    For every depth from 1 to 18 a ``DecisionTreeClassifier`` is fitted on the
    training data; its accuracy on the test data and its training time are
    recorded, the last and the best accuracy are printed, and both series are
    plotted against the depth.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0

    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter() is the clock intended for measuring short durations:
        # it is monotonic and uses the highest available resolution.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)

        # Strict comparison keeps the shallowest depth among equal accuracies.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    # Accuracy plot
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)

    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")

    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")

    plt.show()

    # NOTE(review): the disabled snippets below refer to `test_apples_y` and
    # `apples`, which are not defined in this function; they need adapting
    # (e.g. to `test_Y`) before they can be re-enabled.

    # Confusion matrix
    # cm = confusion_matrix(test_apples_y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Visualize tree
    # class_names = apples["Quality"].unique().astype(str).tolist()
    # failas = export_graphviz(tree, out_file=None,
    # feature_names=apples.drop(columns=["Quality"]).columns,
    # class_names=class_names,
    # filled=True, rounded=True,
    # special_characters=True)
    # graph = graphviz.Source(failas)
    # graph.render()
|
||||||
|
|
||||||
|
def main_random_forest():
    """Train a five-tree random forest and report how it does on the test set.

    Prints the depth of the deepest tree in the forest and the test accuracy,
    then shows the confusion matrix of the test predictions.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    model = RandomForestClassifier(n_estimators=5, random_state=42)
    model.fit(train_X, train_Y)

    predicted = model.predict(test_X)
    score = accuracy_score(test_Y, predicted)
    deepest = max(estimator.get_depth() for estimator in model.estimators_)
    print(deepest)
    print(score)

    matrix = confusion_matrix(test_Y, predicted, labels=model.classes_)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=model.classes_)
    display.plot()
    plt.show()

    # Disabled experiment: accuracy as a function of the number of trees.
    # best_accuracy = 0
    # best_forest_size = 0

    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    # forest = RandomForestClassifier(forest_size, random_state=42)
    # forest.fit(train_X, train_Y)

    # y_prediction = forest.predict(test_X)
    # accuracy = accuracy_score(test_Y, y_prediction)
    # accuracies.append(accuracy)
    # if accuracy > best_accuracy:
    # best_accuracy = accuracy
    # best_forest_size = forest_size

    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: runs the random-forest experiment. There is no
# `if __name__ == "__main__":` guard, so this also runs when the module is imported.
main_random_forest()
|
BIN
requirements.txt
BIN
requirements.txt
Binary file not shown.
Loading…
Reference in New Issue
Block a user