"""Binary decision-tree classifier (Gini impurity) built on pandas (decision_tree.py)."""
import math
from dataclasses import dataclass
from typing import Any, Optional

import numpy as np
import pandas as pd

try:
    from sklearn.base import BaseEstimator, ClassifierMixin
    from sklearn.exceptions import NotFittedError
except ImportError:
    # scikit-learn only contributes estimator plumbing (get_params, score, ...).
    # The tree itself needs nothing from it, so fall back to empty stand-ins
    # and keep the module importable without the dependency.
    class BaseEstimator:
        """Stand-in used when scikit-learn is not installed."""

    class ClassifierMixin:
        """Stand-in used when scikit-learn is not installed."""

    class NotFittedError(ValueError, AttributeError):
        """Raised when the tree is used before ``fit`` (mirrors scikit-learn)."""


@dataclass
class Node:
    """A single tree node: either a leaf (``value`` set) or a branch."""

    # Leaf node: the predicted class label.
    value: Optional[Any] = None

    # Branching node: rows with ``row[feature] <= threshold`` go left.
    feature: Optional[str] = None
    threshold: Optional[Any] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    info_gain: Optional[float] = None

    @staticmethod
    def leaf(value: Any) -> "Node":
        """Create a leaf that predicts *value*."""
        return Node(value=value)

    @staticmethod
    def branch(feature: str, threshold: Any, info_gain: float,
               left: "Node", right: "Node") -> "Node":
        """Create a branching node that tests ``feature <= threshold``."""
        return Node(
            feature=feature,
            threshold=threshold,
            left=left,
            right=right,
            info_gain=info_gain,
        )


@dataclass
class SplitResult:
    """The best split found for a node, together with the partitioned data."""

    feature: str
    threshold: Any
    info_gain: float

    left_X: pd.DataFrame
    left_Y: pd.Series

    right_X: pd.DataFrame
    right_Y: pd.Series


# Mixins go left of BaseEstimator, as the scikit-learn developer guide requires.
class DecisionTree(ClassifierMixin, BaseEstimator):
    """Decision-tree classifier that greedily maximises Gini information gain.

    Args:
        min_samples_split: Minimum number of rows a node needs to be split.
        max_depth: Maximum depth of the tree (the root is at depth 0, so
            ``max_depth=0`` yields a single leaf).
        max_thresholds: Columns with more distinct values than this are not
            considered as split candidates (keeps training time bounded).
            ``None`` disables the limit.
    """

    root_node: Optional[Node]

    def __init__(self, min_samples_split=2, max_depth=10, max_thresholds=20) -> None:
        self.root_node = None

        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_thresholds = max_thresholds

    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth=0) -> Node:
        """Recursively build the subtree for the rows in *X* / *Y*."""
        num_samples = len(X)
        # Strict "<": a node sitting at depth == max_depth must stay a leaf,
        # otherwise the tree ends up one level deeper than requested.
        if num_samples >= self.min_samples_split and current_depth < self.max_depth:
            best_split = self.get_best_split(X, Y)

            if best_split is not None and best_split.info_gain > 0:
                left_node = self.build_tree(best_split.left_X, best_split.left_Y, current_depth + 1)
                right_node = self.build_tree(best_split.right_X, best_split.right_Y, current_depth + 1)
                return Node.branch(
                    best_split.feature,
                    best_split.threshold,
                    best_split.info_gain,
                    left_node,
                    right_node,
                )

        return Node.leaf(self.calculate_leaf_value(Y))

    def get_best_split(self, X: pd.DataFrame, Y: pd.Series) -> Optional[SplitResult]:
        """Return the split with the highest information gain, or ``None``.

        ``None`` is returned when no candidate separates the rows into two
        non-empty groups.
        """
        best_gain = -math.inf
        best_feature = None
        best_threshold = None

        for column_name in X:
            column = X[column_name]
            thresholds = column.unique()
            if self.max_thresholds is not None and len(thresholds) > self.max_thresholds:
                continue

            for threshold in thresholds:
                # Score candidates on the labels only; copying X for every
                # threshold is wasted work, it is split once for the winner.
                left_rows = column <= threshold
                left_count = int(left_rows.sum())
                if left_count == 0 or left_count == len(column):
                    continue

                info_gain = self.gini_information_gain(Y, Y[left_rows], Y[~left_rows])
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_feature = column_name
                    best_threshold = threshold

        if best_feature is None:
            return None

        left_X, left_Y, right_X, right_Y = self.split(X, Y, best_feature, best_threshold)
        return SplitResult(best_feature, best_threshold, best_gain,
                           left_X, left_Y, right_X, right_Y)

    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
        """Partition rows into ``column <= threshold`` (left) and the rest (right)."""
        left_rows = X[column_name] <= threshold
        right_rows = ~left_rows

        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]

    def gini_information_gain(self, Y, left_Y, right_Y):
        """Return the drop in Gini impurity achieved by splitting *Y*."""
        weight_left = len(left_Y) / len(Y)
        weight_right = len(right_Y) / len(Y)
        return (self.gini_index(Y)
                - weight_left * self.gini_index(left_Y)
                - weight_right * self.gini_index(right_Y))

    def gini_index(self, Y):
        """Return the Gini impurity of the labels in *Y* (0.0 when empty)."""
        total = len(Y)
        if total == 0:
            return 0.0
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / total
        return 1.0 - float(np.sum(probabilities * probabilities))

    def entropy(self, Y):
        """Return the Shannon entropy (bits) of the labels in *Y* (0.0 when empty)."""
        total = len(Y)
        if total == 0:
            return 0.0
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / total
        return 0.0 - float(np.sum(probabilities * np.log2(probabilities)))

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in *Y* (smallest one on ties)."""
        return Y.mode().iloc[0]

    def fit(self, X, Y):
        """Build the tree from the training data and return ``self``.

        Array-like inputs are accepted; they are wrapped so that rows of *X*
        and *Y* are matched by position.
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        if not isinstance(Y, pd.Series):
            Y = pd.Series(np.asarray(Y), index=X.index)

        self.classes_ = np.unique(Y)
        self.root_node = self.build_tree(X, Y)
        # Returning self follows the estimator contract and allows chaining.
        return self

    def predict(self, X: pd.DataFrame):
        """Return the predicted label for every row of *X*.

        Raises:
            NotFittedError: If ``fit`` has not been called yet.
        """
        if self.root_node is None:
            raise NotFittedError("This DecisionTree instance is not fitted yet; call fit() first.")
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        return X.apply(lambda row: self.make_prediction(row, self.root_node), axis=1)

    def make_prediction(self, x, tree: Node):
        """Walk from *tree* down to a leaf for the single row *x*."""
        node = tree
        # Identity check: a leaf may legitimately hold a falsy label such as 0.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def print(self, tree=None, indent=" "):
        """Print the (sub)tree; *indent* is the per-level indentation unit.

        Raises:
            NotFittedError: If no *tree* is given and the model is not fitted.
        """
        if tree is None:
            tree = self.root_node
        if tree is None:
            raise NotFittedError("This DecisionTree instance is not fitted yet; call fit() first.")

        self._print_node(tree, indent, indent)

    def _print_node(self, node: Node, prefix: str, step: str) -> None:
        """Print *node*, indenting its children by *prefix*."""
        if node.value is not None:
            print(node.value)
            return

        print(str(node.feature), "<=", node.threshold, "?", node.info_gain)
        # Grow the prefix by one step per level (linear, not doubling).
        print(f"{prefix}left: ", end="")
        self._print_node(node.left, prefix + step, step)
        print(f"{prefix}right: ", end="")
        self._print_node(node.right, prefix + step, step)

    def __sklearn_is_fitted__(self):
        """Tell scikit-learn's ``check_is_fitted`` whether ``fit`` has run."""
        return self.root_node is not None
"""Lab 2 experiments: scikit-learn decision tree and random forest on the apple data (lab2.py)."""
import time
from contextlib import contextmanager

import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz


def load_data(path="apple_quality_clean.csv", target="Quality", test_size=0.2, random_state=42):
    """Load the dataset and split it into train/test features and labels.

    Args:
        path: CSV file to read.
        target: Name of the label column.
        test_size: Fraction of rows held out for testing.
        random_state: Seed that makes the split reproducible.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``.
    """
    apples = pd.read_csv(path)

    train_apples, test_apples = train_test_split(
        apples, test_size=test_size, random_state=random_state
    )

    train_X, train_Y = train_apples.drop(columns=[target]), train_apples[target]
    test_X, test_Y = test_apples.drop(columns=[target]), test_apples[target]

    return train_X, train_Y, test_X, test_Y


def main_decision_tree():
    """Train trees of depth 1..18 and plot accuracy and training time per depth."""
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0

    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter is monotonic and high-resolution, unlike wall-clock
        # time.time(), so short training runs are measured reliably.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    # Accuracy summary
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)

    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")

    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")

    plt.show()

    # Disabled: confusion matrix of the last (deepest) tree.
    # cm = confusion_matrix(test_Y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Disabled: render the last tree with graphviz.
    # class_names = train_Y.unique().astype(str).tolist()
    # dot_source = export_graphviz(tree, out_file=None,
    #                              feature_names=train_X.columns,
    #                              class_names=class_names,
    #                              filled=True, rounded=True,
    #                              special_characters=True)
    # graph = graphviz.Source(dot_source)
    # graph.render()


def main_random_forest():
    """Train a 5-tree random forest, print its depth and accuracy, plot the confusion matrix."""
    train_X, train_Y, test_X, test_Y = load_data()

    forest = RandomForestClassifier(n_estimators=5, random_state=42)
    forest.fit(train_X, train_Y)

    y_prediction = forest.predict(test_X)
    accuracy = accuracy_score(test_Y, y_prediction)
    # Depth of the deepest tree in the ensemble.
    print(max(tree.get_depth() for tree in forest.estimators_))
    print(accuracy)

    cm = confusion_matrix(test_Y, y_prediction, labels=forest.classes_)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=forest.classes_)
    disp.plot()
    plt.show()

    # Disabled: accuracy as a function of the number of trees.
    # best_accuracy = 0
    # best_forest_size = 0
    #
    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    #     forest = RandomForestClassifier(forest_size, random_state=42)
    #     forest.fit(train_X, train_Y)
    #
    #     y_prediction = forest.predict(test_X)
    #     accuracy = accuracy_score(test_Y, y_prediction)
    #     accuracies.append(accuracy)
    #     if accuracy > best_accuracy:
    #         best_accuracy = accuracy
    #         best_forest_size = forest_size
    #
    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()


# Guarded so that importing this module does not train a model or open a plot.
if __name__ == "__main__":
    main_random_forest()