complete lab2

2024-03-24 21:13:24 +02:00 · 2024-03-24 21:13:24 +02:00 · c86d4ca749
commit c86d4ca749
parent bd67f94727
7 changed files with 298 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-venv
+venv
+__pycache__
--- a/lab2.docx
+++ b/lab2.docx
--- a/medis.pdf
+++ b/medis.pdf
--- a/decision_tree.py
+++ b/decision_tree.py
@ -0,0 +1,167 @@
+import pandas as pd
+import numpy as np
+import math
+from dataclasses import dataclass
+from typing import Optional
+from sklearn.base import BaseEstimator, ClassifierMixin
+
+@dataclass
+class Node:
+    """One node of the decision tree: either a leaf or a branching node.
+
+    A leaf only carries ``value``. A branching node carries the split
+    description (``feature``, ``threshold``, ``info_gain``) and its two
+    children. Use the ``leaf`` / ``branch`` factories instead of filling
+    the fields by hand.
+    """
+
+    # If leaf node: the value predicted for every row that reaches it.
+    value: Optional[float] = None
+
+    # If branching node: rows with ``row[feature] <= threshold`` go to
+    # ``left``, all other rows go to ``right``.
+    feature: Optional[str] = None
+    threshold: Optional[float] = None
+    left: Optional["Node"] = None
+    right: Optional["Node"] = None
+    info_gain: Optional[float] = None
+
+    # The factories take no ``self``; ``@staticmethod`` makes that explicit
+    # so they also work when reached through an instance.
+    @staticmethod
+    def leaf(value: float) -> "Node":
+        """Create a leaf node that predicts ``value``."""
+        return Node(value=value)
+
+    @staticmethod
+    def branch(feature: str, threshold: float, info_gain: float, left: "Node", right: "Node") -> "Node":
+        """Create a branching node that splits on ``feature <= threshold``."""
+        return Node(
+            feature=feature,
+            threshold=threshold,
+            left=left,
+            right=right,
+            info_gain=info_gain,
+        )
+
+@dataclass
+class SplitResult:
+    """Description of one candidate split together with the partitioned data."""
+
+    # Column that is split on, the threshold used, and the gain achieved.
+    feature: str
+    threshold: float
+    info_gain: float
+
+    # Rows (and their labels) where ``feature <= threshold``.
+    left_X: pd.DataFrame
+    left_Y: pd.Series
+
+    # The remaining rows (and their labels).
+    right_X: pd.DataFrame
+    right_Y: pd.Series
+
+class DecisionTree(BaseEstimator, ClassifierMixin):
+    """Binary decision-tree classifier that splits on ``feature <= threshold``.
+
+    Splits are chosen by maximising the Gini information gain; a leaf
+    predicts the most frequent label of the training rows that reached it.
+
+    Parameters:
+        min_samples_split: minimum number of rows a node must hold before a
+            split is attempted.
+        max_depth: maximum depth of the tree; the root is at depth 0 and no
+            node is created deeper than ``max_depth``.
+        max_thresholds: features with more distinct values than this are not
+            considered for splitting. ``None`` disables the limit.
+    """
+
+    root_node: Optional[Node]
+
+    def __init__(self, min_samples_split=2, max_depth=10, max_thresholds=20) -> None:
+        self.root_node = None
+
+        self.min_samples_split = min_samples_split
+        self.max_depth = max_depth
+        self.max_thresholds = max_thresholds
+
+    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth = 0) -> Node:
+        """Recursively build the (sub)tree for rows ``X`` with labels ``Y``.
+
+        Returns a branching node when a split with positive gain exists and
+        the stopping criteria allow it, otherwise a leaf.
+        """
+        num_samples = np.shape(X)[0]
+        # Strict "<": a node that already sits at max_depth must become a
+        # leaf, otherwise the tree ends up one level deeper than requested.
+        if num_samples >= self.min_samples_split and current_depth < self.max_depth:
+            best_split = self.get_best_split(X, Y)
+
+            if best_split is not None and best_split.info_gain > 0:
+                left_node  = self.build_tree(best_split.left_X , best_split.left_Y , current_depth + 1)
+                right_node = self.build_tree(best_split.right_X, best_split.right_Y, current_depth + 1)
+                return Node.branch(
+                    best_split.feature,
+                    best_split.threshold,
+                    best_split.info_gain,
+                    left_node,
+                    right_node,
+                )
+
+        return Node.leaf(self.calculate_leaf_value(Y))
+
+    def get_best_split(self, X: pd.DataFrame, Y: pd.Series) -> Optional[SplitResult]:
+        """Find the ``feature <= threshold`` split with the highest Gini gain.
+
+        Every distinct value of every eligible column is tried as a
+        threshold. Ties keep the first candidate found. Returns ``None``
+        when no candidate leaves rows on both sides.
+        """
+        best_gain = -math.inf
+        best_feature = None
+        best_threshold = None
+
+        for column_name in X:
+            column = X[column_name]
+            thresholds = column.unique() # TODO: Should probably be cached
+            if self.max_thresholds is not None and len(thresholds) > self.max_thresholds:
+                continue
+
+            for threshold in thresholds:
+                left_rows = column <= threshold
+                # A split that leaves one side empty separates nothing.
+                if not left_rows.any() or left_rows.all():
+                    continue
+
+                # Only the labels are needed to score a candidate; the
+                # feature rows are partitioned once, for the winner.
+                info_gain = self.gini_information_gain(Y, Y[left_rows], Y[~left_rows])
+                if info_gain > best_gain:
+                    best_gain = info_gain
+                    best_feature = column_name
+                    best_threshold = threshold
+
+        if best_gain == -math.inf:
+            return None
+
+        left_X, left_Y, right_X, right_Y = self.split(X, Y, best_feature, best_threshold)
+        return SplitResult(
+            best_feature,
+            best_threshold,
+            best_gain,
+            left_X,
+            left_Y,
+            right_X,
+            right_Y,
+        )
+
+    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
+        """Partition ``X`` and ``Y`` by ``X[column_name] <= threshold``.
+
+        Returns ``(left_X, left_Y, right_X, right_Y)``; the left side holds
+        the rows that satisfy the condition.
+        """
+        left_rows = X[column_name] <= threshold
+        right_rows = ~left_rows
+
+        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]
+
+    def gini_information_gain(self, Y, left_Y, right_Y):
+        """Return the Gini impurity of ``Y`` minus the size-weighted impurity of its two parts."""
+        weight_left = len(left_Y) / len(Y)
+        weight_right = len(right_Y) / len(Y)
+        return self.gini_index(Y) - weight_left*self.gini_index(left_Y) - weight_right*self.gini_index(right_Y)
+        # return self.entropy(Y) - weight_left*self.entropy(left_Y) - weight_right*self.entropy(right_Y)
+
+    def gini_index(self, Y):
+        """Return the Gini impurity ``1 - sum(p_i ** 2)`` over the label frequencies of ``Y``."""
+        _, counts = np.unique(Y, return_counts=True)
+        probabilities = counts / len(Y)
+        return 1 - np.sum(probabilities * probabilities)
+
+    def entropy(self, Y):
+        """Return the Shannon entropy (base 2) of the label distribution of ``Y``."""
+        _, counts = np.unique(Y, return_counts=True)
+        probabilities = counts / len(Y)
+        return -np.sum(probabilities * np.log2(probabilities))
+
+    def calculate_leaf_value(self, Y):
+        """Return the most frequent label in ``Y`` (the first one on ties)."""
+        return Y.mode()[0]
+
+    def fit(self, X, Y):
+        """Train the tree on ``X`` / ``Y`` and return ``self``.
+
+        Returning the estimator and exposing ``classes_`` follows the
+        scikit-learn estimator convention.
+        """
+        self.classes_ = np.unique(Y)
+        self.root_node = self.build_tree(X, Y)
+        return self
+
+    def predict(self, X: pd.DataFrame):
+        """Predict a label for every row of ``X`` by walking the fitted tree."""
+        return X.apply(lambda x: self.make_prediction(x, self.root_node), axis=1)
+
+    def make_prediction(self, x, tree: Node):
+        """Return the prediction for the single row ``x`` from the subtree ``tree``."""
+        if tree.value is not None:
+            return tree.value
+
+        if x[tree.feature] <= tree.threshold:
+            return self.make_prediction(x, tree.left)
+        else:
+            return self.make_prediction(x, tree.right)
+
+    def print(self, tree=None, indent=" "):
+        """Print the tree (or the given subtree) to stdout.
+
+        ``indent`` is both the indentation of the first level and the amount
+        added per level, so the indentation grows linearly with depth.
+        """
+        if tree is None:
+            tree = self.root_node
+
+        if tree is None:
+            print("(tree is not fitted)")
+            return
+
+        self._print_node(tree, indent, indent)
+
+    def _print_node(self, tree: Node, indent: str, step: str):
+        """Print ``tree`` with its children prefixed by ``indent``, deepening by ``step``."""
+        if tree.value is not None:
+            print(tree.value)
+            return
+
+        print(str(tree.feature), "<=", tree.threshold, "?", tree.info_gain)
+        print("%sleft: " % (indent), end="")
+        self._print_node(tree.left, indent + step, step)
+        print("%sright: " % (indent), end="")
+        self._print_node(tree.right, indent + step, step)
+
+    def __sklearn_is_fitted__(self):
+        """Tell scikit-learn's fitted check whether ``fit`` has built a tree."""
+        return self.root_node is not None
--- a/lab1.py
+++ b/lab1.py
@ -177,7 +177,7 @@ if False:
    plt.show()

 # 8.
-if True:
+if False:
    #sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
    sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()
@ -197,4 +197,13 @@ if False:
    normalized_apples["Quality"] = apples["Quality"]

    normalized_apples.to_csv()
-    print(normalized_apples)
+    print(normalized_apples)
+
+# Presumably a histogram bin count; it is not used by the statements below.
+# NOTE(review): this resembles Sturges' rule (1 + 3.322 * log10(n)) but uses
+# the natural logarithm with 3.22 - confirm the intended formula.
+bin_count = int(1 + 3.22 * math.log(len(apples)))
+column_name = "Ripeness"
+# Disabled overrides of the first three values of the column.
+# apples[column_name][0] = 20
+# apples[column_name][1] = 22
+# apples[column_name][2] = 19
+# Draw a box plot of the selected column, titled with the column name.
+apples[column_name].plot.box()
+plt.title(column_name)
+plt.show()
--- a/lab2.py
+++ b/lab2.py
@ -0,0 +1,118 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
+from contextlib import contextmanager
+import time
+from sklearn.tree import export_graphviz
+import graphviz
+
+from sklearn.tree import DecisionTreeClassifier
+
+def load_data():
+    """Load the cleaned apple data set and split it 80/20 into train and test.
+
+    Returns ``(train_x, train_y, test_x, test_y)``; the ``"Quality"`` column
+    is the label and every other column is a feature.
+    """
+    label_column = "Quality"
+
+    def features_and_label(frame):
+        # Separate the label column from the remaining feature columns.
+        return frame.drop(columns=[label_column]), frame[label_column]
+
+    dataset = pd.read_csv("apple_quality_clean.csv")
+    # Fixed seed keeps the split identical between runs.
+    train_part, test_part = train_test_split(dataset, test_size=0.2, random_state=42)
+
+    train_x, train_y = features_and_label(train_part)
+    test_x, test_y = features_and_label(test_part)
+    return train_x, train_y, test_x, test_y
+
+def main_decision_tree():
+    """Train decision trees of depth 1..18 and plot accuracy and fit time per depth.
+
+    Prints the accuracy at the largest depth and the best accuracy with its
+    depth (the smallest depth wins on ties), then shows both plots.
+    """
+    train_X, train_Y, test_X, test_Y = load_data()
+
+    best_accuracy = 0
+    best_depth = 0
+
+    accuracies = []
+    times = []
+    depths = list(range(1, 19))
+    for max_depth in depths:
+        # perf_counter() is monotonic and high-resolution; time.time() can be
+        # too coarse for fits that take only a few milliseconds.
+        start_time = time.perf_counter()
+        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
+        tree.fit(train_X, train_Y)
+        total_time = time.perf_counter() - start_time
+
+        y_prediction = tree.predict(test_X)
+        accuracy = accuracy_score(test_Y, y_prediction)
+        accuracies.append(accuracy)
+        times.append(total_time)
+
+        if accuracy > best_accuracy:
+            best_accuracy = accuracy
+            best_depth = max_depth
+
+    # Accuracy plot
+    print("last accuracy ", accuracies[-1], depths[-1])
+    print("best accuracy ", best_accuracy, best_depth)
+
+    _, axs = plt.subplots(1, 2)
+    axs[0].plot(depths, accuracies)
+    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")
+
+    axs[1].plot(depths, times)
+    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")
+
+    plt.show()
+
+    # The two disabled snippets below refer to `test_apples_y` and `apples`,
+    # which are not defined in this function; rename before re-enabling.
+
+    # Confusion matrix
+    # cm = confusion_matrix(test_apples_y, y_prediction, labels=tree.classes_)
+    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
+    # disp.plot()
+    # plt.show()
+
+    # Visualize tree
+    # class_names = apples["Quality"].unique().astype(str).tolist()
+    # failas = export_graphviz(tree, out_file=None,
+    #                         feature_names=apples.drop(columns=["Quality"]).columns,
+    #                         class_names=class_names,
+    #                         filled=True, rounded=True,
+    #                         special_characters=True)
+    # graph = graphviz.Source(failas)
+    # graph.render()
+
+def main_random_forest():
+    """Train a 5-tree random forest, print its deepest tree depth and accuracy, and show the confusion matrix."""
+    train_X, train_Y, test_X, test_Y = load_data()
+
+    forest = RandomForestClassifier(n_estimators=5, random_state=42)
+    forest.fit(train_X, train_Y)
+    y_prediction = forest.predict(test_X)
+
+    # Depth of the deepest tree in the ensemble, then the test accuracy.
+    deepest = max(member.get_depth() for member in forest.estimators_)
+    print(deepest)
+    print(accuracy_score(test_Y, y_prediction))
+
+    matrix = confusion_matrix(test_Y, y_prediction, labels=forest.classes_)
+    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=forest.classes_).plot()
+    plt.show()
+
+    # Disabled experiment: accuracy as a function of the number of trees.
+    # best_accuracy = 0
+    # best_forest_size = 0
+
+    # accuracies = []
+    # forest_sizes = list(range(3, 9+1, 5))
+    # for forest_size in forest_sizes:
+    #     forest = RandomForestClassifier(forest_size, random_state=42)
+    #     forest.fit(train_X, train_Y)
+
+    #     y_prediction = forest.predict(test_X)
+    #     accuracy = accuracy_score(test_Y, y_prediction)
+    #     accuracies.append(accuracy)
+    #     if accuracy > best_accuracy:
+    #         best_accuracy = accuracy
+    #         best_forest_size = forest_size
+
+    # print("best", best_accuracy, best_forest_size)
+    # plt.plot(forest_sizes, accuracies)
+    # plt.xlabel("Medžių kiekis")
+    # plt.ylabel("Tikslumas")
+    # plt.show()
+
+
+# Script entry: runs the random-forest experiment whenever this module is
+# executed (there is no __main__ guard, so importing it runs it as well).
+main_random_forest()
--- a/requirements.txt
+++ b/requirements.txt