complete lab2

2024-03-24 21:13:24 +02:00 · 2024-03-24 21:13:24 +02:00 · c86d4ca749
commit c86d4ca749
parent bd67f94727
7 changed files with 298 additions and 3 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-venv
+venv
 __pycache__
--- a/lab2.docx
+++ b/lab2.docx
--- a/medis.pdf
+++ b/medis.pdf
--- a/decision_tree.py
+++ b/decision_tree.py
@ -0,0 +1,167 @@
 import pandas as pd
 import numpy as np
 import math
 from dataclasses import dataclass
 from typing import Optional
 from sklearn.base import BaseEstimator, ClassifierMixin
@dataclass
class Node:
    """One node of the decision tree: either a leaf or a branching node.

    A leaf only carries ``value``. A branching node carries the split that
    was chosen for it (``feature``, ``threshold``, ``info_gain``) and its
    two children; ``value`` stays ``None`` for branching nodes, which is how
    the tree walker tells the two kinds apart.
    """

    # If leaf node: the value predicted for samples that end up here.
    value: Optional[float] = None

    # If branching node: the column tested, the value it is compared against,
    # the two subtrees and the information gain the split achieved.
    feature: Optional[str] = None
    threshold: Optional[float] = None
    left: Optional["Node"] = None
    right: Optional["Node"] = None
    info_gain: Optional[float] = None

    # Both constructors are static: they take no ``self`` and must behave the
    # same whether they are called on the class or on an existing instance.
    @staticmethod
    def leaf(value: float) -> "Node":
        """Build a leaf node that predicts ``value``."""
        return Node(value=value)

    @staticmethod
    def branch(feature: str, threshold: float, info_gain: float, left: "Node", right: "Node") -> "Node":
        """Build a branching node for the given split and its two children."""
        return Node(
            feature=feature,
            threshold=threshold,
            left=left,
            right=right,
            info_gain=info_gain,
        )
@dataclass
 class SplitResult:
    # Outcome of one candidate split, as produced by
    # DecisionTree.get_best_split.
    # Name of the column of X the split was made on.
    feature: str
    # Value the column is compared against with `<=`.
    threshold: float
    # Gain computed by DecisionTree.gini_information_gain for this split.
    info_gain: float
    # Rows (and their labels) whose column value is <= threshold.
    left_X: pd.DataFrame
    left_Y: pd.Series
    # All remaining rows (and their labels).
    right_X: pd.DataFrame
    right_Y: pd.Series
 class DecisionTree(BaseEstimator, ClassifierMixin):
    """Binary decision-tree classifier that works on pandas data.

    Every branching node tests ``column <= threshold``; splits are chosen by
    maximising the Gini information gain and leaves predict the most frequent
    label of the samples that reached them.

    Parameters:
        min_samples_split: a node with fewer samples than this becomes a leaf.
        max_depth: maximum number of splits on any root-to-leaf path.
        max_thresholds: columns with more unique values than this are not
            considered for splitting (keeps the exhaustive threshold search
            cheap); ``None`` disables the limit.
    """

    root_node: Optional[Node]

    def __init__(self, min_samples_split=2, max_depth=10, max_thresholds=20) -> None:
        self.root_node = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.max_thresholds = max_thresholds

    def build_tree(self, X: pd.DataFrame, Y: pd.Series, current_depth=0):
        """Recursively build the subtree for the samples ``X``/``Y``.

        Returns a branching node when a split with positive gain exists and
        the stopping criteria allow it, otherwise a leaf.
        """
        # Strict `<`: a node sitting at depth `max_depth` must stay a leaf,
        # otherwise the finished tree is one level deeper than requested.
        may_split = len(X) >= self.min_samples_split and current_depth < self.max_depth
        if may_split:
            best_split = self.get_best_split(X, Y)
            if best_split is not None and best_split.info_gain > 0:
                left_node = self.build_tree(best_split.left_X, best_split.left_Y, current_depth + 1)
                right_node = self.build_tree(best_split.right_X, best_split.right_Y, current_depth + 1)
                return Node.branch(
                    best_split.feature,
                    best_split.threshold,
                    best_split.info_gain,
                    left_node,
                    right_node,
                )
        return Node.leaf(self.calculate_leaf_value(Y))

    def get_best_split(self, X: pd.DataFrame, Y: pd.Series) -> Optional[SplitResult]:
        """Try every (column, unique value) pair and return the best split.

        Returns ``None`` when no candidate leaves samples on both sides.
        """
        best_split = None
        best_gain = -math.inf
        for column_name in X:
            thresholds = X[column_name].unique()  # TODO: Should probably be cached
            if self.max_thresholds is not None and len(thresholds) > self.max_thresholds:
                continue
            for threshold in thresholds:
                left_X, left_Y, right_X, right_Y = self.split(X, Y, column_name, threshold)
                # A split that sends everything to one side separates nothing.
                if left_X.empty or right_X.empty:
                    continue
                info_gain = self.gini_information_gain(Y, left_Y, right_Y)
                if info_gain > best_gain:
                    best_gain = info_gain
                    best_split = SplitResult(
                        column_name,
                        threshold,
                        info_gain,
                        left_X,
                        left_Y,
                        right_X,
                        right_Y,
                    )
        return best_split

    def split(self, X: pd.DataFrame, Y: pd.Series, column_name: str, threshold: float):
        """Partition ``X``/``Y`` into rows with ``column <= threshold`` and the rest.

        Returns ``(left_X, left_Y, right_X, right_Y)``.
        """
        left_rows = X[column_name] <= threshold
        right_rows = ~left_rows
        return X[left_rows], Y[left_rows], X[right_rows], Y[right_rows]

    def gini_information_gain(self, Y, left_Y, right_Y):
        """Return the drop in Gini impurity achieved by splitting ``Y`` in two."""
        weight_left = len(left_Y) / len(Y)
        weight_right = len(right_Y) / len(Y)
        # Entropy-based alternative: replace gini_index with entropy below.
        return self.gini_index(Y) - weight_left * self.gini_index(left_Y) - weight_right * self.gini_index(right_Y)

    def gini_index(self, Y):
        """Return the Gini impurity ``1 - sum(p_i ** 2)`` of the labels ``Y``."""
        # One counting pass instead of re-scanning Y once per distinct label.
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / len(Y)
        return 1 - np.sum(probabilities * probabilities)

    def entropy(self, Y):
        """Return the Shannon entropy (in bits) of the labels ``Y``."""
        _, counts = np.unique(Y, return_counts=True)
        probabilities = counts / len(Y)
        return -np.sum(probabilities * np.log2(probabilities))

    def calculate_leaf_value(self, Y):
        """Return the most frequent label in ``Y`` (first one on ties)."""
        return Y.mode().iloc[0]

    def fit(self, X, Y):
        """Build the tree from training data and return ``self``.

        Returning the estimator follows the scikit-learn ``fit`` convention
        and allows call chaining.
        """
        self.root_node = self.build_tree(X, Y)
        return self

    def predict(self, X: pd.DataFrame):
        """Predict a label for every row of ``X``.

        Raises ``sklearn.exceptions.NotFittedError`` if ``fit`` has not been
        called yet.
        """
        # Local import: the module itself only imports sklearn.base.
        from sklearn.utils.validation import check_is_fitted

        # Uses __sklearn_is_fitted__ below; gives a clear error instead of an
        # attribute lookup failing on None.
        check_is_fitted(self)
        return X.apply(lambda row: self.make_prediction(row, self.root_node), axis=1)

    def make_prediction(self, x, tree: Node):
        """Walk from ``tree`` down to a leaf for the sample ``x`` and return its value."""
        node = tree
        # Only leaves carry a value; branching nodes have value None.
        while node.value is None:
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value

    def print(self, tree=None, indent=" "):
        """Print the subtree rooted at ``tree`` (the whole tree by default)."""
        if tree is None:
            tree = self.root_node
        if tree.value is not None:
            print(tree.value)
        else:
            print(str(tree.feature), "<=", tree.threshold, "?", tree.info_gain)
            # NOTE(review): the indent doubles on every level, so deep trees
            # get very wide output; kept as-is to preserve the format.
            print("%sleft: " % (indent), end="")
            self.print(tree.left, indent + indent)
            print("%sright: " % (indent), end="")
            self.print(tree.right, indent + indent)

    def __sklearn_is_fitted__(self):
        """Report whether ``fit`` has built a tree (hook used by ``check_is_fitted``)."""
        return self.root_node is not None
--- a/lab1.py
+++ b/lab1.py
@ -177,7 +177,7 @@ if False:
    plt.show()
 # 8.
-if True:
+if False:
    #sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).cov(), annot=True, cmap='coolwarm', fmt=".2f")
    sns.heatmap(apples.drop(columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]).corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()
@ -197,4 +197,13 @@ if False:
    normalized_apples["Quality"] = apples["Quality"]
    normalized_apples.to_csv()
-    print(normalized_apples)
+    print(normalized_apples)
 # Histogram bin count by Sturges' rule: k = 1 + log2(n) = 1 + 3.322 * log10(n).
 # The logarithm must be base 10 for the 3.322 factor to apply; math.log is
 # the natural logarithm and would give roughly 2.2 times too many bins.
 bin_count = int(1 + 3.322 * math.log10(len(apples)))
 column_name = "Ripeness"
 # Uncomment to inject artificial outliers and see how the box plot reacts:
 # apples[column_name][0] = 20
 # apples[column_name][1] = 22
 # apples[column_name][2] = 19
 # Box plot of the selected column.
 apples[column_name].plot.box()
 plt.title(column_name)
 plt.show()
--- a/lab2.py
+++ b/lab2.py
@ -0,0 +1,118 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 import numpy as np
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
 from contextlib import contextmanager
 import time
 from sklearn.tree import export_graphviz
 import graphviz
 from sklearn.tree import DecisionTreeClassifier
 def load_data(path="apple_quality_clean.csv", target="Quality", test_size=0.2, random_state=42):
    """Load the dataset and split it into train/test features and labels.

    Parameters:
        path: CSV file to read.
        target: name of the label column; every other column is a feature.
        test_size: share of the rows put into the test set.
        random_state: seed for the split, so runs are reproducible.

    Returns:
        ``(train_X, train_Y, test_X, test_Y)``.

    The defaults reproduce the original hard-coded behaviour.
    """
    apples = pd.read_csv(path)
    train_apples, test_apples = train_test_split(
        apples, test_size=test_size, random_state=random_state
    )
    # print(f"Training size: {len(train_apples)}")
    # print(f"Test size: {len(test_apples)}")
    train_apples_x, train_apples_y = train_apples.drop(columns=[target]), train_apples[target]
    test_apples_x, test_apples_y = test_apples.drop(columns=[target]), test_apples[target]
    return train_apples_x, train_apples_y, test_apples_x, test_apples_y
 def main_decision_tree():
    """Train decision trees of increasing depth and plot accuracy and fit time.

    For every ``max_depth`` in 1..18 a ``DecisionTreeClassifier`` is fitted on
    the training split, its test accuracy and training time are recorded, the
    last and the best result are printed, and both curves are plotted side by
    side.
    """
    train_X, train_Y, test_X, test_Y = load_data()
    best_accuracy = 0
    best_depth = 0
    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring durations; time.time() can jump when the system clock is
        # adjusted and may be too coarse for short fits.
        start_time = time.perf_counter()
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time
        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)
        # Strict `>` keeps the shallowest depth among equally accurate trees.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth
    # Accuracy plot
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)
    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")
    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")
    plt.show()
    # Optional extras for the last tree trained above (names match this scope).
    # Confusion matrix
    # cm = confusion_matrix(test_Y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()
    # Visualize tree
    # class_names = [str(label) for label in tree.classes_]
    # failas = export_graphviz(tree, out_file=None,
    #                         feature_names=train_X.columns,
    #                         class_names=class_names,
    #                         filled=True, rounded=True,
    #                         special_characters=True)
    # graph = graphviz.Source(failas)
    # graph.render()
 def main_random_forest():
    """Fit a 5-tree random forest and report how it does on the test split.

    Prints the depth of the deepest tree in the forest and the test accuracy,
    then shows the confusion matrix.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    forest = RandomForestClassifier(n_estimators=5, random_state=42)
    forest.fit(train_X, train_Y)
    predicted = forest.predict(test_X)

    deepest = max(member.get_depth() for member in forest.estimators_)
    print(deepest)
    print(accuracy_score(test_Y, predicted))

    matrix = confusion_matrix(test_Y, predicted, labels=forest.classes_)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=forest.classes_)
    display.plot()
    plt.show()

    # Disabled experiment: accuracy as a function of the number of trees.
    # best_accuracy = 0
    # best_forest_size = 0
    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    #     forest = RandomForestClassifier(forest_size, random_state=42)
    #     forest.fit(train_X, train_Y)
    #     y_prediction = forest.predict(test_X)
    #     accuracy = accuracy_score(test_Y, y_prediction)
    #     accuracies.append(accuracy)
    #     if accuracy > best_accuracy:
    #         best_accuracy = accuracy
    #         best_forest_size = forest_size
    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()
 # Script entry point: runs the random-forest experiment as soon as the module
 # is executed. NOTE(review): there is no `if __name__ == "__main__":` guard,
 # so importing this module also runs it.
 main_random_forest()
--- a/requirements.txt
+++ b/requirements.txt