# intelektikos-pagrindai/lab2.py

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, confusion_matrix, mean_squared_error
from contextlib import contextmanager
import time
from sklearn.tree import export_graphviz
import graphviz

from sklearn.tree import DecisionTreeClassifier

def load_data():
    """Load the apple dataset and split it into train/test features and targets.

    Reads ``apple_quality_clean.csv`` (relative path), holds out 20 % of the
    rows as a test set using a fixed ``random_state`` so the split is
    reproducible, and separates the ``Quality`` column from the remaining
    columns in each part.

    Returns:
        A 4-tuple ``(train_x, train_y, test_x, test_y)``. The ``*_x`` items
        are the frames without the ``Quality`` column; the ``*_y`` items are
        the corresponding ``Quality`` columns.
    """
    target = "Quality"
    dataset = pd.read_csv("apple_quality_clean.csv")
    train_part, test_part = train_test_split(dataset, test_size=0.2, random_state=42)

    def split_features_target(frame):
        # Everything except the target column is treated as a feature.
        return frame.drop(columns=[target]), frame[target]

    train_x, train_y = split_features_target(train_part)
    test_x, test_y = split_features_target(test_part)
    return train_x, train_y, test_x, test_y

def main_decision_tree():
    """Sweep the decision-tree ``max_depth`` from 1 to 18 and plot the results.

    For every depth a ``DecisionTreeClassifier`` is fitted on the training
    split returned by ``load_data()``; the fit duration and the accuracy on
    the test split are recorded. The accuracy at the last depth and the best
    accuracy (with the depth that first reached it) are printed, then two
    side-by-side plots are shown: accuracy vs. depth and fit time vs. depth.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    best_accuracy = 0
    best_depth = 0

    accuracies = []
    times = []
    depths = list(range(1, 19))
    for max_depth in depths:
        tree = DecisionTreeClassifier(max_depth=max_depth, random_state=42)

        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring short intervals; time.time() is wall-clock time, which can
        # be adjusted by the system and may be too coarse for fits that take
        # only a few milliseconds. Only fit() is timed, so the value plotted
        # as training duration contains nothing but training.
        start_time = time.perf_counter()
        tree.fit(train_X, train_Y)
        total_time = time.perf_counter() - start_time

        y_prediction = tree.predict(test_X)
        accuracy = accuracy_score(test_Y, y_prediction)
        accuracies.append(accuracy)
        times.append(total_time)

        # Strict ">" keeps the shallowest depth when several depths tie.
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_depth = max_depth

    # Summary of the sweep
    print("last accuracy ", accuracies[-1], depths[-1])
    print("best accuracy ", best_accuracy, best_depth)

    # Left: accuracy per depth; right: training time per depth
    _, axs = plt.subplots(1, 2)
    axs[0].plot(depths, accuracies)
    axs[0].set(xlabel="Maksimalus gylis", ylabel="Tikslumas")

    axs[1].plot(depths, times)
    axs[1].set(xlabel="Maksimalus gylis", ylabel="Mokymosi trukmė")

    plt.show()

    # Confusion matrix
    # cm = confusion_matrix(test_apples_y, y_prediction, labels=tree.classes_)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=tree.classes_)
    # disp.plot()
    # plt.show()

    # Visualize tree
    # class_names = apples["Quality"].unique().astype(str).tolist()
    # failas = export_graphviz(tree, out_file=None,
    #                         feature_names=apples.drop(columns=["Quality"]).columns,
    #                         class_names=class_names,
    #                         filled=True, rounded=True,
    #                         special_characters=True)
    # graph = graphviz.Source(failas)
    # graph.render()

def main_random_forest():
    """Fit a 5-tree random forest and report its performance on the test split.

    Prints the depth of the deepest tree in the fitted forest, then the
    accuracy on the test split, and finally shows the confusion matrix for
    the test predictions as a plot.
    """
    train_X, train_Y, test_X, test_Y = load_data()

    model = RandomForestClassifier(n_estimators=5, random_state=42).fit(train_X, train_Y)
    predicted = model.predict(test_X)

    # Depth of the deepest individual tree in the ensemble
    deepest = max(member.get_depth() for member in model.estimators_)
    print(deepest)
    print(accuracy_score(test_Y, predicted))

    # Use the forest's own class ordering for both the matrix and its labels
    class_labels = model.classes_
    matrix = confusion_matrix(test_Y, predicted, labels=class_labels)
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=class_labels).plot()
    plt.show()

    # best_accuracy = 0
    # best_forest_size = 0

    # accuracies = []
    # forest_sizes = list(range(3, 9+1, 5))
    # for forest_size in forest_sizes:
    #     forest = RandomForestClassifier(forest_size, random_state=42)
    #     forest.fit(train_X, train_Y)

    #     y_prediction = forest.predict(test_X)
    #     accuracy = accuracy_score(test_Y, y_prediction)
    #     accuracies.append(accuracy)
    #     if accuracy > best_accuracy:
    #         best_accuracy = accuracy
    #         best_forest_size = forest_size

    # print("best", best_accuracy, best_forest_size)
    # plt.plot(forest_sizes, accuracies)
    # plt.xlabel("Medžių kiekis")
    # plt.ylabel("Tikslumas")
    # plt.show()


# Runs the random-forest experiment whenever this module is loaded; there is no
# ``if __name__ == "__main__"`` guard, so importing the file triggers it as well.
main_random_forest()