"""Exploratory analysis of the apple quality data set.

Loads ``apple_quality.csv``, drops the ``A_id`` column, encodes ``Quality``
as 1 (good) / 0 (bad), adds binned copies of ``Juiciness`` and ``Ripeness``,
and then runs whichever report/plot sections are enabled through the
``RUN_*`` flags below. A box plot of ``Ripeness`` is always shown at the end.
"""
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Section switches. All are off, so only the final box plot runs.
RUN_QUALITY_REPORT = False       # 2. missing values and summary statistics
RUN_CATEGORICAL_REPORT = False   # 3. per-column mode statistics
RUN_HISTOGRAMS = False           # 4. one histogram per column
RUN_RELATIONSHIP_PLOTS = False   # 7. scatter, bar and box plots
RUN_CORRELATION_HEATMAP = False  # 8. correlation matrix heatmap
RUN_NORMALIZATION = False        # 9. min-max normalisation

# Number of steps used when binning a continuous column; levels are 0..SUBDIVISIONS.
SUBDIVISIONS = 10

# Columns that are reported as categorical and excluded from the numeric plots.
CATEGORICAL_COLUMNS = ['Quality', 'Juiciness_categorical', 'Ripeness_categorical']


def normalize_column(column: pd.Series) -> pd.Series:
    """Min-max scale *column* to the range [0, 1].

    Uses the Series' own ``min``/``max``. A column whose minimum equals its
    maximum yields NaN because the range is zero.
    """
    min_value = column.min()
    max_value = column.max()
    return (column - min_value) / (max_value - min_value)


def to_categorical(column: pd.Series, subdivisions: int) -> pd.Series:
    """Bin *column* into the levels ``0..subdivisions``.

    The column is min-max scaled, multiplied by *subdivisions* and rounded,
    so the result holds float values 0.0, 1.0, ..., ``subdivisions``.
    """
    return (normalize_column(column) * subdivisions).round(0)


def sturges_bin_count(sample_size: int) -> int:
    """Return the histogram bin count from Sturges' rule, ``1 + log2(n)``.

    The result is truncated to an int. Sample sizes below 1 give a single bin
    instead of raising on ``log2(0)``.
    """
    if sample_size < 1:
        return 1
    return int(1 + math.log2(sample_size))


def report_data_quality(frame: pd.DataFrame) -> None:
    """Print the missing-value count per column and summary statistics."""
    print("2. Kokybės analizė")
    for column_name in frame:
        # isna() is required here: comparing with ``== None`` never matches in pandas.
        none_count = frame[column_name].isna().sum()
        print(f"{column_name:11}: {none_count} trukstamų reikšmių kiekis")

    print("\n* Kardinalumas")
    print(frame.drop(columns=CATEGORICAL_COLUMNS).describe(exclude=['O']))


def report_categorical_columns(frame: pd.DataFrame) -> None:
    """Print count, missing share, cardinality and the two top modes per column."""
    print("\n3. Kategoriniai")
    total_rows = len(frame)
    for column_name in CATEGORICAL_COLUMNS:
        column = frame[column_name]
        value_counts = column.value_counts()  # sorted by frequency, NaN excluded
        missing_percent = column.isna().mean() * 100

        print(f"* {column_name}")
        print(" Bendras reikšmių skaičius: ", column.count())
        print(f" Trūkstamų reikšmių procentas: {missing_percent:g}%")
        print(" Kardinalumas: ", len(value_counts))
        # zip() stops after two entries, or earlier when the column has fewer
        # than two distinct values.
        for prefix, (mode, frequency) in zip(("", "2-"), value_counts.items()):
            print(f" {prefix}Moda: ", mode)
            print(f" {prefix}Modos dažnumas: ", frequency)
            print(f" {prefix}Modos procentinė reikšmė: ", frequency / total_rows * 100)


def plot_histograms(frame: pd.DataFrame) -> None:
    """Print the Sturges bin count and show one histogram per column."""
    bin_count = sturges_bin_count(len(frame))
    print(bin_count)
    for column_name in frame:
        frame[column_name].plot.hist(bins=bin_count)
        plt.title(column_name)
        plt.show()


def plot_level_counts(subset: pd.DataFrame, feature: str, title: str) -> None:
    """Show a bar plot of how many rows of *subset* fall into each bin of *feature*.

    Reads the ``<feature>_categorical`` column. The y axis keeps the label
    ``Quality`` although the bars are row counts.
    """
    levels = list(range(SUBDIVISIONS + 1))
    binned = subset[f"{feature}_categorical"]
    sns.barplot(x=feature, y='Quality', data={
        feature: levels,
        "Quality": [(binned == level).sum() for level in levels],
    })
    plt.title(title)
    plt.show()


def plot_relationships(frame: pd.DataFrame) -> None:
    """Show the scatter, scatter-matrix, bar and box plots of section 7."""
    # 7.1. Pairwise scatter plots for selected feature pairs.
    for x_column, y_column in [
        ("Sweetness", "Size"),
        ("Sweetness", "Ripeness"),
        ("Crunchiness", "Ripeness"),
    ]:
        plt.scatter(frame[x_column], frame[y_column], color='blue', alpha=0.5)
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.title(f'{x_column} vs. {y_column}')
        plt.grid()
        plt.show()

    # 7.2. Scatter matrix over the non-categorical columns.
    pd.plotting.scatter_matrix(frame.drop(columns=CATEGORICAL_COLUMNS), alpha=0.2)
    plt.show()

    # 7.3. Row counts per quality class, then per binned level within each class.
    bad_apples = frame[frame['Quality'] == 0]
    good_apples = frame[frame['Quality'] == 1]

    sns.barplot(x='Quality', y='Size', data={
        "Quality": [0, 1],
        "Size": [len(bad_apples), len(good_apples)],
    })
    plt.show()

    for feature in ("Juiciness", "Ripeness"):
        plot_level_counts(bad_apples, feature, "Quality: bad")
        plot_level_counts(good_apples, feature, "Quality: good")

    # 7.4. Box plots of a continuous feature split by quality class.
    # ``data`` is passed by keyword so the call does not depend on seaborn's
    # positional parameter order.
    for feature in ("Juiciness", "Ripeness"):
        sns.boxplot(data=frame, x='Quality', y=feature)
        plt.title(f"Quality vs {feature}")
        plt.show()


def plot_correlation_heatmap(frame: pd.DataFrame) -> None:
    """Show an annotated heatmap of the correlation matrix of the numeric columns.

    Swap ``.corr()`` for ``.cov()`` to plot the covariance matrix instead.
    """
    sns.heatmap(
        frame.drop(columns=CATEGORICAL_COLUMNS).corr(),
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
    )
    plt.show()


def normalize_features(frame: pd.DataFrame) -> pd.DataFrame:
    """Min-max normalise every column except ``Quality``, print and return the result."""
    normalized_apples = pd.DataFrame()
    for column_name in frame.drop(columns=["Quality"]):
        normalized_apples[column_name] = normalize_column(frame[column_name])
    normalized_apples["Quality"] = frame["Quality"]

    # NOTE(review): to_csv() without a path only returns the CSV text, which is
    # discarded here. Pass a file name if the result is meant to be saved.
    normalized_apples.to_csv()
    print(normalized_apples)
    return normalized_apples


apples = pd.read_csv("apple_quality.csv")
del apples['A_id']
# Values other than 'good'/'bad' become NaN.
apples['Quality'] = apples['Quality'].map({'good': 1, 'bad': 0})

apples['Juiciness_categorical'] = to_categorical(apples['Juiciness'], SUBDIVISIONS)
apples['Ripeness_categorical'] = to_categorical(apples['Ripeness'], SUBDIVISIONS)

if RUN_QUALITY_REPORT:
    report_data_quality(apples)

if RUN_CATEGORICAL_REPORT:
    report_categorical_columns(apples)

# 4.
if RUN_HISTOGRAMS:
    plot_histograms(apples)

# 7.
if RUN_RELATIONSHIP_PLOTS:
    plot_relationships(apples)

# 8.
if RUN_CORRELATION_HEATMAP:
    plot_correlation_heatmap(apples)

# 9.
if RUN_NORMALIZATION:
    normalize_features(apples)

# NOTE(review): the original indentation was lost; this tail is assumed to be
# top-level code that always runs. Confirm against the author's version.
bin_count = sturges_bin_count(len(apples))
column_name = "Ripeness"
# To see how outliers affect the box plot, overwrite a few values first,
# e.g. ``apples.loc[0, column_name] = 20``.
apples[column_name].plot.box()
plt.title(column_name)
plt.show()