1
0
intelektikos-pagrindai/lab1.py
2024-03-24 21:13:24 +02:00

209 lines
7.5 KiB
Python

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
# Load the dataset, discard the 'A_id' column and encode the textual quality
# label numerically: 'good' -> 1, 'bad' -> 0 (any other label becomes NaN).
apples = pd.read_csv("apple_quality.csv").drop(columns=["A_id"])
apples = apples.assign(Quality=apples["Quality"].map({"good": 1, "bad": 0}))
def to_categorical(column, subdivisions):
    """Discretise a numeric column into whole-number levels ``0..subdivisions``.

    Values are min-max scaled to ``[0, 1]``, multiplied by ``subdivisions``
    and rounded to the nearest whole number, so the result holds at most
    ``subdivisions + 1`` distinct levels (stored as floats).

    Args:
        column: Numeric pandas Series (or compatible array) to discretise.
        subdivisions: Level assigned to the largest value in ``column``.

    Returns:
        An object shaped like ``column`` holding the level of every element.
        Missing values stay missing. A column whose values are all equal is
        mapped to level 0.
    """
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value
    if value_range == 0:
        # Constant column: scaling would compute 0 / 0 and turn every entry
        # into NaN. Subtracting the minimum yields 0 for present values and
        # keeps NaN for missing ones.
        return (column - min_value).astype(float)
    return ((column - min_value) / value_range * subdivisions).round(0)
# Add a binned (levels 0..10) companion column for each of these attributes.
for column_name in ("Juiciness", "Ripeness"):
    apples[f"{column_name}_categorical"] = to_categorical(apples[column_name], 10)
if False:
    # 2. Data-quality analysis: missing values per column, then summary
    # statistics of the original (non-derived, non-target) columns.
    print("2. Kokybės analizė")
    for column_name in apples:
        # An element-wise ``== None`` comparison is always False in pandas,
        # so it would report 0 even when values are missing; ``isna()``
        # detects both NaN and None.
        missing_count = apples[column_name].isna().sum()
        print(f"{column_name:11}: {missing_count} trukstamų reikšmių kiekis")
    print("\n* Kardinalumas")
    # Object-dtype columns are excluded from the summary.
    print(apples.drop(columns=["Juiciness_categorical", "Ripeness_categorical", "Quality"]).describe(exclude=['O']))
if False:
    # 3. Summary of the categorical attributes: value count, cardinality and
    # the two most frequent values (mode and second mode) with their
    # absolute and percentage frequencies.
    print("\n3. Kategoriniai")
    for column_name in ('Quality', 'Juiciness_categorical', 'Ripeness_categorical'):
        column = apples[column_name]
        frequencies = column.value_counts()
        # value_counts() sorts by descending frequency, so the first two
        # index entries are the mode and the runner-up.
        ranked_values = frequencies.index.tolist()
        mode_value = ranked_values[0]
        second_mode_value = ranked_values[1]
        mode_hits = (column == mode_value).sum()
        second_mode_hits = (column == second_mode_value).sum()

        print(f"* {column_name}")
        print(" Bendras reikšmių skaičius: ", column.count())
        # NOTE(review): the missing-value percentage is hard-coded, not computed.
        print(" Trūkstamų reikšmių procentas: 0%")
        for label, value in (
            (" Kardinalumas: ", len(frequencies)),
            (" Moda: ", mode_value),
            (" Modos dažnumas: ", mode_hits),
            (" Modos procentinė reikšmė: ", mode_hits / len(apples) * 100),
            (" 2-Moda: ", second_mode_value),
            (" 2-Modos dažnumas: ", second_mode_hits),
            (" 2-Modos procentinė reikšmė: ", second_mode_hits / len(apples) * 100),
        ):
            print(label, value)
# 4. One histogram per column, all drawn with the same number of bins.
if False:
    # Bin count is 1 + 3.22 * ln(n), where n is the number of rows.
    # NOTE(review): this resembles Sturges' rule (1 + 3.322 * log10(n)) but
    # uses the natural logarithm - confirm which formula is intended.
    bin_count = int(1 + 3.22 * math.log(len(apples)))
    print(bin_count)
    for column_name, values in apples.items():
        values.plot.hist(bins=bin_count)
        plt.title(column_name)
        plt.show()
# 7. Plots relating attributes to each other and to the quality label.
if False:
    # 7.1. Scatter plots for selected attribute pairs.
    for x_column, y_column in [
        ("Sweetness", "Size"),
        ("Sweetness", "Ripeness"),
        ("Crunchiness", "Ripeness"),
    ]:
        plt.scatter(apples[x_column], apples[y_column], color='blue', alpha=0.5)
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.title(f'{x_column} vs. {y_column}')
        plt.grid()
        plt.show()

    # 7.2. Scatter-plot matrix of every column except the target and the
    # derived binned columns.
    pd.plotting.scatter_matrix(
        apples.drop(columns=['Quality', 'Juiciness_categorical', 'Ripeness_categorical']),
        alpha=0.2,
    )
    plt.show()

    # 7.3. Bar charts.
    # Number of apples per quality class; the "Size" key holds those row
    # counts and is what appears as the y-axis label.
    sns.barplot(x='Quality', y='Size', data={
        "Quality": [0, 1],
        "Size": [(apples['Quality'] == quality).sum() for quality in (0, 1)],
    })
    plt.show()

    bad_apples = apples[apples['Quality'] == 0]
    good_apples = apples[apples['Quality'] == 1]

    # Distribution of the binned levels (0..10) within each quality class.
    # One loop replaces four hand-expanded charts with eleven copy-pasted
    # count expressions each; chart order and titles are unchanged.
    levels = list(range(11))
    for feature in ("Juiciness", "Ripeness"):
        for quality_name, subset in (("bad", bad_apples), ("good", good_apples)):
            binned = subset[f"{feature}_categorical"]
            sns.barplot(x=feature, y='Quality', data={
                feature: levels,
                # Despite the key name, these are row counts per level.
                "Quality": [(binned == level).sum() for level in levels],
            })
            plt.title(f"Quality: {quality_name}")
            plt.show()

    # 7.4. Box plots of an attribute split by quality class. ``data`` is
    # passed by keyword so the call does not depend on it being the first
    # positional parameter.
    sns.boxplot(data=apples, x='Quality', y='Juiciness')
    plt.title("Quality vs Juiciness")
    plt.show()
    sns.boxplot(data=apples, x='Quality', y='Ripeness')
    plt.title("Quality vs Ripeness")
    plt.show()
# 8. Annotated correlation heat map of every column except the target and
# the derived binned columns.
if False:
    remaining_columns = apples.drop(
        columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]
    )
    # Use remaining_columns.cov() here to plot the covariance matrix instead.
    sns.heatmap(remaining_columns.corr(), annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()
# 9. Min-max normalisation of every column except the target.
if False:
    def normalize_column(column):
        """Linearly rescale *column* so its minimum maps to 0 and its maximum to 1."""
        # Series.min()/max() are vectorised and skip NaN; the builtin
        # min()/max() compare element by element and return order-dependent
        # results when NaN is present.
        min_value = column.min()
        max_value = column.max()
        return (column - min_value) / (max_value - min_value)

    normalized_apples = pd.DataFrame()
    for column_name in apples.drop(columns=["Quality"]):
        normalized_apples[column_name] = normalize_column(apples[column_name])
    # The target label is copied over unscaled.
    normalized_apples["Quality"] = apples["Quality"]
    # NOTE(review): to_csv() without a path only returns the CSV text, which
    # is discarded here - pass a file path if the table should be saved.
    normalized_apples.to_csv()
    print(normalized_apples)
# Box plot of a single attribute.
# bin_count is 1 + 3.22 * ln(row count); it is not used by the plot below.
bin_count = int(1 + 3.22 * math.log(len(apples)))
column_name = "Ripeness"
# Optional experiment: overwrite the first three values with 20, 22 and 19
# (presumably to see how extreme values show up in the box plot).
# apples[column_name][0] = 20
# apples[column_name][1] = 22
# apples[column_name][2] = 19
selected_values = apples[column_name]
selected_values.plot.box()
plt.title(column_name)
plt.show()