1
0

initial commit

This commit is contained in:
Rokas Puzonas 2024-02-25 23:05:52 +02:00
commit d4d354d9dd
25 changed files with 8233 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
venv

BIN
Atsakaita lab1.docx Normal file

Binary file not shown.

BIN
Lab1 figures/Figure_1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
Lab1 figures/Figure_10.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

BIN
Lab1 figures/Figure_11.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
Lab1 figures/Figure_12.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

BIN
Lab1 figures/Figure_13.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

BIN
Lab1 figures/Figure_14.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

BIN
Lab1 figures/Figure_15.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.1 MiB

BIN
Lab1 figures/Figure_16.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 54 KiB

BIN
Lab1 figures/Figure_17.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 57 KiB

BIN
Lab1 figures/Figure_2.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
Lab1 figures/Figure_3.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
Lab1 figures/Figure_4.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

BIN
Lab1 figures/Figure_5.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
Lab1 figures/Figure_6.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

BIN
Lab1 figures/Figure_7.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
Lab1 figures/Figure_8.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

BIN
Lab1 figures/Figure_9.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

2
README.md Normal file
View File

@ -0,0 +1,2 @@
Apple Quality source: https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality/data

4001
apple_quality.csv Normal file

File diff suppressed because it is too large Load Diff

4001
apple_quality_clean.csv Normal file

File diff suppressed because it is too large Load Diff

28
cleanup_data.py Normal file
View File

@ -0,0 +1,28 @@
import pandas as pd
def to_categorical(column, subdivisions):
    """Bucket a numeric column into the levels ``0 .. subdivisions``.

    The column is min-max scaled to ``[0, subdivisions]`` and each value is
    rounded to a whole number, so the minimum maps to 0 and the maximum maps
    to ``subdivisions``.

    Args:
        column: pandas Series of numeric values.
        subdivisions: Level assigned to the column maximum.

    Returns:
        A Series with the same index as ``column`` holding the (float-valued)
        level of every entry. A constant column maps to level 0.
    """
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value
    if value_range == 0:
        # All values are identical: dividing would give 0/0 (NaN) for every
        # row, so place everything in the lowest level instead.
        return (column - min_value) * 0.0
    return ((column - min_value) / value_range * subdivisions).round(0)
def normalize_column(column):
    """Min-max scale a numeric column to the range [0, 1].

    Args:
        column: pandas Series of numeric values.

    Returns:
        A Series with the same index where the column minimum becomes 0 and
        the maximum becomes 1. A constant column maps to all zeros.
    """
    # Use the Series reductions (which skip NaN) rather than the builtins:
    # builtin min()/max() return NaN whenever the first element is NaN.
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value
    if value_range == 0:
        # Identical values everywhere: avoid 0/0 and return zeros.
        return (column - min_value) * 0.0
    return (column - min_value) / value_range
# Read the raw dataset, drop the 'A_id' column and encode the label as
# 1 (good) / 0 (bad).
apples = pd.read_csv("apple_quality.csv").drop(columns=['A_id'])
apples['Quality'] = apples['Quality'].map({'good': 1, 'bad': 0})

# Derive 0..10 level versions of Juiciness and Ripeness.
for source_name in ('Juiciness', 'Ripeness'):
    apples[source_name + '_categorical'] = to_categorical(apples[source_name], 10)

# Scale every column except the label to [0, 1]; the label is appended
# unchanged as the last column.
features = apples.drop(columns=["Quality"])
normalized_apples = pd.DataFrame(
    {column_name: normalize_column(features[column_name]) for column_name in features}
)
normalized_apples["Quality"] = apples["Quality"]

normalized_apples.to_csv("apple_quality_clean.csv", index=False)

200
lab1.py Normal file
View File

@ -0,0 +1,200 @@
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import math
# Read the raw dataset, drop the 'A_id' column and encode the label as
# 1 (good) / 0 (bad).
apples = pd.read_csv("apple_quality.csv").drop(columns=['A_id'])
apples['Quality'] = apples['Quality'].map({'good': 1, 'bad': 0})
def to_categorical(column, subdivisions):
    """Bucket a numeric column into the levels ``0 .. subdivisions``.

    The column is min-max scaled to ``[0, subdivisions]`` and each value is
    rounded to a whole number, so the minimum maps to 0 and the maximum maps
    to ``subdivisions``.

    Args:
        column: pandas Series of numeric values.
        subdivisions: Level assigned to the column maximum.

    Returns:
        A Series with the same index as ``column`` holding the (float-valued)
        level of every entry. A constant column maps to level 0.
    """
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value
    if value_range == 0:
        # All values are identical: dividing would give 0/0 (NaN) for every
        # row, so place everything in the lowest level instead.
        return (column - min_value) * 0.0
    return ((column - min_value) / value_range * subdivisions).round(0)
# Derive 0..10 level versions of Juiciness and Ripeness for the categorical
# analysis below.
for source_name in ('Juiciness', 'Ripeness'):
    apples[source_name + '_categorical'] = to_categorical(apples[source_name], 10)
# Section 2: data quality report. Disabled; set the condition to True to run.
if False:
    print("2. Kokybės analizė")
    for column_name in apples:
        # Missing values loaded by read_csv are NaN, and `== None` never
        # matches NaN, so the count must come from isna().
        missing_count = apples[column_name].isna().sum()
        print(f"{column_name:11}: {missing_count} trukstamų reikšmių kiekis")

    print("\n* Kardinalumas")
    # Summary statistics of the original features only (derived categorical
    # columns and the label are left out).
    print(apples.drop(columns=["Juiciness_categorical", "Ripeness_categorical", "Quality"]).describe(exclude=['O']))
# Section 3: per-column statistics of the categorical attributes.
# Disabled; set the condition to True to run.
if False:
    print("\n3. Kategoriniai")
    row_total = len(apples)
    for column_name in ('Quality', 'Juiciness_categorical', 'Ripeness_categorical'):
        column = apples[column_name]
        # value_counts() orders values by descending frequency, so the first
        # two entries are the mode and the second mode.
        value_counts = column.value_counts()
        ranked_values = value_counts.index.tolist()
        moda = ranked_values[0]
        moda2 = ranked_values[1]
        moda_count = value_counts.iloc[0]
        moda2_count = value_counts.iloc[1]

        print(f"* {column_name}")
        print(" Bendras reikšmių skaičius: ", column.count())
        print(" Trūkstamų reikšmių procentas: 0%")
        print(" Kardinalumas: ", len(value_counts))
        print(" Moda: ", moda)
        print(" Modos dažnumas: ", moda_count)
        print(" Modos procentinė reikšmė: ", moda_count / row_total * 100)
        print(" 2-Moda: ", moda2)
        print(" 2-Modos dažnumas: ", moda2_count)
        print(" 2-Modos procentinė reikšmė: ", moda2_count / row_total * 100)
# 4. Histogram of every column. Disabled; set the condition to True to run.
if False:
    # Sturges-style bin count. The 3.22 coefficient goes with a base-10
    # logarithm (1 + log2(n) == 1 + 3.32 * log10(n)); math.log is the natural
    # log and would more than double the number of bins.
    bin_count = int(1 + 3.22 * math.log10(len(apples)))
    print(bin_count)
    for column_name in apples:
        apples[column_name].plot.hist(bins=bin_count)
        plt.title(column_name)
        plt.show()
# 7. Relationship plots. Disabled; set the condition to True to run.
if False:
    # 7.1. Scatter plots for selected feature pairs.
    for x_column, y_column in [
        ("Sweetness", "Size"),
        ("Sweetness", "Ripeness"),
        ("Crunchiness", "Ripeness"),
    ]:
        plt.scatter(apples[x_column], apples[y_column], color='blue', alpha=0.5)
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.title(f'{x_column} vs. {y_column}')
        plt.grid()
        plt.show()

    # 7.2. Scatter matrix of the original (non-derived) features.
    pd.plotting.scatter_matrix(apples.drop(columns=['Quality', 'Juiciness_categorical', 'Ripeness_categorical']), alpha=0.2)
    plt.show()

    # 7.3. Bar plots. First: number of rows per quality label.
    sns.barplot(x='Quality', y='Size', data={
        "Quality": [0, 1],
        "Size": [(apples['Quality'] == quality).sum() for quality in (0, 1)],
    })
    plt.show()

    bad_apples = apples[apples['Quality'] == 0]
    good_apples = apples[apples['Quality'] == 1]

    # Then, for each categorical feature, the number of bad and of good
    # apples at every level 0..10 (one figure per feature/quality pair).
    levels = list(range(11))
    for feature in ("Juiciness", "Ripeness"):
        for quality_label, subset in (("bad", bad_apples), ("good", good_apples)):
            level_column = subset[f"{feature}_categorical"]
            sns.barplot(x=feature, y='Quality', data={
                feature: levels,
                "Quality": [(level_column == level).sum() for level in levels],
            })
            plt.title(f"Quality: {quality_label}")
            plt.show()

    # 7.4. Box plots of the continuous features split by quality label.
    # `data` is passed by keyword so the call does not depend on the
    # positional parameter order of sns.boxplot.
    for feature in ("Juiciness", "Ripeness"):
        sns.boxplot(data=apples, x='Quality', y=feature)
        plt.title(f"Quality vs {feature}")
        plt.show()
# 8. Correlation heatmap of the original features (label and derived
# categorical columns excluded).
if True:
    # Replace .corr() with .cov() to plot the covariance matrix instead.
    sns.heatmap(
        apples.drop(
            columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]
        ).corr(),
        annot=True,
        cmap='coolwarm',
        fmt=".2f",
    )
    plt.show()
# 9. Min-max normalization of all features. Disabled; set the condition to
# True to run.
if False:
    def normalize_column(column):
        """Min-max scale a numeric column to the range [0, 1].

        Args:
            column: pandas Series of numeric values.

        Returns:
            A Series with the same index where the column minimum becomes 0
            and the maximum becomes 1. A constant column maps to all zeros.
        """
        # Series reductions skip NaN; builtin min()/max() return NaN whenever
        # the first element is NaN.
        min_value = column.min()
        max_value = column.max()
        value_range = max_value - min_value
        if value_range == 0:
            # Identical values everywhere: avoid 0/0 and return zeros.
            return (column - min_value) * 0.0
        return (column - min_value) / value_range

    # Scale every column except the label; the label is appended unchanged.
    normalized_apples = pd.DataFrame()
    for column_name in apples.drop(columns=["Quality"]):
        normalized_apples[column_name] = normalize_column(apples[column_name])
    normalized_apples["Quality"] = apples["Quality"]

    # to_csv() without a path only returns the CSV text, which was being
    # discarded. Write the file the same way cleanup_data.py does.
    normalized_apples.to_csv("apple_quality_clean.csv", index=False)
    print(normalized_apples)

BIN
requirements.txt Normal file

Binary file not shown.