initial commit
1
.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
venv
|
BIN
Atsakaita lab1.docx
Normal file
BIN
Lab1 figures/Figure_1.png
Normal file
After Width: | Height: | Size: 13 KiB |
BIN
Lab1 figures/Figure_10.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
Lab1 figures/Figure_11.png
Normal file
After Width: | Height: | Size: 14 KiB |
BIN
Lab1 figures/Figure_12.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
Lab1 figures/Figure_13.png
Normal file
After Width: | Height: | Size: 15 KiB |
BIN
Lab1 figures/Figure_14.png
Normal file
After Width: | Height: | Size: 16 KiB |
BIN
Lab1 figures/Figure_15.png
Normal file
After Width: | Height: | Size: 1.1 MiB |
BIN
Lab1 figures/Figure_16.png
Normal file
After Width: | Height: | Size: 54 KiB |
BIN
Lab1 figures/Figure_17.png
Normal file
After Width: | Height: | Size: 57 KiB |
BIN
Lab1 figures/Figure_2.png
Normal file
After Width: | Height: | Size: 12 KiB |
BIN
Lab1 figures/Figure_3.png
Normal file
After Width: | Height: | Size: 12 KiB |
BIN
Lab1 figures/Figure_4.png
Normal file
After Width: | Height: | Size: 13 KiB |
BIN
Lab1 figures/Figure_5.png
Normal file
After Width: | Height: | Size: 14 KiB |
BIN
Lab1 figures/Figure_6.png
Normal file
After Width: | Height: | Size: 11 KiB |
BIN
Lab1 figures/Figure_7.png
Normal file
After Width: | Height: | Size: 14 KiB |
BIN
Lab1 figures/Figure_8.png
Normal file
After Width: | Height: | Size: 12 KiB |
BIN
Lab1 figures/Figure_9.png
Normal file
After Width: | Height: | Size: 14 KiB |
2
README.md
Normal file
@ -0,0 +1,2 @@
|
||||
|
||||
Apple Quality source: https://www.kaggle.com/datasets/nelgiriyewithana/apple-quality/data
|
4001
apple_quality.csv
Normal file
4001
apple_quality_clean.csv
Normal file
28
cleanup_data.py
Normal file
@ -0,0 +1,28 @@
|
||||
import pandas as pd
|
||||
|
||||
def to_categorical(column, subdivisions):
    """Bucket a numeric column into levels ``0 .. subdivisions``.

    The column is min-max scaled to ``[0, 1]``, stretched to
    ``[0, subdivisions]`` and rounded to the nearest whole number, so the
    result holds ``subdivisions + 1`` possible levels stored as floats.

    Args:
        column: Numeric column exposing ``.min()``, ``.max()`` and ``.round()``
            (the callers in this file pass pandas Series).
        subdivisions: Highest level value; the minimum maps to 0 and the
            maximum maps to ``subdivisions``.

    Returns:
        A column of the same length holding the rounded level of each value.
        A column whose values are all equal maps to level 0 everywhere.
    """
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value

    if value_range == 0:
        # A constant column would otherwise compute 0 / 0 and become all-NaN.
        # Multiplying by 0.0 yields float zeros and keeps missing values missing.
        return (column - min_value) * 0.0

    return ((column - min_value) / value_range * subdivisions).round(0)
|
||||
|
||||
def normalize_column(column):
    """Min-max scale a numeric column to the ``[0, 1]`` range.

    Args:
        column: Numeric column exposing ``.min()`` and ``.max()`` (the caller
            in this file passes pandas Series).

    Returns:
        A column of the same length where the smallest value becomes 0 and the
        largest becomes 1. A column whose values are all equal maps to 0.
    """
    # Use the column's own reductions instead of the builtin min()/max():
    # on a pandas Series they skip missing values, whereas the builtins
    # compare element by element and a leading NaN poisons the result.
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value

    if value_range == 0:
        # Avoid 0 / 0 for a constant column; keeps missing values missing.
        return (column - min_value) * 0.0

    return (column - min_value) / value_range
|
||||
|
||||
# Load the raw data set without its A_id column and encode the Quality
# label as 1 (good) / 0 (bad).
apples = pd.read_csv("apple_quality.csv").drop(columns=['A_id'])
apples['Quality'] = apples['Quality'].map({'good': 1, 'bad': 0})

# Add an 11-level (0..10) bucketed copy of the two features.
for feature in ('Juiciness', 'Ripeness'):
    apples[f'{feature}_categorical'] = to_categorical(apples[feature], 10)

# Min-max scale every column except the label, then append the label untouched
# so that Quality ends up as the last column of the output file.
features = apples.drop(columns=["Quality"])
normalized_apples = pd.DataFrame({
    column_name: normalize_column(features[column_name])
    for column_name in features
})
normalized_apples["Quality"] = apples["Quality"]

normalized_apples.to_csv("apple_quality_clean.csv", index=False)
|
200
lab1.py
Normal file
@ -0,0 +1,200 @@
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
import seaborn as sns
|
||||
import numpy as np
|
||||
import math
|
||||
|
||||
# Load the raw data set without its A_id column and encode the Quality
# label as 1 (good) / 0 (bad).
apples = pd.read_csv("apple_quality.csv").drop(columns=['A_id'])
apples['Quality'] = apples['Quality'].map({'good': 1, 'bad': 0})
|
||||
|
||||
def to_categorical(column, subdivisions):
    """Bucket a numeric column into levels ``0 .. subdivisions``.

    The column is min-max scaled to ``[0, 1]``, stretched to
    ``[0, subdivisions]`` and rounded to the nearest whole number, so the
    result holds ``subdivisions + 1`` possible levels stored as floats.

    Args:
        column: Numeric column exposing ``.min()``, ``.max()`` and ``.round()``
            (the callers in this file pass pandas Series).
        subdivisions: Highest level value; the minimum maps to 0 and the
            maximum maps to ``subdivisions``.

    Returns:
        A column of the same length holding the rounded level of each value.
        A column whose values are all equal maps to level 0 everywhere.
    """
    min_value = column.min()
    max_value = column.max()
    value_range = max_value - min_value

    if value_range == 0:
        # A constant column would otherwise compute 0 / 0 and become all-NaN.
        # Multiplying by 0.0 yields float zeros and keeps missing values missing.
        return (column - min_value) * 0.0

    return ((column - min_value) / value_range * subdivisions).round(0)
|
||||
|
||||
# Add an 11-level (0..10) bucketed copy of the two features that the
# categorical analysis below works with.
for feature in ('Juiciness', 'Ripeness'):
    apples[f'{feature}_categorical'] = to_categorical(apples[feature], 10)
|
||||
|
||||
if False:
    # Section 2: data-quality report (toggle the condition to run it).
    print("2. Kokybės analizė")
    for column_name in apples:
        # `== None` compares element-wise and is False even for NaN, so it
        # always counted zero; isna() is the actual missing-value test.
        none_count = apples[column_name].isna().sum()
        print(f"{column_name:11}: {none_count} trukstamų reikšmių kiekis")

    # Summary statistics of the original continuous columns only; the derived
    # categorical columns and the label are left out.
    print("\n* Kardinalumas")
    print(apples.drop(columns=["Juiciness_categorical", "Ripeness_categorical", "Quality"]).describe(exclude=['O']))
|
||||
|
||||
if False:
    # Section 3: mode statistics of the categorical columns
    # (toggle the condition to run it).
    print("\n3. Kategoriniai")
    row_total = len(apples)

    for column_name in ['Quality', 'Juiciness_categorical', 'Ripeness_categorical']:
        column = apples[column_name]
        value_counts = column.value_counts()

        # value_counts() sorts by descending frequency, so the first two
        # index entries are the mode and the runner-up.
        ranked_values = value_counts.index.tolist()
        moda = ranked_values[0]
        moda2 = ranked_values[1]

        moda_count = (column == moda).sum()
        moda2_count = (column == moda2).sum()

        print(f"* {column_name}")
        print(" Bendras reikšmių skaičius: ", column.count())
        print(" Trūkstamų reikšmių procentas: 0%")
        for label, value in (
            (" Kardinalumas: ", len(value_counts)),
            (" Moda: ", moda),
            (" Modos dažnumas: ", moda_count),
            (" Modos procentinė reikšmė: ", moda_count / row_total * 100),
            (" 2-Moda: ", moda2),
            (" 2-Modos dažnumas: ", moda2_count),
            (" 2-Modos procentinė reikšmė: ", moda2_count / row_total * 100),
        ):
            print(label, value)
|
||||
|
||||
# 4.
|
||||
if False:
    # Section 4: one histogram per column (toggle the condition to run it).
    #
    # Sturges' rule is k = 1 + log2(n), i.e. 1 + 3.322 * log10(n). The
    # previous form paired the base-10 coefficient with the natural
    # logarithm, which roughly doubles the number of bins.
    bin_count = int(1 + math.log2(len(apples)))
    print(bin_count)
    for column_name in apples:
        apples[column_name].plot.hist(bins=bin_count)
        plt.title(column_name)
        plt.show()
|
||||
|
||||
# 7.
|
||||
if False:
    # Section 7: relationship plots (toggle the condition to run it).

    # 7.1. Scatter plots of selected feature pairs.
    for x_column, y_column in [
        ("Sweetness", "Size"),
        ("Sweetness", "Ripeness"),
        ("Crunchiness", "Ripeness"),
    ]:
        plt.scatter(apples[x_column], apples[y_column], color='blue', alpha=0.5)
        plt.xlabel(x_column)
        plt.ylabel(y_column)
        plt.title(f'{x_column} vs. {y_column}')
        plt.grid()
        plt.show()

    # 7.2. Scatter-plot matrix of the original continuous columns.
    pd.plotting.scatter_matrix(apples.drop(columns=['Quality', 'Juiciness_categorical', 'Ripeness_categorical']), alpha=0.2)
    plt.show()

    # 7.3. Bar plots. First: number of rows per Quality label (the count is
    # plotted under the "Size" key, as before).
    sns.barplot(x='Quality', y='Size', data={
        "Quality": [0, 1],
        "Size": [(apples['Quality'] == label).sum() for label in (0, 1)],
    })
    plt.show()

    bad_apples = apples[apples['Quality'] == 0]
    good_apples = apples[apples['Quality'] == 1]

    # Then: rows per categorical level (0..10), drawn separately for bad and
    # good apples, for Juiciness first and Ripeness second. This loop replaces
    # four hand-enumerated copies of the same eleven count expressions.
    levels = list(range(11))
    for feature in ('Juiciness', 'Ripeness'):
        categorical_column = f'{feature}_categorical'
        for subset, title in (
            (bad_apples, "Quality: bad"),
            (good_apples, "Quality: good"),
        ):
            sns.barplot(x=feature, y='Quality', data={
                feature: levels,
                "Quality": [
                    (subset[categorical_column] == level).sum()
                    for level in levels
                ],
            })
            plt.title(title)
            plt.show()

    # 7.4. Box plots of a continuous feature split by Quality. `data` is
    # passed by keyword because older seaborn releases do not accept it
    # positionally.
    for feature in ('Juiciness', 'Ripeness'):
        sns.boxplot(data=apples, x='Quality', y=feature)
        plt.title(f"Quality vs {feature}")
        plt.show()
|
||||
|
||||
# 8.
|
||||
if True:
    # Section 8: annotated correlation heatmap of the original continuous
    # columns. Swap .corr() for .cov() to plot the covariance matrix instead.
    continuous_columns = apples.drop(
        columns=["Quality", "Ripeness_categorical", "Juiciness_categorical"]
    )
    correlation_matrix = continuous_columns.corr()
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
    plt.show()
|
||||
|
||||
# 9.
|
||||
if False:
    # Section 9: min-max normalisation preview (toggle the condition to run it).

    def normalize_column(column):
        """Min-max scale a numeric column to the ``[0, 1]`` range.

        A column whose values are all equal maps to 0 everywhere.
        """
        # Use the column's own reductions instead of the builtin min()/max():
        # on a pandas Series they skip missing values, whereas the builtins
        # compare element by element and a leading NaN poisons the result.
        min_value = column.min()
        max_value = column.max()
        value_range = max_value - min_value

        if value_range == 0:
            # Avoid 0 / 0 for a constant column; keeps missing values missing.
            return (column - min_value) * 0.0

        return (column - min_value) / value_range

    # Scale every column except the label, then append the label unchanged.
    normalized_apples = pd.DataFrame()
    for column_name in apples.drop(columns=["Quality"]):
        normalized_apples[column_name] = normalize_column(apples[column_name])

    normalized_apples["Quality"] = apples["Quality"]

    # The earlier bare `normalized_apples.to_csv()` call had no path, so it only
    # built a CSV string that was immediately discarded; it has been removed.
    # cleanup_data.py is the script that writes apple_quality_clean.csv.
    print(normalized_apples)
|