machine-learing-methods/Lab1/2.1/main.py
2023-11-30 20:24:45 +02:00

245 lines
9.5 KiB
Python

import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
# Load the training and test splits from CSV files in the working directory.
train_df = pd.read_csv("bike_train.csv")
test_df = pd.read_csv("bike_test.csv")

# 1. Data set dimensions and Temperature vs. Rented_Bike_Count scatter plot.
#    Disabled by default; flip the condition to run this section.
if False:
    # shape is (rows, columns); the column count includes every CSV column.
    train_row_count, train_column_count = train_df.shape
    print("Feature count:", train_column_count)
    print("Training set size: ", train_row_count)
    print("Test set size: ", test_df.shape[0])

    temperature = train_df['Temperature']
    rented_count = train_df['Rented_Bike_Count']
    plt.scatter(temperature, rented_count, color='blue', alpha=0.5)
    plt.xlabel('Temperature')
    plt.ylabel('Rented_Bike_Count')
    plt.title('Temperature vs. Rented_Bike_Count')
    plt.grid()
    plt.show()
# 2. Scatter plots of Visibility and Rainfall against Rented_Bike_Count.
#    Disabled by default; flip the condition to run this section.
if False:
    # One figure per feature, shown in the same order as before.
    for feature in ('Visibility', 'Rainfall'):
        plt.scatter(train_df[feature], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
        plt.xlabel(feature)
        plt.ylabel('Rented_Bike_Count')
        plt.title(f'{feature} vs. Rented_Bike_Count')
        plt.grid()
        plt.show()

# Remove the 'Date' column from both splits; every later section works on
# the frames without it.
train_df, test_df = (frame.drop(columns=['Date']) for frame in (train_df, test_df))
# 3. Correlation heatmap and the Temperature / Dew_point_temperature relation.
#    Disabled by default; flip the condition to run this section.
if False:
    # These three columns are left out of the correlation matrix.
    excluded_columns = ["Functioning_Day", "Holiday", "Seasons"]
    correlations = train_df.drop(columns=excluded_columns).corr()
    sns.heatmap(correlations, annot=True)
    plt.show()

    # Regression on the two temperature features only.
    lm_fit = LinearRegression().fit(
        train_df[['Temperature', 'Dew_point_temperature']],
        train_df['Rented_Bike_Count'],
    )

    plt.scatter(
        train_df['Temperature'],
        train_df['Dew_point_temperature'],
        color='blue',
        alpha=0.5,
    )
    plt.xlabel('Temperature')
    plt.ylabel('Dew_point_temperature')
    plt.title('Temperature vs. Dew_point_temperature')
    plt.grid()
    plt.show()

    # Disabled alternative plot (Visibility vs. Humidity):
    # plt.scatter(train_df['Visibility'], train_df['Humidity'], color='blue', alpha=0.5)
    # plt.xlabel('Visibility')
    # plt.ylabel('Humidity')
    # plt.title('Visibility vs. Humidity')
    # plt.grid()
    # plt.show()

# Replace the non-numeric columns with indicator columns in both splits;
# drop_first=True omits the first level of each encoded column.
train_df = pd.get_dummies(data=train_df, drop_first=True)
test_df = pd.get_dummies(data=test_df, drop_first=True)
# 4. Linear regression on all features, evaluated on the test split.
#    Disabled by default; flip the condition to run this section.
if False:
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
    # Predict once and reuse the result for both the plot and the metrics.
    y_pred = lm_fit.predict(X_test)

    # Disabled plot of predictions on the training data:
    # plt.scatter(train_df['Rented_Bike_Count'], lm_fit.predict(train_df.drop(columns=['Rented_Bike_Count'])), color='blue', alpha=0.5, s=3)
    # plt.xlabel('True Rented_Bike_Count values (Training data)')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-1000, 2000)
    # plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()

    plt.scatter(test_df['Rented_Bike_Count'], y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True Rented_Bike_Count')
    plt.ylabel('Predicted Rented_Bike_Count')
    #plt.ylim(-1000, 2000)
    # Red reference line where predicted == true.
    plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
    plt.title('Predicted vs. True Rented_Bike_Count')
    plt.grid()
    plt.show()

    r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
    print(f'R-squared: {r_squared}')
    print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))

# From here on the target column holds log(count + 1) in both splits.
train_df["Rented_Bike_Count"] = np.log(train_df["Rented_Bike_Count"] + 1)
test_df["Rented_Bike_Count"] = np.log(test_df["Rented_Bike_Count"] + 1)
# 5. Linear regression on the log-transformed target.
#    Disabled by default; flip the condition to run this section.
if False:
    # 'Rented_Bike_Count' already holds log(count + 1) at this point (see the
    # assignments just before this section), so it is used directly as the
    # target. Taking the logarithm again here would double-transform it.
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])
    y_pred = lm_fit.predict(X_test)

    r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
    print(f'R-squared: {r_squared}')
    print('RMSE: %.2f' % np.sqrt(np.mean((y_pred-test_df['Rented_Bike_Count'])**2)))

    plt.scatter(test_df['Rented_Bike_Count'], y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True log(Rented_Bike_Count+1)')
    plt.ylabel('Predicted log(Rented_Bike_Count+1)')
    #plt.ylim(-2.5, 10)
    # Red reference line where predicted == true.
    plt.plot([4, 7.5], [4, 7.5], color='red', linewidth=2)
    plt.title('Predicted vs. True log(Rented_Bike_Count+1)')
    plt.grid()
    plt.show()
def append_plus_column(dataframe: DataFrame, column_a, column_b):
    """Add the column ``<column_a>_plus_<column_b>`` holding ``a + b``.

    The frame is modified in place and nothing is returned.

    Args:
        dataframe: Frame that contains both source columns.
        column_a: Name of the first source column.
        column_b: Name of the second source column.
    """
    # Whole-column arithmetic replaces the per-row Python callback.
    # Multiplying by 1 promotes boolean columns to integers, so `+` remains
    # an arithmetic sum (True + True == 2) instead of a logical OR.
    left = dataframe[column_a] * 1
    right = dataframe[column_b] * 1
    dataframe[f"{column_a}_plus_{column_b}"] = left + right
def append_plus_mul_column(dataframe: DataFrame, column_a, column_b):
    """Add the column ``<column_a>_plus_mul_<column_b>`` holding ``a + b + a*b``.

    The frame is modified in place and nothing is returned.

    Args:
        dataframe: Frame that contains both source columns.
        column_a: Name of the first source column.
        column_b: Name of the second source column.
    """
    # Whole-column arithmetic replaces the per-row Python callback.
    # Multiplying by 1 promotes boolean columns to integers, so `+` and `*`
    # stay arithmetic instead of turning into logical OR / AND.
    left = dataframe[column_a] * 1
    right = dataframe[column_b] * 1
    dataframe[f"{column_a}_plus_mul_{column_b}"] = left + right + left * right
def append_log_column(dataframe: DataFrame, column):
    """Add the column ``<column>_log`` holding the natural log of ``column``.

    The frame is modified in place and nothing is returned.
    """
    source_values = dataframe[column]
    target_name = f"{column}_log"
    dataframe[target_name] = np.log(source_values)
def append_sqrt_column(dataframe: DataFrame, column):
    """Add the column ``<column>_sqrt`` holding the square root of ``column``.

    The frame is modified in place and nothing is returned.
    """
    source_values = dataframe[column]
    target_name = f"{column}_sqrt"
    dataframe[target_name] = np.sqrt(source_values)
def append_square_column(dataframe: DataFrame, column):
    """Add the column ``<column>_square`` holding the square of ``column``.

    The frame is modified in place and nothing is returned.
    """
    source_values = dataframe[column]
    target_name = f"{column}_square"
    dataframe[target_name] = np.square(source_values)
def iter_transformations(dataframe: DataFrame):
    """Yield candidate single-column transformations for every column.

    For each column of ``dataframe``, in column order, this yields
    ``(label, transform)`` pairs. ``label`` is the name of the column the
    transform would add (``<name>_log``, ``<name>_sqrt``, ``<name>_square``).
    ``transform`` is a callable that takes a frame and appends that column
    to it in place.

    Which transforms are offered for a column:
      * log    - only when the column has no zero and no negative value
      * sqrt   - only when the column has no negative value
      * square - always

    Args:
        dataframe: Frame whose columns are inspected. It is not modified here.

    Yields:
        Tuples ``(label, transform)`` as described above.
    """
    for name in dataframe.columns:
        values = dataframe[name]
        column_contains_zero = bool((values == 0).any())
        column_contains_negative = bool((values < 0).any())

        # The logarithm of a negative number is NaN, so the log transform
        # requires strictly positive values, not merely "no zeros".
        if not column_contains_zero and not column_contains_negative:
            # `column=name` freezes the current column name in the callable.
            # A plain closure would see whatever `name` is when the callable
            # finally runs, e.g. after the generator was collected in a list.
            yield (f"{name}_log", lambda df, column=name: append_log_column(df, column))
        if not column_contains_negative:
            yield (f"{name}_sqrt", lambda df, column=name: append_sqrt_column(df, column))
        yield (f"{name}_square", lambda df, column=name: append_square_column(df, column))
        # NOTE: pairwise combinations of columns are not generated here.
def calc_r2_squared(transform_func):
    """Return the test-set R² of a linear regression after a feature transform.

    Fresh feature frames are derived from the module-level ``train_df`` and
    ``test_df`` by dropping the 'Rented_Bike_Count' column. ``transform_func``
    is called on the training features first and then on the test features,
    and is expected to modify the frame it receives in place. A
    ``LinearRegression`` is fitted on the training features and scored with
    ``r2_score`` on the test split.

    Args:
        transform_func: Callable taking one feature frame.

    Returns:
        The R² score on the test split.
    """
    target = 'Rented_Bike_Count'
    train_features = train_df.drop(columns=[target])
    test_features = test_df.drop(columns=[target])

    # Same call order as before: training features, then test features.
    for features in (train_features, test_features):
        transform_func(features)

    model = LinearRegression().fit(train_features, train_df[target])
    predictions = model.predict(test_features)
    return r2_score(test_df[target], predictions)
# 6. For every pair of feature columns, compare the test R² obtained by
#    adding an "a + b" column with the one obtained by adding "a + b + a*b".
#    Disabled by default; flip the condition to run this section.
if False:
    results = []
    columns = train_df.drop(columns=['Rented_Bike_Count']).columns
    print(len(columns))
    for column_a_idx, column_a in enumerate(columns):
        # Only pair with later columns so each unordered pair is tried once.
        for column_b in columns[column_a_idx + 1:]:
            # Defaults freeze the current pair inside each callable.
            r2_plus = calc_r2_squared(
                lambda df, a=column_a, b=column_b: append_plus_column(df, a, b))
            r2_plus_mul = calc_r2_squared(
                lambda df, a=column_a, b=column_b: append_plus_mul_column(df, a, b))
            results.append((column_a, column_b, r2_plus, r2_plus_mul))

    # Ascending (r2_plus - r2_plus_mul): pairs where the product term helps
    # the most come first.
    results.sort(key=lambda e: e[2]-e[3])
    for (column_a, column_b, r2_plus, r2_plus_mul) in results[:10]:
        print(column_a, column_b, r2_plus, r2_plus_mul, r2_plus_mul - r2_plus)

    # Disabled follow-up experiment, retained as written:
    # for df in [X_train, X_test]:
    #     #append_plus_mul_column(df, "Humidity", "Visibility")
    #     #append_plus_column(df, "Humidity", "Visibility")
    #     #append_plus_mul_column(df, "Dew_point_temperature", "Rainfall")
    #     #append_plus_column(df, "Dew_point_temperature", "Rainfall")
    #     pass
    # lm_fit = LinearRegression().fit(X_train, y_train)
    # y_pred = lm_fit.predict(X_test)
    # r_squared = r2_score(y_test, y_pred)
    # print(f'R-squared: {r_squared}')
    # plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
    # plt.xlabel('True Rented_Bike_Count values')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-7.5, 10)
    # plt.plot([0, 10], [0, 10], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()
# 7. Rank single-column transformations (log / sqrt / square) by how much
#    they change the test R² relative to the untransformed features.
if True:
    # The identity transform leaves the features untouched, so this is the
    # baseline score. It does not depend on the candidate transformation and
    # is therefore fitted once instead of once per candidate.
    baseline_r2 = calc_r2_squared(lambda _: _)

    r2_list = [
        (transform_name, baseline_r2, calc_r2_squared(transform_func))
        for (transform_name, transform_func)
        in iter_transformations(train_df.drop(columns=['Rented_Bike_Count']))
    ]

    # Ascending (baseline - transformed): largest improvement first.
    r2_list.sort(key=lambda e: e[1]-e[2])
    for a in r2_list[:10]:
        # Columns: transformation, baseline R², transformed R², gain.
        print(f"{a[0]:30} {a[1]:.6f} {a[2]:.6f} {a[2]-a[1]:.6f}")

    # Disabled follow-up experiment, retained as written:
    # X_train = train_df.drop(columns=['Rented_Bike_Count'])
    # X_test = test_df.drop(columns=['Rented_Bike_Count'])
    # y_train = train_df['Rented_Bike_Count']
    # y_test = test_df['Rented_Bike_Count']
    # for df in [X_train, X_test]:
    #     append_sqrt_column(df, "Rainfall")
    #     append_sqrt_column(df, "Humidity")
    #     append_square_column(df, "Visibility")
    # lm_fit = LinearRegression().fit(X_train, y_train)
    # y_pred = lm_fit.predict(X_test)
    # r_squared = r2_score(y_test, y_pred)
    # print(f'R-squared: {r_squared}')
    # plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
    # plt.xlabel('True Rented_Bike_Count values')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-7.5, 10)
    # plt.plot([0, 10], [0, 10], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()