245 lines
9.5 KiB
Python
245 lines
9.5 KiB
Python
import pandas as pd
|
|
from pandas import DataFrame
|
|
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
import seaborn as sns
|
|
from sklearn.linear_model import LinearRegression
|
|
from sklearn.metrics import r2_score
|
|
|
|
# Load the pre-split bike rental data. The paths are relative, so both CSV
# files are looked up in the current working directory (train first, then test).
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ("bike_train.csv", "bike_test.csv")
)
|
|
|
|
# 1. Dataset overview and a first look at temperature vs. rental count.
if False:  # disabled; set to True to run this section
    # NOTE: the "feature count" is the number of columns of the training
    # frame as loaded, i.e. it includes every column present in the CSV.
    print("Feature count:", train_df.shape[1])
    print("Training set size: ", train_df.shape[0])
    print("Test set size: ", test_df.shape[0])

    temperature = train_df['Temperature']
    rentals = train_df['Rented_Bike_Count']
    plt.scatter(temperature, rentals, color='blue', alpha=0.5)
    plt.xlabel('Temperature')
    plt.ylabel('Rented_Bike_Count')
    plt.title('Temperature vs. Rented_Bike_Count')
    plt.grid()
    plt.show()
|
|
|
|
# 2. Scatter plots of visibility and rainfall against the rental count.
if False:  # disabled; set to True to run this section
    # One figure per feature, shown one after the other.
    for feature in ('Visibility', 'Rainfall'):
        plt.scatter(train_df[feature], train_df['Rented_Bike_Count'], color='blue', alpha=0.5)
        plt.xlabel(feature)
        plt.ylabel('Rented_Bike_Count')
        plt.title(f'{feature} vs. Rented_Bike_Count')
        plt.grid()
        plt.show()
|
|
|
|
# Remove the Date column from both splits; nothing further down in this
# script reads it.
train_df = train_df.drop('Date', axis='columns')
test_df = test_df.drop('Date', axis='columns')
|
|
|
|
# 3. Correlation heatmap plus a closer look at Temperature vs. Dew point.
if False:  # disabled; set to True to run this section
    # The three categorical columns are excluded before computing correlations.
    numeric_part = train_df.drop(columns=["Functioning_Day", "Holiday", "Seasons"])
    sns.heatmap(numeric_part.corr(), annot=True)
    plt.show()

    # Regression on the two temperature-related predictors only.
    predictors = train_df[['Temperature', 'Dew_point_temperature']]
    lm_fit = LinearRegression().fit(predictors, train_df['Rented_Bike_Count'])

    plt.scatter(
        train_df['Temperature'],
        train_df['Dew_point_temperature'],
        color='blue',
        alpha=0.5,
    )
    plt.xlabel('Temperature')
    plt.ylabel('Dew_point_temperature')
    plt.title('Temperature vs. Dew_point_temperature')
    plt.grid()
    plt.show()
|
|
|
|
# plt.scatter(train_df['Visibility'], train_df['Humidity'], color='blue', alpha=0.5)
|
|
# plt.xlabel('Visibility')
|
|
# plt.ylabel('Humidity')
|
|
# plt.title('Visibility vs. Humidity')
|
|
# plt.grid()
|
|
# plt.show()
|
|
|
|
# One-hot encode the categorical columns of both splits in a single pass.
# Encoding each split on its own makes the resulting dummy columns (and the
# category removed by drop_first=True) depend on which categories happen to
# occur in that split, so train and test could end up with different
# feature columns. Encoding the concatenation guarantees identical columns.
_encoded = pd.get_dummies(
    data=pd.concat([train_df, test_df], keys=["train", "test"]),
    drop_first=True,
)
# Selecting on the outer key restores each split with its original row index;
# .copy() makes the later in-place column assignments act on independent frames.
train_df = _encoded.loc["train"].copy()
test_df = _encoded.loc["test"].copy()
del _encoded
|
|
|
|
# 4. Linear regression on all features (the target has not been
#    log-transformed yet at this point of the script).
if False:  # disabled; set to True to run this section
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    lm_fit = LinearRegression().fit(X_train, train_df['Rented_Bike_Count'])

    # Training-data variant of the plot below, kept disabled:
    # plt.scatter(train_df['Rented_Bike_Count'], lm_fit.predict(train_df.drop(columns=['Rented_Bike_Count'])), color='blue', alpha=0.5, s=3)
    # plt.xlabel('True Rented_Bike_Count values (Training data)')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-1000, 2000)
    # plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()

    # Predict once on the test features and reuse the result for the plot
    # and for both metrics.
    y_pred = lm_fit.predict(X_test)

    plt.scatter(test_df['Rented_Bike_Count'], y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True Rented_Bike_Count')
    plt.ylabel('Predicted Rented_Bike_Count')
    # plt.ylim(-1000, 2000)
    plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)  # y = x reference line
    plt.title('Predicted vs. True Rented_Bike_Count')
    plt.grid()
    plt.show()

    r_squared = r2_score(test_df['Rented_Bike_Count'], y_pred)
    print(f'R-squared: {r_squared}')
    squared_errors = (y_pred - test_df['Rented_Bike_Count']) ** 2
    print('RMSE: %.2f' % np.sqrt(np.mean(squared_errors)))
|
|
|
|
|
|
# Replace the target in both splits with log(count + 1), in place.
train_df["Rented_Bike_Count"] = np.log(1 + train_df["Rented_Bike_Count"])
test_df["Rented_Bike_Count"] = np.log(1 + test_df["Rented_Bike_Count"])
|
|
|
|
# 5. Linear regression on all features against the log-transformed target.
if False:  # disabled; set to True to run this section
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    # The target column was already replaced by log(count + 1) right before
    # this section, so it is used as-is. Applying the log again here (as the
    # previous, unused y_train/y_test did) would transform it twice.
    y_train = train_df['Rented_Bike_Count']
    y_test = test_df['Rented_Bike_Count']

    lm_fit = LinearRegression().fit(X_train, y_train)

    y_pred = lm_fit.predict(X_test)
    r_squared = r2_score(y_test, y_pred)
    print(f'R-squared: {r_squared}')
    print('RMSE: %.2f' % np.sqrt(np.mean((y_pred - y_test) ** 2)))

    plt.scatter(y_test, y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True log(Rented_Bike_Count+1)')
    plt.ylabel('Predicted log(Rented_Bike_Count+1)')
    # plt.ylim(-2.5, 10)
    plt.plot([4, 7.5], [4, 7.5], color='red', linewidth=2)  # y = x reference line
    plt.title('Predicted vs. True log(Rented_Bike_Count+1)')
    plt.grid()
    plt.show()
|
|
|
|
def append_plus_column(dataframe: DataFrame, column_a, column_b):
    """Append the element-wise sum of two columns to *dataframe* in place.

    The new column is named ``"<column_a>_plus_<column_b>"``. Nothing is
    returned; the frame passed in is modified.

    The sum is computed on whole columns instead of row by row. Boolean
    columns (e.g. one-hot indicators) are promoted to integers first so that
    the result is always an arithmetic sum (True + True == 2) and does not
    depend on the dtypes of unrelated columns in the frame.
    """
    left, right = (
        series.astype(int) if series.dtype == bool else series
        for series in (dataframe[column_a], dataframe[column_b])
    )
    dataframe[f"{column_a}_plus_{column_b}"] = left + right
|
|
|
|
def append_plus_mul_column(dataframe: DataFrame, column_a, column_b):
    """Append ``a + b + a * b`` of two columns to *dataframe* in place.

    The new column is named ``"<column_a>_plus_mul_<column_b>"``. Nothing is
    returned; the frame passed in is modified.

    The expression is computed on whole columns instead of row by row.
    Boolean columns (e.g. one-hot indicators) are promoted to integers first
    so that the result is always arithmetic and does not depend on the dtypes
    of unrelated columns in the frame.
    """
    left, right = (
        series.astype(int) if series.dtype == bool else series
        for series in (dataframe[column_a], dataframe[column_b])
    )
    dataframe[f"{column_a}_plus_mul_{column_b}"] = left + right + left * right
|
|
|
|
def append_log_column(dataframe: DataFrame, column):
    """Add ``"<column>_log"`` holding the natural log of *column*, in place.

    Nothing is returned; the frame passed in gains the new column.
    """
    new_name = "{}_log".format(column)
    source = dataframe[column]
    dataframe[new_name] = np.log(source)
|
|
|
|
def append_sqrt_column(dataframe: DataFrame, column):
    """Add ``"<column>_sqrt"`` holding the square root of *column*, in place.

    Nothing is returned; the frame passed in gains the new column.
    """
    new_name = "{}_sqrt".format(column)
    source = dataframe[column]
    dataframe[new_name] = np.sqrt(source)
|
|
|
|
def append_square_column(dataframe: DataFrame, column):
    """Add ``"<column>_square"`` holding the square of *column*, in place.

    Nothing is returned; the frame passed in gains the new column.
    """
    new_name = "{}_square".format(column)
    source = dataframe[column]
    dataframe[new_name] = np.square(source)
|
|
|
|
def iter_transformations(dataframe: DataFrame):
    """Yield candidate single-column transformations for each column.

    Every item is a ``(label, transform)`` pair. ``label`` is the name of the
    column the transform adds; ``transform`` takes a DataFrame and appends
    that column in place through the matching ``append_*_column`` helper.

    Which candidates are offered depends on the values found in *dataframe*:

    * ``"<name>_log"``    -- only when every value is strictly positive,
      so the logarithm never produces ``-inf`` or ``NaN`` features;
    * ``"<name>_sqrt"``   -- only when every value is non-negative;
    * ``"<name>_square"`` -- always.
    """
    for name in dataframe.columns:
        values = dataframe[name]

        # ``name=name`` freezes the current column in each callable. Without
        # it the lambdas would all see whatever ``name`` is when they are
        # finally called, which breaks as soon as the generator is
        # materialized (e.g. with list()) before the transforms are applied.
        if (values > 0).all():
            yield (f"{name}_log", lambda df, name=name: append_log_column(df, name))
        if (values >= 0).all():
            yield (f"{name}_sqrt", lambda df, name=name: append_sqrt_column(df, name))
        yield (f"{name}_square", lambda df, name=name: append_square_column(df, name))

        # NOTE: pairwise candidates ("<a>_plus_<b>", "<a>_plus_mul_<b>") are
        # intentionally not generated here.
|
|
|
|
def calc_r2_squared(transform_func):
    """Return the test-set R^2 of a linear model fitted on transformed features.

    Fresh feature frames are derived from the module-level ``train_df`` and
    ``test_df`` (everything except ``'Rented_Bike_Count'``). *transform_func*
    is then called once on each of them, train first, and is expected to
    modify the frame it receives in place -- its return value is ignored.
    A ``LinearRegression`` is fitted on the training features and scored
    with ``r2_score`` on the test features.
    """
    target = 'Rented_Bike_Count'
    features_train = train_df.drop(columns=[target])
    features_test = test_df.drop(columns=[target])

    # drop() returned new frames, so the transform never touches the globals.
    for features in (features_train, features_test):
        transform_func(features)

    model = LinearRegression()
    model.fit(features_train, train_df[target])
    return r2_score(test_df[target], model.predict(features_test))
|
|
|
|
# 6. Sweep over all feature pairs: compare adding "a + b" against adding
#    "a + b + a*b" and list the pairs where the product term helps most.
if False:  # disabled; set to True to run this section
    columns = train_df.drop(columns=['Rented_Bike_Count']).columns
    print(len(columns))

    results = []
    for column_a_idx, column_a in enumerate(columns):
        # Only pairs (a, b) with b after a, so each unordered pair is tried once.
        for column_b in columns[column_a_idx + 1:]:
            # a=/b= defaults freeze the current pair inside each callable.
            r2_plus = calc_r2_squared(
                lambda df, a=column_a, b=column_b: append_plus_column(df, a, b)
            )
            r2_plus_mul = calc_r2_squared(
                lambda df, a=column_a, b=column_b: append_plus_mul_column(df, a, b)
            )
            results.append((column_a, column_b, r2_plus, r2_plus_mul))

    # Ascending by (r2_plus - r2_plus_mul): the pairs with the largest gain
    # from the product term come first.
    results.sort(key=lambda e: e[2] - e[3])
    for column_a, column_b, r2_plus, r2_plus_mul in results[:10]:
        print(column_a, column_b, r2_plus, r2_plus_mul, r2_plus_mul - r2_plus)
|
|
|
|
# for df in [X_train, X_test]:
|
|
#append_plus_mul_column(df, "Humidity", "Visibility")
|
|
#append_plus_column(df, "Humidity", "Visibility")
|
|
|
|
#append_plus_mul_column(df, "Dew_point_temperature", "Rainfall")
|
|
#append_plus_column(df, "Dew_point_temperature", "Rainfall")
|
|
# pass
|
|
|
|
# lm_fit = LinearRegression().fit(X_train, y_train)
|
|
# y_pred = lm_fit.predict(X_test)
|
|
# r_squared = r2_score(y_test, y_pred)
|
|
# print(f'R-squared: {r_squared}')
|
|
|
|
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
|
|
# plt.xlabel('True Rented_Bike_Count values')
|
|
# plt.ylabel('Predicted Rented_Bike_Count values')
|
|
# plt.ylim(-7.5, 10)
|
|
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
|
|
# plt.title('Predicted vs. True Rented_Bike_Count')
|
|
# plt.grid()
|
|
# plt.show()
|
|
|
|
# 7. Try every single-column transformation and list the ten that improve
#    the test R^2 the most over the untransformed baseline.
if True:
    # The baseline does not depend on the candidate, so fit it once instead
    # of refitting the same model for every transformation.
    baseline_r2 = calc_r2_squared(lambda _: _)

    # Each candidate is scored as soon as it is produced by the generator.
    r2_list = [
        (transform_name, baseline_r2, calc_r2_squared(transform_func))
        for transform_name, transform_func in iter_transformations(
            train_df.drop(columns=['Rented_Bike_Count'])
        )
    ]

    # Ascending by (baseline - transformed): largest improvement first.
    r2_list.sort(key=lambda e: e[1] - e[2])
    for transform_name, baseline, transformed in r2_list[:10]:
        print(f"{transform_name:30} {baseline:.6f} {transformed:.6f} {transformed - baseline:.6f}")
|
|
|
|
# X_train = train_df.drop(columns=['Rented_Bike_Count'])
|
|
# X_test = test_df.drop(columns=['Rented_Bike_Count'])
|
|
# y_train = train_df['Rented_Bike_Count']
|
|
# y_test = test_df['Rented_Bike_Count']
|
|
|
|
# for df in [X_train, X_test]:
|
|
# append_sqrt_column(df, "Rainfall")
|
|
# append_sqrt_column(df, "Humidity")
|
|
# append_square_column(df, "Visibility")
|
|
|
|
# lm_fit = LinearRegression().fit(X_train, y_train)
|
|
# y_pred = lm_fit.predict(X_test)
|
|
# r_squared = r2_score(y_test, y_pred)
|
|
# print(f'R-squared: {r_squared}')
|
|
|
|
# plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
|
|
# plt.xlabel('True Rented_Bike_Count values')
|
|
# plt.ylabel('Predicted Rented_Bike_Count values')
|
|
# plt.ylim(-7.5, 10)
|
|
# plt.plot([0, 10], [0, 10], color='red', linewidth=3)
|
|
# plt.title('Predicted vs. True Rented_Bike_Count')
|
|
# plt.grid()
|
|
# plt.show() |