import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


def _show_scatter(frame, x_column, y_column):
    """Display a scatter plot of ``frame[y_column]`` against ``frame[x_column]``.

    The axis labels are the column names and the title is
    ``"<x_column> vs. <y_column>"``.
    """
    plt.scatter(frame[x_column], frame[y_column], color='blue', alpha=0.5)
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f'{x_column} vs. {y_column}')
    plt.grid()
    plt.show()


train_df = pd.read_csv("bike_train.csv")
test_df = pd.read_csv("bike_test.csv")

# 1. Dataset dimensions and Temperature vs. Rented_Bike_Count.
#    (Each numbered section is switched on/off through its `if` literal.)
if False:
    n_train_rows, n_columns = train_df.shape
    print("Feature count:", n_columns)
    print("Training set size: ", n_train_rows)
    print("Test set size: ", test_df.shape[0])
    _show_scatter(train_df, 'Temperature', 'Rented_Bike_Count')

# 2. Visibility and Rainfall vs. Rented_Bike_Count.
if False:
    for feature in ('Visibility', 'Rainfall'):
        _show_scatter(train_df, feature, 'Rented_Bike_Count')

# The 'Date' column is removed from both frames before any modelling.
train_df = train_df.drop(columns=['Date'])
test_df = test_df.drop(columns=['Date'])

# 3. Correlation heatmap (without the three categorical columns) and the
#    Temperature / Dew_point_temperature relationship.
if False:
    heatmap_input = train_df.drop(columns=["Functioning_Day", "Holiday", "Seasons"])
    sns.heatmap(heatmap_input.corr(), annot=True)
    plt.show()

    lm_fit = LinearRegression().fit(
        train_df[['Temperature', 'Dew_point_temperature']],
        train_df['Rented_Bike_Count'],
    )
    _show_scatter(train_df, 'Temperature', 'Dew_point_temperature')

    # Disabled plot, kept for reference:
    # _show_scatter(train_df, 'Visibility', 'Humidity')

# One-hot encode the remaining non-numeric columns; drop_first=True omits the
# first level of each encoded column.
train_df = pd.get_dummies(data=train_df, drop_first=True)
test_df = pd.get_dummies(data=test_df, drop_first=True)

# 4.
# Baseline linear regression on the untransformed rental counts, evaluated on
# the test set.
if False:
    target = 'Rented_Bike_Count'
    X_train = train_df.drop(columns=[target])
    X_test = test_df.drop(columns=[target])
    lm_fit = LinearRegression().fit(X_train, train_df[target])

    # Disabled training-data plot, kept for reference:
    # plt.scatter(train_df['Rented_Bike_Count'],
    #             lm_fit.predict(train_df.drop(columns=['Rented_Bike_Count'])),
    #             color='blue', alpha=0.5, s=3)
    # plt.xlabel('True Rented_Bike_Count values (Training data)')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-1000, 2000)
    # plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()

    # Predict once; the same predictions feed both the plot and the metrics.
    y_true = test_df[target]
    y_pred = lm_fit.predict(X_test)

    plt.scatter(y_true, y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True Rented_Bike_Count')
    plt.ylabel('Predicted Rented_Bike_Count')
    # plt.ylim(-1000, 2000)
    # Red reference line where prediction equals truth.
    plt.plot([0, 1000], [0, 1000], color='red', linewidth=3)
    plt.title('Predicted vs. True Rented_Bike_Count')
    plt.grid()
    plt.show()

    r_squared = r2_score(y_true, y_pred)
    print(f'R-squared: {r_squared}')
    print('RMSE: %.2f' % np.sqrt(np.mean((y_pred - y_true) ** 2)))

# From here on the target column holds log(count + 1) in both frames.
for frame in (train_df, test_df):
    frame["Rented_Bike_Count"] = np.log(frame["Rented_Bike_Count"] + 1)

# 5.
# Linear regression on the log-transformed target.
if False:
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    # 'Rented_Bike_Count' already holds log(count + 1) here (the transform is
    # applied to both frames just before this section), so the column is used
    # as-is; taking the log again would double-transform the target.
    y_train = train_df['Rented_Bike_Count']
    y_test = test_df['Rented_Bike_Count']

    lm_fit = LinearRegression().fit(X_train, y_train)
    y_pred = lm_fit.predict(X_test)

    r_squared = r2_score(y_test, y_pred)
    print(f'R-squared: {r_squared}')
    print('RMSE: %.2f' % np.sqrt(np.mean((y_pred - y_test) ** 2)))

    plt.scatter(y_test, y_pred, color='blue', alpha=0.5, s=3)
    plt.xlabel('True log(Rented_Bike_Count+1)')
    plt.ylabel('Predicted log(Rented_Bike_Count+1)')
    # plt.ylim(-2.5, 10)
    # Red reference line where prediction equals truth.
    plt.plot([4, 7.5], [4, 7.5], color='red', linewidth=2)
    plt.title('Predicted vs. True log(Rented_Bike_Count+1)')
    plt.grid()
    plt.show()


def append_plus_column(dataframe: DataFrame, column_a, column_b) -> None:
    """Add column ``"<a>_plus_<b>"`` = a + b to *dataframe* in place."""
    # NOTE(review): evaluated row by row, which is slow on large frames.  A
    # vectorised `dataframe[a] + dataframe[b]` may treat boolean dummy columns
    # differently -- confirm before switching.
    dataframe[f"{column_a}_plus_{column_b}"] = dataframe.apply(
        lambda row: row[column_a] + row[column_b], axis=1
    )


def append_plus_mul_column(dataframe: DataFrame, column_a, column_b) -> None:
    """Add column ``"<a>_plus_mul_<b>"`` = a + b + a*b to *dataframe* in place."""
    # NOTE(review): row-wise for the same reason as append_plus_column.
    dataframe[f"{column_a}_plus_mul_{column_b}"] = dataframe.apply(
        lambda row: row[column_a] + row[column_b] + row[column_a] * row[column_b],
        axis=1,
    )


def append_log_column(dataframe: DataFrame, column) -> None:
    """Add column ``"<column>_log"`` = natural log of *column*, in place."""
    dataframe[f"{column}_log"] = np.log(dataframe[column])


def append_sqrt_column(dataframe: DataFrame, column) -> None:
    """Add column ``"<column>_sqrt"`` = square root of *column*, in place."""
    dataframe[f"{column}_sqrt"] = np.sqrt(dataframe[column])


def append_square_column(dataframe: DataFrame, column) -> None:
    """Add column ``"<column>_square"`` = *column* squared, in place."""
    dataframe[f"{column}_square"] = np.square(dataframe[column])


def iter_transformations(dataframe: DataFrame):
    """Yield candidate single-column transformations for *dataframe*.

    Each item is a ``(label, func)`` pair where ``func(df)`` appends the
    transformed column to ``df`` in place.  For every column:

    * ``log`` is offered only when all values are strictly positive
      (no zeros and no negatives), since the log is otherwise -inf or NaN;
    * ``sqrt`` is offered only when no value is negative;
    * ``square`` is always offered.
    """
    for name in dataframe.columns:
        values = dataframe[name]
        # Vectorised checks instead of Python-level scans over every value.
        contains_zero = bool((values == 0).any())
        contains_negative = bool((values < 0).any())

        # `name=name` binds the current column at definition time, so the
        # yielded callables stay correct even if the generator advances (or is
        # collected into a list) before they are invoked.
        if not contains_zero and not contains_negative:
            yield (f"{name}_log", lambda df, name=name: append_log_column(df, name))
        if not contains_negative:
            yield (f"{name}_sqrt", lambda df, name=name: append_sqrt_column(df, name))
        yield (f"{name}_square", lambda df, name=name: append_square_column(df, name))

        # Disabled pairwise candidates, kept for reference:
        # for other_name in dataframe.columns[column_idx + 1:]:
        #     yield (f"{name}_plus_{other_name}",
        #            lambda df: append_plus_column(df, name, other_name))
        #     yield (f"{name}_plus_mul_{other_name}",
        #            lambda df: append_plus_mul_column(df, name, other_name))


def calc_r2_squared(transform_func):
    """Return the test-set R^2 of a linear model fitted on transformed features.

    Fresh feature frames are derived from the module-level ``train_df`` and
    ``test_df`` (target column removed), *transform_func* is applied to each
    of them in place, and a ``LinearRegression`` fitted on the training
    features is scored with ``r2_score`` on the test features.

    Args:
        transform_func: Callable taking a feature DataFrame and modifying it
            in place; its return value is ignored.

    Returns:
        The R^2 score on the test set.
    """
    # drop() returns new frames, so the in-place transform never touches the
    # module-level data.
    X_train = train_df.drop(columns=['Rented_Bike_Count'])
    X_test = test_df.drop(columns=['Rented_Bike_Count'])
    y_train = train_df['Rented_Bike_Count']
    y_test = test_df['Rented_Bike_Count']

    transform_func(X_train)
    transform_func(X_test)

    lm_fit = LinearRegression().fit(X_train, y_train)
    return r2_score(y_test, lm_fit.predict(X_test))


# 6. Compare an additive (a + b) against an interaction (a + b + a*b) feature
#    for every pair of columns.
if False:
    results = []
    columns = train_df.drop(columns=['Rented_Bike_Count']).columns
    print(len(columns))
    for column_a_idx, column_a in enumerate(columns):
        for column_b in columns[column_a_idx + 1:]:
            r2_plus = calc_r2_squared(
                lambda df: append_plus_column(df, column_a, column_b))
            r2_plus_mul = calc_r2_squared(
                lambda df: append_plus_mul_column(df, column_a, column_b))
            results.append((column_a, column_b, r2_plus, r2_plus_mul))

    # Ascending by (plus - plus_mul): pairs where the interaction term helps
    # the most come first.
    results.sort(key=lambda e: e[2] - e[3])
    for (column_a, column_b, r2_plus, r2_plus_mul) in results[:10]:
        print(column_a, column_b, r2_plus, r2_plus_mul, r2_plus_mul - r2_plus)

    # Disabled follow-up experiment, kept for reference:
    # for df in [X_train, X_test]:
    #     # append_plus_mul_column(df, "Humidity", "Visibility")
    #     # append_plus_column(df, "Humidity", "Visibility")
    #     # append_plus_mul_column(df, "Dew_point_temperature", "Rainfall")
    #     # append_plus_column(df, "Dew_point_temperature", "Rainfall")
    #     pass
    # lm_fit = LinearRegression().fit(X_train, y_train)
    # y_pred = lm_fit.predict(X_test)
    # r_squared = r2_score(y_test, y_pred)
    # print(f'R-squared: {r_squared}')
    # plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
    # plt.xlabel('True Rented_Bike_Count values')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-7.5, 10)
    # plt.plot([0, 10], [0, 10], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()

# 7. Rank single-column transformations by how much they improve test R^2
#    over the untransformed baseline.
if True:
    # The baseline does not depend on the candidate, so fit it only once.
    baseline_r2 = calc_r2_squared(lambda _: _)
    r2_list = [
        (transform_name, baseline_r2, calc_r2_squared(transform_func))
        for (transform_name, transform_func)
        in iter_transformations(train_df.drop(columns=['Rented_Bike_Count']))
    ]
    # Ascending by (baseline - transformed): largest improvement first.
    r2_list.sort(key=lambda e: e[1] - e[2])
    for a in r2_list[:10]:
        print(f"{a[0]:30} {a[1]:.6f} {a[2]:.6f} {a[2]-a[1]:.6f}")

    # Disabled follow-up experiment, kept for reference:
    # X_train = train_df.drop(columns=['Rented_Bike_Count'])
    # X_test = test_df.drop(columns=['Rented_Bike_Count'])
    # y_train = train_df['Rented_Bike_Count']
    # y_test = test_df['Rented_Bike_Count']
    # for df in [X_train, X_test]:
    #     append_sqrt_column(df, "Rainfall")
    #     append_sqrt_column(df, "Humidity")
    #     append_square_column(df, "Visibility")
    # lm_fit = LinearRegression().fit(X_train, y_train)
    # y_pred = lm_fit.predict(X_test)
    # r_squared = r2_score(y_test, y_pred)
    # print(f'R-squared: {r_squared}')
    # plt.scatter(y_train, lm_fit.predict(X_train), color='blue', alpha=0.5)
    # plt.xlabel('True Rented_Bike_Count values')
    # plt.ylabel('Predicted Rented_Bike_Count values')
    # plt.ylim(-7.5, 10)
    # plt.plot([0, 10], [0, 10], color='red', linewidth=3)
    # plt.title('Predicted vs. True Rented_Bike_Count')
    # plt.grid()
    # plt.show()