# Imports
%matplotlib inline
from My_Functions import *
import numpy as np
import pandas as pd
import seaborn as sns
import warnings
from collections import defaultdict
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, r2_score,roc_curve
from sklearn.model_selection import train_test_split, KFold,StratifiedKFold
from sklearn.model_selection import cross_val_score, cross_val_predict,validation_curve
from sklearn.ensemble import RandomForestRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer, RobustScaler,PolynomialFeatures
from sklearn.neighbors import KNeighborsRegressor
from scipy import stats
from scipy.stats import skew, boxcox_normmax
from scipy.special import boxcox1p
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator , MultipleLocator
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.feature_selection import RFE, RFECV
from sklearn.linear_model import LogisticRegression, LinearRegression, OrthogonalMatchingPursuit
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score, accuracy_score
from matplotlib.gridspec import GridSpec
import plotly.tools as tls
import plotly
plotly.tools.set_credentials_file(username='Furqan92', api_key='<PLOTLY_API_KEY>')  # do not commit real credentials
import plotly.plotly as py
from sklearn.decomposition import PCA
from pandas import DataFrame
from sklearn.exceptions import ConvergenceWarning
import datetime  # needed for the date-based train/test splits below
warnings.simplefilter(action='ignore')  # silence warnings (including ConvergenceWarning) for cleaner output
random_seed = 504
tscv = TimeSeriesSplit(n_splits=5)
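TimeSeriesSplit keeps rows in chronological order: each fold trains on an expanding window of past observations and validates on the block that follows, so no fold ever sees the future. A minimal sketch (illustrative only, on toy data, not part of the original notebook) of the fold boundaries it yields:
# Illustrative only: show the expanding train window and the trailing validation block
for fold, (train_idx, val_idx) in enumerate(tscv.split(np.arange(12).reshape(-1, 1))):
    print(f'fold {fold}: train 0..{train_idx.max()}, validate {val_idx.min()}..{val_idx.max()}')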
# Automatically reload My_Functions when the file changes
%load_ext autoreload
%autoreload 1
%aimport My_Functions
%run My_Functions.py
# Load datasets
we_h = pd.read_csv('./weekends1_holi_data_prepared.csv')
we_h['dteday'] = pd.to_datetime(we_h['dteday'], format='%Y-%m-%d')
# Train on 2011 through 2012 Q2; drop the targets and the date so they cannot leak into the features
train_mask = we_h['dteday'] < datetime.datetime(2012, 7, 1)
X_Train_2011_2012Q2 = we_h[train_mask].drop(['cnt', 'casual', 'registered', 'dteday'], axis=1)
Y_cnt_train_2011_2012Q2 = we_h['cnt'][train_mask]
# Test on 2012 Q3
test_mask = ((we_h['dteday'] >= datetime.datetime(2012, 7, 1)) &
             (we_h['dteday'] <= datetime.datetime(2012, 9, 30)))
X_Test_2012Q3 = we_h[test_mask].drop(['cnt', 'casual', 'registered', 'dteday'], axis=1)
Y_cnt_test_2012Q3 = we_h['cnt'][test_mask]
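A quick sanity check (a sketch, not in the original notebook) that the chronological split leaves no overlap between the training period and the Q3 test window:
# Illustrative check: every training date must precede the first test date
assert we_h['dteday'][train_mask].max() < we_h['dteday'][test_mask].min()
print(we_h['dteday'][train_mask].max(), '->', we_h['dteday'][test_mask].min())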
## Linear Regression
lm_parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True]}
lm = GridSearchCV(LinearRegression(),
param_grid=lm_parameters,
cv=tscv,return_train_score=True)
lm.fit(X_Train_2011_2012Q2, Y_cnt_train_2011_2012Q2)
# lm.cv_results_
lm_predictions = lm.predict(X_Test_2012Q3)
lm.score(X_Test_2012Q3, Y_cnt_test_2012Q3)
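For regressors, GridSearchCV.score reports the R2 of the refit best estimator on the data passed in. A small helper (report_grid is a convenience sketch, not part of the original notebook) makes the model comparisons below easier to read:
def report_grid(name, gs, X_test, y_test):
    # gs: a fitted GridSearchCV over a regressor; prints best params, test R2, and test RMSE
    preds = gs.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    print(f'{name}: best params = {gs.best_params_}')
    print(f'{name}: test R2 = {gs.score(X_test, y_test):.3f}, test RMSE = {rmse:.1f}')

report_grid('Linear Regression', lm, X_Test_2012Q3, Y_cnt_test_2012Q3)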
RF_parameters = {'n_estimators': [10, 30, 40, 100],
                 'bootstrap': [True, False],
                 'max_depth': [80, 100, 120],
                 'max_features': ['log2', 16, 32],
                 'min_samples_leaf': [2, 5, 8],
                 'min_samples_split': [8, 10, 15, 18],
                 'random_state': [random_seed],
                 'criterion': ['mse']}
rf = GridSearchCV(RandomForestRegressor(),
param_grid= RF_parameters,
cv=tscv)
rf.fit(X_Train_2011_2012Q2, Y_cnt_train_2011_2012Q2)
# rf.cv_results_
rf_predictions = rf.predict(X_Test_2012Q3)
rf.score(X_Test_2012Q3, Y_cnt_test_2012Q3)
knn_parameters = {'n_neighbors': [1, 5, 10, 15, 30], 'weights': ['uniform'], 'algorithm': ['auto'],
                  'leaf_size': [5, 10, 20, 30], 'p': [1, 2, 3], 'metric': ['minkowski']}
knn = GridSearchCV(KNeighborsRegressor(),
param_grid=knn_parameters,
cv=tscv,return_train_score=True)
knn.fit(X_Train_2011_2012Q2, Y_cnt_train_2011_2012Q2)
# knn.cv_results_
knn_predictions = knn.predict(X_Test_2012Q3)
knn.score(X_Test_2012Q3, Y_cnt_test_2012Q3)
# svr = GridSearchCV(SVR(), cv=tscv,
# param_grid={"kernel" : ['linear', 'rbf', 'poly'],
# "C": [1e0, 1e1, 1e2, 1e3],
# "gamma": np.logspace(-2, 2, 5)})
# svr.fit(X_Train_2011_2012Q2, Y_cnt_train_2011_2012Q2)
# # rf.cv_results_
# svr_predictions = svr.predict(X_Test_2012Q3)
# svr.score(X_Test_2012Q3, Y_cnt_test_2012Q3)
param_grid = {'learning_rate': [0.01, 0.1],
'max_depth': [4,8,12],
'min_child_weight': [3,5,10,20,35,50],
'subsample': [0.5, 0.75],
'colsample_bytree': [0.5, 0.75],
'n_estimators': [100, 300],
'random_state':[random_seed]
}
model = xgb.XGBRegressor()
xg = GridSearchCV(model,
param_grid = param_grid,
cv = tscv,
n_jobs = -1,
scoring = 'r2',
verbose=True)
xg.fit(X_Train_2011_2012Q2, Y_cnt_train_2011_2012Q2)
xg_predictions = xg.predict(X_Test_2012Q3)
xg.score(X_Test_2012Q3, Y_cnt_test_2012Q3)
# Level-0 predictions (linear regression, random forest, XGBoost)
combinedPredictions = pd.DataFrame({'lm_predictions':lm_predictions,'rf_predictions':rf_predictions,'xg_predictions':xg_predictions })
# Concat original data
X_Test_2012Q3.reset_index(drop=True,inplace=True)
combinedPredictions = pd.concat([combinedPredictions,X_Test_2012Q3], axis=1)
# Target data
combinedPredictionsTarget = pd.DataFrame({'target':Y_cnt_test_2012Q3})
#Reset indices
combinedPredictions.reset_index(drop=True,inplace=True)
combinedPredictionsTarget.reset_index(drop=True,inplace=True)
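At this point the level-1 training matrix pairs the three level-0 predictions with the original Q3 features, row-aligned with the target. A quick hedged check (illustrative, not in the original notebook):
# Illustrative: the prediction columns come first, then the original features,
# and the matrix must line up row-for-row with the target
print(combinedPredictions.columns[:3].tolist())
assert len(combinedPredictions) == len(combinedPredictionsTarget)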
# Get Q4 data: 2012-10-01 through 2012-12-31
Q4_mask = ((we_h['dteday'] >= datetime.datetime(2012, 10, 1)) &
           (we_h['dteday'] <= datetime.datetime(2012, 12, 31)))
X_Test_Q4 = we_h[Q4_mask].drop(['cnt', 'casual', 'registered', 'dteday'], axis=1)
Y_cnt_test_Q4 = we_h['cnt'][Q4_mask]
original_cnt = Y_cnt_test_Q4
#get level-0 predictions for Q4 data
Q4_lm_predictions = pd.DataFrame(lm.predict(X_Test_Q4), columns=["lm_predictions"])
Q4_rf_predictions = pd.DataFrame(rf.predict(X_Test_Q4), columns=["rf_predictions"])
##Q4_knn_predictions = pd.DataFrame(knn.predict(X_Test_Q4), columns=["knn_predictions"])
Q4_xg_predictions = pd.DataFrame(xg.predict(X_Test_Q4), columns=["xg_predictions"])
X_Test_Q4.reset_index(drop=True,inplace=True)
Q4_lm_predictions.reset_index(drop=True,inplace=True)
Q4_rf_predictions.reset_index(drop=True,inplace=True)
Q4_xg_predictions.reset_index(drop=True,inplace=True)
Y_cnt_test_Q4.reset_index(drop=True,inplace=True)
#concat Q4 with level-0 predictions
X_Test_Q4_with_predictions = pd.concat([Q4_lm_predictions, Q4_rf_predictions,Q4_xg_predictions,X_Test_Q4], axis=1)
Y_cnt_test_Q4 = pd.DataFrame({"target": Y_cnt_test_Q4})
print('lm  Q4 R2:', lm.score(X_Test_Q4, Y_cnt_test_Q4))
print('rf  Q4 R2:', rf.score(X_Test_Q4, Y_cnt_test_Q4))
print('knn Q4 R2:', knn.score(X_Test_Q4, Y_cnt_test_Q4))
print('xgb Q4 R2:', xg.score(X_Test_Q4, Y_cnt_test_Q4))
# Level-1 (stacking) XGBoost: reuse the same hyper-parameter grid as the level-0 search
param_grid = {'learning_rate': [0.01, 0.1],
'max_depth': [4,8,12],
'min_child_weight': [3,5,10,20,35,50],
'subsample': [0.5, 0.75],
'colsample_bytree': [0.5, 0.75],
'n_estimators': [100, 300],
'random_state':[random_seed]
}
model = xgb.XGBRegressor()
xg1 = GridSearchCV(model,
param_grid = param_grid,
cv = tscv,
n_jobs = -1,
scoring = 'r2',
verbose=True)
xg1.fit(combinedPredictions, combinedPredictionsTarget['target'])
xg1_predictions = xg1.predict(X_Test_Q4_with_predictions)
xg1.score(X_Test_Q4_with_predictions, Y_cnt_test_Q4)
The score decreased drastically, so after some research Stochastic Gradient Descent seemed like an appropriate level-1 model to try in the stack: it also acts on the errors through lasso, ridge, and elastic-net penalties, and it exposes a tunable learning rate. In fact it gave a better score than the level-1 XGBoost, but still not better than the level-0 XGBoost.
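One caveat worth noting: SGDRegressor is sensitive to feature scale, so it is commonly wrapped in a pipeline with a scaler. A hedged sketch using the already-imported make_pipeline and RobustScaler (an alternative to the unscaled grid search below, not what the notebook originally ran):
# Sketch only: scale the stacked features before SGD, then compare against the unscaled search below
sgd_pipe = make_pipeline(RobustScaler(), SGDRegressor(random_state=random_seed))
sgd_pipe.fit(combinedPredictions, combinedPredictionsTarget['target'])
print('scaled SGD R2:', sgd_pipe.score(X_Test_Q4_with_predictions, Y_cnt_test_Q4['target']))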
# Grid search - this will take about 1 minute.
param_grid = {
'alpha': 10.0 ** -np.arange(1, 7),
'loss': ['squared_loss', 'huber', 'epsilon_insensitive'],
'penalty': ['l2', 'l1', 'elasticnet'],
'learning_rate': ['constant', 'optimal', 'invscaling'],
}
clf = GridSearchCV(SGDRegressor(random_state=random_seed), param_grid, cv=tscv)
clf.fit(combinedPredictions, combinedPredictionsTarget['target'])
clf_predictions = clf.predict(X_Test_Q4_with_predictions)
clf.score(X_Test_Q4_with_predictions, Y_cnt_test_Q4)
The score decreased after stacking, so it seems appropriate to stick with the level-0 XGBoost model.
work_predictions=pd.read_csv('pred_working.csv')
work_predictions.columns = ['dteday','prediction','target']
not_work_predictions = pd.concat((pd.DataFrame(xg.predict(X_Test_Q4)) , Y_cnt_test_Q4), axis=1)
not_work_predictions.columns = ['prediction','target']
print('The final R2 score for the combined predictions is:')
print(r2_score(np.concatenate((not_work_predictions['target'],
                               work_predictions['target'])),
               np.concatenate((not_work_predictions['prediction'],
                               work_predictions['prediction']))))
The stacking ensemble was useful for working days, where the R2 score improved by 0.01, but surprisingly R2 dropped for holidays. Research suggested Stochastic Gradient Descent as an alternative level-1 model, since it also acts on the errors through lasso, ridge, and elastic-net penalties and exposes a tunable learning rate. In fact, it gave a better score (0.88) than the level-1 XGBoost, but still not better than the level-0 XGBoost.
This means the predictions appended to the original features were not useful to the level-1 model and may have led to overfitting during training. The final decision was to keep the stacking ensemble for working days, which produced an R2 score of 0.901, but not for holidays, where only the level-0 XGBoost was used, yielding a score of 0.8905. After combining the two sets of predictions, the final R2 score was 0.897.
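For reference, the per-segment scores behind the combined number can be recomputed directly from the two prediction frames (a hedged sketch; the exact values depend on the runs above):
# Illustrative: per-segment R2 alongside the combined score printed above
print('working days R2:', r2_score(work_predictions['target'], work_predictions['prediction']))
print('holidays R2:   ', r2_score(not_work_predictions['target'], not_work_predictions['prediction']))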
date = we_h['dteday'][Q4_mask].reset_index(drop=True)  # dteday was already parsed to datetime above
work_predictions.reset_index(drop=True,inplace=True)
not_work_predictions=pd.concat([date,not_work_predictions],axis=1)
# Keep 'dteday' as a regular column: the melt below needs it
combined = pd.concat([work_predictions, not_work_predictions], ignore_index=True)
df = combined.melt('dteday', var_name='cols', value_name='vals')
df["dteday"]=df["dteday"].map(lambda x: pd.to_datetime(x))
mpl_fig = plt.figure()
ax = plt.axes()
sns.lineplot(x="dteday", y="vals", hue='cols', data=df, ax=ax)
plotly_fig=tls.mpl_to_plotly(mpl_fig)
plotly_fig['layout']['xaxis'] = {
'tickmode': 'auto',
'nticks': 3
}
plotly_fig['layout']['showlegend'] = False
plotly_fig['layout']['width'] = 800
plotly_fig['layout']['title'] = 'Predictions'
py.iplot(plotly_fig,filename='Predictions')