#manipulating/analyzing data
import numpy as np
import pandas as pd

#data visualization
import seaborn as sns
import matplotlib.pyplot as plt

#statistics
import scipy.stats as stats

#splitting data into train and test
from sklearn.model_selection import train_test_split

#making logistic regression model
import statsmodels.stats.api as sms
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from sklearn.linear_model import LogisticRegression

#making decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#tuning hyperparameters
from sklearn.model_selection import GridSearchCV

#deriving metrics to assess model performance
from sklearn.metrics import (f1_score, accuracy_score, recall_score, precision_score, 
                             ConfusionMatrixDisplay, roc_auc_score, confusion_matrix, 
                             precision_recall_curve, roc_curve, make_scorer)

#manipulating the number of rows and columns displayed
pd.set_option('display.max_columns', None) #no limit on the number of columns displayed
pd.set_option('display.max_rows', 200) #at most 200 rows are displayed

#preventing warnings from being displayed
#NOTE(review): this hides every warning for the rest of the run, presumably including the
#statsmodels convergence warnings (the first Logit summary further down reports converged: False)
import warnings
warnings.filterwarnings('ignore')


#loading the bookings stored in the csv file into the hotels dataframe
hotels = pd.read_csv(filepath_or_buffer='INNHotelsGroup.csv')


#previewing the top 5 rows of hotels
hotels.head(n=5)


#previewing the bottom 5 rows of hotels
hotels.tail(n=5)


#checking the dimensions of hotels as (rows, columns)
hotels.shape

(36275, 19)


#printing the column names, non-null counts, datatypes and memory usage of hotels
hotels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36275 entries, 0 to 36274
Data columns (total 19 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Booking_ID                            36275 non-null  object 
 1   no_of_adults                          36275 non-null  int64  
 2   no_of_children                        36275 non-null  int64  
 3   no_of_weekend_nights                  36275 non-null  int64  
 4   no_of_week_nights                     36275 non-null  int64  
 5   type_of_meal_plan                     36275 non-null  object 
 6   required_car_parking_space            36275 non-null  int64  
 7   room_type_reserved                    36275 non-null  object 
 8   lead_time                             36275 non-null  int64  
 9   arrival_year                          36275 non-null  int64  
 10  arrival_month                         36275 non-null  int64  
 11  arrival_date                          36275 non-null  int64  
 12  market_segment_type                   36275 non-null  object 
 13  repeated_guest                        36275 non-null  int64  
 14  no_of_previous_cancellations          36275 non-null  int64  
 15  no_of_previous_bookings_not_canceled  36275 non-null  int64  
 16  avg_price_per_room                    36275 non-null  float64
 17  no_of_special_requests                36275 non-null  int64  
 18  booking_status                        36275 non-null  object 
dtypes: float64(1), int64(13), object(5)
memory usage: 5.3+ MB


#viewing the statistical summary for all columns in hotels
#include='all' adds the object (text) columns to the summary alongside the numerical ones
hotels.describe(include='all')


#counting the rows in hotels that are exact duplicates of an earlier row
hotels.duplicated().sum()

0


#counting the missing (null) values in each column of hotels
hotels.isnull().sum()

Booking_ID                              0
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64


#drawing one thistle-coloured histogram per column of hotels, titled with the column name
#Booking_ID is left out because it is confirmed that each row has a unique value
for column_name in hotels.columns:
    if column_name == 'Booking_ID':
        continue #no histogram for the identifier column
    sns.histplot(data=hotels, x=column_name, color='thistle') #column values go on the x-axis
    if column_name == 'room_type_reserved':
        plt.xticks(rotation=90) #only this column gets its x-axis labels rotated 90 degrees
    plt.title(column_name)
    plt.show(); #display histogram


#keeping only the numerical columns of hotels in hotels_numerical
hotels_numerical = hotels.select_dtypes(include=np.number)
#heatmap of the pairwise correlations between the columns of hotels_numerical
#shades of blue on a fixed scale from -1 to 1/each cell labelled with its value to 2 decimal places
#annotation font size is 9 to avoid overcrowding
sns.heatmap(
    data=hotels_numerical.corr(),
    vmin=-1,
    vmax=1,
    cmap='Blues',
    annot=True,
    fmt='.2f',
    annot_kws={'fontsize': 9},
)
plt.show(); #displaying heatmap


#comparing booking_status with every other column using histograms coloured by booking_status
#Booking_ID and booking_status itself (already used as the hue) are not plotted
for i in hotels.columns: #for each column in hotels
    if i in ('Booking_ID', 'booking_status'):
        continue #skip
    sns.histplot(data=hotels, x=i, hue='booking_status') #plot i on x-axis/hue is set to booking_status
    if i == 'room_type_reserved':
        plt.xticks(rotation=90) #rotate x-axis labels 90 degrees for this column only
    #same title format for every column, room_type_reserved included
    plt.title('booking_status vs. ' + i)
    plt.show(); #displaying histogram


#histogram of the arrival_month column in hotels (lightsteelblue bars)
#title and axis labels are set on the axes object returned by seaborn
sns.histplot(data=hotels, x='arrival_month', color='lightsteelblue').set(
    title='Distribution of Arrival Months',
    xlabel='Arrival Month',
    ylabel='Number of Bookings',
)
plt.show(); #displaying histogram


#histogram of the market_segment_type column in hotels (lightsteelblue bars)
sns.histplot(data=hotels, x='market_segment_type', color='lightsteelblue').set(
    title='Distribution of Market Segment Type',
    xlabel='Market Segment Type',
    ylabel='Number of Bookings',
)
plt.show(); #displaying histogram


#scatterplot of avg_price_per_room (y-axis) against market_segment_type (x-axis)
#points are drawn in lightsteelblue
sns.scatterplot(
    data=hotels,
    x='market_segment_type',
    y='avg_price_per_room',
    color='lightsteelblue',
).set(
    title='Average Room Price vs. Market Segment Type',
    xlabel='Market Segment Type',
    ylabel='Average Room Price',
)
plt.show(); #displaying scatterplot


#percentage of all bookings that were canceled
#(rows of hotels with booking_status equal to Canceled / all rows of hotels) * 100
len(hotels.loc[hotels['booking_status'].eq('Canceled')]) / len(hotels) * 100

32.76361130254997


#keeping only the bookings made by a repeated guest in a variable called repeated
repeated = hotels.loc[hotels['repeated_guest'].eq(1)]
#percentage of the repeated-guest bookings that were canceled
#(canceled rows of repeated / all rows of repeated) * 100
len(repeated.loc[repeated['booking_status'].eq('Canceled')]) / len(repeated) * 100

1.7204301075268817


#keeping only the canceled bookings in a variable called Canceled
Canceled = hotels.loc[hotels['booking_status'].eq('Canceled')]
#histogram of no_of_special_requests for the canceled bookings only (lightsteelblue bars)
#shows how the cancelations are distributed over the number of special requests
sns.histplot(data=Canceled, x='no_of_special_requests', color='lightsteelblue').set(
    title='Cancelations vs. Number of Special Requests',
    xlabel='Number of Special Requests',
    ylabel='Cancelations',
)
plt.show(); #displaying histogram


#double-checking the number of missing (null) values in each column of hotels before preprocessing
hotels.isnull().sum()

Booking_ID                              0
no_of_adults                            0
no_of_children                          0
no_of_weekend_nights                    0
no_of_week_nights                       0
type_of_meal_plan                       0
required_car_parking_space              0
room_type_reserved                      0
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                     0
repeated_guest                          0
no_of_previous_cancellations            0
no_of_previous_bookings_not_canceled    0
avg_price_per_room                      0
no_of_special_requests                  0
booking_status                          0
dtype: int64


#hotels1 is hotels without the Booking_ID column/the original dataframe is left unchanged
hotels1 = hotels.drop(columns=['Booking_ID'])


#keeping the numerical columns of hotels1 in hotels1_numerical
hotels1_numerical = hotels1.select_dtypes(include=np.number)
#drawing one boxplot per numerical column, titled with the column name
for numeric_column in hotels1_numerical.columns:
    sns.boxplot(x=numeric_column, data=hotels1_numerical) #column values go on the x-axis
    plt.title(numeric_column)
    plt.show(); #display boxplot


#booking_status is the Y variable/main variable in focus
#encoding it as integers: 1 where the value is Canceled, 0 for every other value (Not_Canceled)
hotels1['booking_status'] = hotels1['booking_status'].eq('Canceled').astype('int64')


#confirming that booking_status now only holds the values 1 and 0
hotels1['booking_status'].value_counts()

0    24390
1    11885
Name: booking_status, dtype: int64


#X holds the predictors: every column of hotels1 apart from booking_status
X = hotels1.drop(columns=['booking_status'])
#Y holds the booking_status column on its own
Y = hotels1.loc[:, 'booking_status']


#displaying X to make sure it has all columns except booking_status
X


#displaying Y to make sure that it only has booking_status
Y

0        0
1        0
2        1
3        1
4        1
        ..
36270    0
36271    1
36272    0
36273    1
36274    0
Name: booking_status, Length: 36275, dtype: int64


#including a constant (intercept) column for the variables in X
X = sm.add_constant(X)
#making dummy variables for the text columns in X/dropping the first level in each set of dummy variables
#dtype=float keeps every column of X numeric: newer pandas versions otherwise create boolean dummies,
#which turn X.values into an object array that the statsmodels calls further down cannot work with
X = pd.get_dummies(X, drop_first=True, dtype=float)


#counting the non-null values of every variable in X to make sure dummy variables are correctly created
X.count()

const                                   36275
no_of_adults                            36275
no_of_children                          36275
no_of_weekend_nights                    36275
no_of_week_nights                       36275
required_car_parking_space              36275
lead_time                               36275
arrival_year                            36275
arrival_month                           36275
arrival_date                            36275
repeated_guest                          36275
no_of_previous_cancellations            36275
no_of_previous_bookings_not_canceled    36275
avg_price_per_room                      36275
no_of_special_requests                  36275
type_of_meal_plan_Meal Plan 2           36275
type_of_meal_plan_Meal Plan 3           36275
type_of_meal_plan_Not Selected          36275
room_type_reserved_Room_Type 2          36275
room_type_reserved_Room_Type 3          36275
room_type_reserved_Room_Type 4          36275
room_type_reserved_Room_Type 5          36275
room_type_reserved_Room_Type 6          36275
room_type_reserved_Room_Type 7          36275
market_segment_type_Complementary       36275
market_segment_type_Corporate           36275
market_segment_type_Offline             36275
market_segment_type_Online              36275
dtype: int64


#dividing the data in X and Y into train and test groups
#a test_size of 0.30 gives a 70:30 train to test ratio/random_state is fixed at 1
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    random_state=1,
    test_size=0.30,
)


#viewing the relative frequency of each value in y_train
y_train.value_counts(normalize=True)

0    0.670644
1    0.329356
Name: booking_status, dtype: float64


#viewing the relative frequency of each value in y_test
y_test.value_counts(normalize=True)

0    0.676376
1    0.323624
Name: booking_status, dtype: float64


#viewing the dimensions of X_train as (rows, columns)
X_train.shape

(25392, 28)


#viewing the dimensions of X_test as (rows, columns)
X_test.shape

(10883, 28)


#checking for any changes in the distribution of variables in hotels1
#drawing one thistle-coloured histogram per column of hotels1, titled with the column name
for column_name in hotels1.columns:
    sns.histplot(data=hotels1, x=column_name, color='thistle') #column values go on the x-axis
    if column_name == 'room_type_reserved':
        plt.xticks(rotation=90) #only this column gets its x-axis labels rotated 90 degrees
    plt.title(column_name)
    plt.show(); #display histogram


#booking_status is now numerical, so hotels1_numerical is rebuilt from hotels1 to include it
hotels1_numerical = hotels1.select_dtypes(include=np.number)
#checking for any changes in the correlation between numerical variables in hotels1
#heatmap in shades of blue on a fixed scale from -1 to 1/each cell labelled with its value to 2 decimal places
#annotation font size is 9 to avoid overcrowding
sns.heatmap(
    data=hotels1_numerical.corr(),
    vmin=-1,
    vmax=1,
    cmap='Blues',
    annot=True,
    fmt='.2f',
    annot_kws={'fontsize': 9},
)
plt.show(); #displaying heatmap


#casting X_train to float once, outside the loop, before taking its values
#this is the same cast the Logit fits use, and it stops integer/dummy columns from turning the array into an object array
vif_input = X_train.astype(float).values
#making a new dataframe called vif
vif = pd.DataFrame()
#creating a column called variables in vif dataframe to hold all the columns in X_train
vif['variables'] = X_train.columns
#creating column called values to hold the variance inflation factor value for each variable in X_train
vif['values'] = [variance_inflation_factor(vif_input, i) #calculate variance inflation factor value
                 for i in range(vif_input.shape[1])] #for each column position in X_train

#format set to 6 decimal spaces
pd.options.display.float_format = '{:.6f}'.format
vif #displaying vif dataframe


#dropping market_segment_type_Online from X_train
X_train = X_train.drop(['market_segment_type_Online'], axis=1)
#also dropping this column from X_test
X_test = X_test.drop(['market_segment_type_Online'], axis=1)
#checking the VIF for the X_train variables after dropping market_segment_type_Online
#casting X_train to float once, outside the loop, before taking its values
#this is the same cast the Logit fits use, and it stops integer/dummy columns from turning the array into an object array
vif_input = X_train.astype(float).values
#making a new dataframe called vif
vif = pd.DataFrame()
#creating a column called variables in vif dataframe to hold all the columns in X_train
vif['variables'] = X_train.columns
#creating column called values to hold the variance inflation factor value for each variable in X_train
vif['values'] = [variance_inflation_factor(vif_input, i) #calculate variance inflation factor value
                 for i in range(vif_input.shape[1])] #for each column position in X_train

#format set to 6 decimal spaces
pd.options.display.float_format = '{:.6f}'.format
vif #displaying vif dataframe


#fitting a statsmodels logistic regression of y_train on X_train (cast to float)
#disp=False because the optimization description, function value and iterations do not need to be shown
#NOTE(review): the summary captured for this fit reports converged: False
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#printing the summary of the fitted model
print(lg.summary())

                           Logit Regression Results                           
==============================================================================
Dep. Variable:         booking_status   No. Observations:                25392
Model:                          Logit   Df Residuals:                    25365
Method:                           MLE   Df Model:                           26
Date:                Fri, 15 Dec 2023   Pseudo R-squ.:                  0.3292
Time:                        20:35:35   Log-Likelihood:                -10794.
converged:                      False   LL-Null:                       -16091.
Covariance Type:            nonrobust   LLR p-value:                     0.000
========================================================================================================
                                           coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------------
const                                 -933.3324    120.655     -7.736      0.000   -1169.813    -696.852
no_of_adults                             0.1060      0.037      2.841      0.004       0.033       0.179
no_of_children                           0.1542      0.057      2.694      0.007       0.042       0.266
no_of_weekend_nights                     0.1075      0.020      5.439      0.000       0.069       0.146
no_of_week_nights                        0.0405      0.012      3.295      0.001       0.016       0.065
required_car_parking_space              -1.5907      0.138    -11.538      0.000      -1.861      -1.320
lead_time                                0.0157      0.000     58.933      0.000       0.015       0.016
arrival_year                             0.4611      0.060      7.711      0.000       0.344       0.578
arrival_month                           -0.0411      0.006     -6.358      0.000      -0.054      -0.028
arrival_date                             0.0005      0.002      0.257      0.797      -0.003       0.004
repeated_guest                          -2.3140      0.618     -3.743      0.000      -3.526      -1.102
no_of_previous_cancellations             0.2633      0.086      3.074      0.002       0.095       0.431
no_of_previous_bookings_not_canceled    -0.1728      0.152     -1.136      0.256      -0.471       0.125
avg_price_per_room                       0.0187      0.001     25.374      0.000       0.017       0.020
no_of_special_requests                  -1.4709      0.030    -48.891      0.000      -1.530      -1.412
type_of_meal_plan_Meal Plan 2            0.1794      0.067      2.694      0.007       0.049       0.310
type_of_meal_plan_Meal Plan 3           19.8256   1.36e+04      0.001      0.999   -2.67e+04    2.67e+04
type_of_meal_plan_Not Selected           0.2745      0.053      5.181      0.000       0.171       0.378
room_type_reserved_Room_Type 2          -0.3640      0.131     -2.784      0.005      -0.620      -0.108
room_type_reserved_Room_Type 3          -0.0018      1.310     -0.001      0.999      -2.569       2.566
room_type_reserved_Room_Type 4          -0.2763      0.053     -5.207      0.000      -0.380      -0.172
room_type_reserved_Room_Type 5          -0.7182      0.209     -3.436      0.001      -1.128      -0.308
room_type_reserved_Room_Type 6          -0.9408      0.147     -6.402      0.000      -1.229      -0.653
room_type_reserved_Room_Type 7          -1.3891      0.293     -4.743      0.000      -1.963      -0.815
market_segment_type_Complementary      -47.7454   7.09e+06  -6.74e-06      1.000   -1.39e+07    1.39e+07
market_segment_type_Corporate           -0.8033      0.103     -7.807      0.000      -1.005      -0.602
market_segment_type_Offline             -1.7995      0.052    -34.577      0.000      -1.902      -1.698
========================================================================================================


#listing the variables in the lg model whose p-value is greater than 0.05
lg.pvalues[lg.pvalues > 0.05]

arrival_date                           0.797064
no_of_previous_bookings_not_canceled   0.255990
type_of_meal_plan_Meal Plan 3          0.998838
room_type_reserved_Room_Type 3         0.998909
market_segment_type_Complementary      0.999995
dtype: float64


#removing market_segment_type_Complementary from the train and test predictors first,
#since it has the highest p-value above 0.05
X_train = X_train.drop(columns=['market_segment_type_Complementary'])
X_test = X_test.drop(columns=['market_segment_type_Complementary'])
#refitting the logistic regression on the reduced X_train
#disp=False because the optimization description, function value and iterations do not need to be shown
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#listing the variables whose p-value is still above 0.05
lg.pvalues.loc[lg.pvalues > 0.05]

arrival_date                           0.841373
no_of_previous_bookings_not_canceled   0.256268
type_of_meal_plan_Meal Plan 3          0.213351
room_type_reserved_Room_Type 3         0.935021
dtype: float64


#removing room_type_reserved_Room_Type 3 from the train and test predictors,
#since it has the next highest p-value above 0.05
X_train = X_train.drop(columns=['room_type_reserved_Room_Type 3'])
X_test = X_test.drop(columns=['room_type_reserved_Room_Type 3'])
#refitting the logistic regression on the reduced X_train
#disp=False because the optimization description, function value and iterations do not need to be shown
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#listing the variables whose p-value is still above 0.05
lg.pvalues.loc[lg.pvalues > 0.05]

arrival_date                           0.841240
no_of_previous_bookings_not_canceled   0.256263
type_of_meal_plan_Meal Plan 3          0.213344
dtype: float64


#removing arrival_date from the train and test predictors,
#since it has the next highest p-value above 0.05
X_train = X_train.drop(columns=['arrival_date'])
X_test = X_test.drop(columns=['arrival_date'])
#refitting the logistic regression on the reduced X_train
#disp=False because the optimization description, function value and iterations do not need to be shown
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#listing the variables whose p-value is still above 0.05
lg.pvalues.loc[lg.pvalues > 0.05]

no_of_previous_bookings_not_canceled   0.256582
type_of_meal_plan_Meal Plan 3          0.212731
dtype: float64


#removing type_of_meal_plan_Meal Plan 3 from the train and test predictors,
#since it has the next highest p-value above 0.05
X_train = X_train.drop(columns=['type_of_meal_plan_Meal Plan 3'])
X_test = X_test.drop(columns=['type_of_meal_plan_Meal Plan 3'])
#refitting the logistic regression on the reduced X_train
#disp=False because the optimization description, function value and iterations do not need to be shown
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#listing the variables whose p-value is still above 0.05
lg.pvalues.loc[lg.pvalues > 0.05]

no_of_previous_bookings_not_canceled   0.256594
dtype: float64


#removing no_of_previous_bookings_not_canceled from the train and test predictors,
#since it is the last remaining variable with a p-value above 0.05
X_train = X_train.drop(columns=['no_of_previous_bookings_not_canceled'])
X_test = X_test.drop(columns=['no_of_previous_bookings_not_canceled'])
#refitting the logistic regression on the reduced X_train
#disp=False because the optimization description, function value and iterations do not need to be shown
logit = sm.Logit(endog=y_train, exog=X_train.astype(float))
lg = logit.fit(disp=False)
#listing the variables whose p-value is still above 0.05
lg.pvalues.loc[lg.pvalues > 0.05]

Series([], dtype: float64)


#printing the summary of the logistic regression model refitted after the last variable was dropped
print(lg.summary())

                           Logit Regression Results                           
==============================================================================
Dep. Variable:         booking_status   No. Observations:                25392
Model:                          Logit   Df Residuals:                    25370
Method:                           MLE   Df Model:                           21
Date:                Fri, 15 Dec 2023   Pseudo R-squ.:                  0.3283
Time:                        20:35:36   Log-Likelihood:                -10809.
converged:                       True   LL-Null:                       -16091.
Covariance Type:            nonrobust   LLR p-value:                     0.000
==================================================================================================
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                           -917.2860    120.456     -7.615      0.000   -1153.376    -681.196
no_of_adults                       0.1086      0.037      2.914      0.004       0.036       0.182
no_of_children                     0.1522      0.057      2.660      0.008       0.040       0.264
no_of_weekend_nights               0.1086      0.020      5.501      0.000       0.070       0.147
no_of_week_nights                  0.0418      0.012      3.403      0.001       0.018       0.066
required_car_parking_space        -1.5943      0.138    -11.561      0.000      -1.865      -1.324
lead_time                          0.0157      0.000     59.218      0.000       0.015       0.016
arrival_year                       0.4531      0.060      7.591      0.000       0.336       0.570
arrival_month                     -0.0424      0.006     -6.568      0.000      -0.055      -0.030
repeated_guest                    -2.7365      0.557     -4.915      0.000      -3.828      -1.645
no_of_previous_cancellations       0.2289      0.077      2.983      0.003       0.078       0.379
avg_price_per_room                 0.0192      0.001     26.343      0.000       0.018       0.021
no_of_special_requests            -1.4699      0.030    -48.892      0.000      -1.529      -1.411
type_of_meal_plan_Meal Plan 2      0.1654      0.067      2.487      0.013       0.035       0.296
type_of_meal_plan_Not Selected     0.2858      0.053      5.405      0.000       0.182       0.389
room_type_reserved_Room_Type 2    -0.3560      0.131     -2.725      0.006      -0.612      -0.100
room_type_reserved_Room_Type 4    -0.2826      0.053     -5.330      0.000      -0.387      -0.179
room_type_reserved_Room_Type 5    -0.7352      0.208     -3.529      0.000      -1.143      -0.327
room_type_reserved_Room_Type 6    -0.9650      0.147     -6.572      0.000      -1.253      -0.677
room_type_reserved_Room_Type 7    -1.4312      0.293     -4.892      0.000      -2.005      -0.858
market_segment_type_Corporate     -0.7928      0.103     -7.711      0.000      -0.994      -0.591
market_segment_type_Offline       -1.7867      0.052    -34.391      0.000      -1.889      -1.685
==================================================================================================


#flagging the training rows whose predicted probability is larger than 0.5
#X_train is cast to float for prediction, the same way it was cast for fitting
X_train_pre = lg.predict(X_train.astype(float)) > 0.5
#converting the True/False flags into the class labels 1 and 0
X_train_pred = X_train_pre.astype(int)

#making the confusion matrix for training group using y_train and predictions from X_train
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#making heatmap of confusion matrix, displaying annotation labels as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, fmt='g', annot=True)
plt.title('Training Confusion Matrix') #setting title of heatmap
plt.xlabel('Predicted Values') #setting title of x-axis
plt.ylabel('Actual Values') #setting title of y-axis
plt.show(); #displaying heatmap

#also creating predictions for testing set for performance metric calculations
X_test_pre = lg.predict(X_test.astype(float)) > 0.5
#converting the True/False flags into the class labels 1 and 0
X_test_pred = X_test_pre.astype(int)

#making the confusion matrix for testing group using y_test and predictions from X_test
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#making heatmap of confusion matrix, displaying annotation labels as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, fmt='g', annot=True)
plt.title('Testing Confusion Matrix') #setting title of heatmap
plt.xlabel('Predicted Values') #setting title of x-axis
plt.ylabel('Actual Values') #setting title of y-axis
plt.show(); #displaying heatmap


#building the performance_metrics dataframe in one step: one row per group, one column per metric
#in every metric column the first value is for the training group and the second for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping a copy of the dataframe in pm for comparison at the end
pm = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#predicted probabilities for the training set, computed once and reused below
train_probabilities = lg.predict(X_train)
#area under the ROC curve for y_train against the predicted probabilities
logit_roc_auc = roc_auc_score(y_train, train_probabilities)
#false positive rate, true positive rate and thresholds for the same predictions
fpr, tpr, thresholds = roc_curve(y_train, train_probabilities)
#plotting tpr against fpr/the legend label shows the area rounded to 2 decimal places
plt.plot(fpr, tpr, label='Logistic Regression (area = %0.2f)' % logit_roc_auc)
plt.plot([0, 1], [0, 1], 'r--') #red dashed diagonal where area under line would be 0.50
plt.xlabel('False Positive Rate') #setting title of x-axis
plt.ylabel('True Positive Rate') #setting title of y-axis
plt.xlim(0.0, 1.0) #setting limits for x-axis range
plt.ylim(0.0, 1.05) #setting limits for y-axis range
plt.title('Receiver Operating Characteristic Curve') #setting title of figure
plt.legend(loc='lower right') #legend showing area under curve/placed in lower right corner
plt.show(); #displaying figure


#finding the optimal threshold on the ROC curve
#optimal_cut_off is the position where the gap between tpr and fpr (tpr - fpr) is largest
optimal_cut_off = (tpr - fpr).argmax()
#reading the threshold stored at that position
roc_optimal_threshold = thresholds[optimal_cut_off]
#displaying optimal threshold determined from roc curve
roc_optimal_threshold

0.371046662348869


#flagging the X_train rows whose lg prediction is greater than the optimal ROC threshold
X_train_pre = lg.predict(X_train) > roc_optimal_threshold
#casting the boolean mask to integer labels of 1 and 0
#(np.round on a boolean mask does not produce integers, so the cast is done explicitly)
X_train_pred = X_train_pre.astype(int)

#making the confusion matrix for training group using y_train and predictions from X_train
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, annot=True, fmt='g').set(
    title='Training Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap

#same thresholding for the testing set, needed for the performance metric calculations
X_test_pre = lg.predict(X_test) > roc_optimal_threshold
#casting the boolean mask to integer labels of 1 and 0
X_test_pred = X_test_pre.astype(int)

#making the confusion matrix for testing group using y_test and predictions from X_test
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, annot=True, fmt='g').set(
    title='Testing Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap


#calculating the performance metrics again, now for the current X_train_pred and X_test_pred
#in every list the first value is for the training group and the second is for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping an independent copy in pm1 for the comparison at the end
pm1 = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#precision, recall and threshold values from y_train and the lg predictions on X_train
prec, rec, tre = precision_recall_curve(y_train, lg.predict(X_train))
#prec and rec hold one more value than tre, so their last entry is left out of the plot
#precision is the blue dashed line, recall is the green dashed line
plt.plot(tre, prec[:-1], 'b--', label='precision')
plt.plot(tre, rec[:-1], 'g--', label='recall')
plt.legend(loc='lower left') #adding a legend in the lower left corner of figure
#x-axis title and the 0 to 1 range of the y-axis are set together on the current axes
plt.gca().set(xlabel='Threshold', ylim=[0,1])
plt.show(); #displaying figure


#threshold of 0.42 used for both groups, named once so that training and testing cannot drift apart
pr_threshold = 0.42

#flagging the X_train rows whose lg prediction is greater than the threshold of 0.42
X_train_pre = lg.predict(X_train) > pr_threshold
#casting the boolean mask to integer labels of 1 and 0
#(np.round on a boolean mask does not produce integers, so the cast is done explicitly)
X_train_pred = X_train_pre.astype(int)

#making the confusion matrix for training group using y_train and predictions from X_train
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, annot=True, fmt='g').set(
    title='Training Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap

#same thresholding for the testing set, needed for the performance metric calculations
X_test_pre = lg.predict(X_test) > pr_threshold
#casting the boolean mask to integer labels of 1 and 0
X_test_pred = X_test_pre.astype(int)

#making the confusion matrix for testing group using y_test and predictions from X_test
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, annot=True, fmt='g').set(
    title='Testing Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap


#calculating the performance metrics again, now for the current X_train_pred and X_test_pred
#in every list the first value is for the training group and the second is for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping an independent copy in pm2 for the comparison at the end
pm2 = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#comparing the logistic regression metrics saved for each threshold
#performance metrics for 0.5 threshold
pm


#performance metrics for the threshold derived from the roc curve (about 0.37)
pm1


#performance metrics for 0.42 threshold from the precision recall curve
pm2


#printing the summary of the final logistic regression model (lg)
print(lg.summary())

                           Logit Regression Results                           
==============================================================================
Dep. Variable:         booking_status   No. Observations:                25392
Model:                          Logit   Df Residuals:                    25370
Method:                           MLE   Df Model:                           21
Date:                Fri, 15 Dec 2023   Pseudo R-squ.:                  0.3283
Time:                        20:35:39   Log-Likelihood:                -10809.
converged:                       True   LL-Null:                       -16091.
Covariance Type:            nonrobust   LLR p-value:                     0.000
==================================================================================================
                                     coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const                           -917.2860    120.456     -7.615      0.000   -1153.376    -681.196
no_of_adults                       0.1086      0.037      2.914      0.004       0.036       0.182
no_of_children                     0.1522      0.057      2.660      0.008       0.040       0.264
no_of_weekend_nights               0.1086      0.020      5.501      0.000       0.070       0.147
no_of_week_nights                  0.0418      0.012      3.403      0.001       0.018       0.066
required_car_parking_space        -1.5943      0.138    -11.561      0.000      -1.865      -1.324
lead_time                          0.0157      0.000     59.218      0.000       0.015       0.016
arrival_year                       0.4531      0.060      7.591      0.000       0.336       0.570
arrival_month                     -0.0424      0.006     -6.568      0.000      -0.055      -0.030
repeated_guest                    -2.7365      0.557     -4.915      0.000      -3.828      -1.645
no_of_previous_cancellations       0.2289      0.077      2.983      0.003       0.078       0.379
avg_price_per_room                 0.0192      0.001     26.343      0.000       0.018       0.021
no_of_special_requests            -1.4699      0.030    -48.892      0.000      -1.529      -1.411
type_of_meal_plan_Meal Plan 2      0.1654      0.067      2.487      0.013       0.035       0.296
type_of_meal_plan_Not Selected     0.2858      0.053      5.405      0.000       0.182       0.389
room_type_reserved_Room_Type 2    -0.3560      0.131     -2.725      0.006      -0.612      -0.100
room_type_reserved_Room_Type 4    -0.2826      0.053     -5.330      0.000      -0.387      -0.179
room_type_reserved_Room_Type 5    -0.7352      0.208     -3.529      0.000      -1.143      -0.327
room_type_reserved_Room_Type 6    -0.9650      0.147     -6.572      0.000      -1.253      -0.677
room_type_reserved_Room_Type 7    -1.4312      0.293     -4.892      0.000      -2.005      -0.858
market_segment_type_Corporate     -0.7928      0.103     -7.711      0.000      -0.994      -0.591
market_segment_type_Offline       -1.7867      0.052    -34.391      0.000      -1.889      -1.685
==================================================================================================


#final performance metrics for the logistic regression model
#these are the metrics saved in pm1, i.e. the ones for the 0.37 threshold
pm1


#looking at the lg coefficients as odds to get another perspective
#odds_data starts with a column called odds: the exponential of every parameter in the lg model
odds_data = pd.DataFrame({'odds': np.exp(lg.params)})
#percent_change expresses the same odds as a change in percent: (odds - 1) * 100
odds_data['percent_change'] = (odds_data['odds'] - 1) * 100
#displaying odds_data
odds_data


#decision tree classifier that splits on the gini criterion/random_state fixed at 1 for repeatable results
dt = DecisionTreeClassifier(random_state=1, criterion='gini')
#fitting the tree on the training data
dt.fit(X=X_train, y=y_train)

DecisionTreeClassifier(random_state=1)

DecisionTreeClassifier(random_state=1)


#confusion matrices for the decision tree (dt) predictions on the training set and the testing set
#predicting the training rows with the dt model
X_train_pred = dt.predict(X_train)

#confusion matrix of y_train against the training predictions
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, annot=True, fmt='g').set(
    title='Training Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap

#predicting the testing rows as well, needed for the performance metric calculations
X_test_pred = dt.predict(X_test)

#confusion matrix of y_test against the testing predictions
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, annot=True, fmt='g').set(
    title='Testing Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap


#collecting the performance metrics for the model in a dataframe called performance_metrics
#in every list the first value is for the training group and the second is for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping an independent copy in pmd to help with model comparison
pmd = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#feature importance of every variable in X_train according to the dt model
#features_imp is indexed by the X_train columns and has one column called importance
features_imp = pd.DataFrame({'importance': dt.feature_importances_}, index=X_train.columns)
#showing the importances from the most to the least important feature
features_imp['importance'].sort_values(ascending=False)

lead_time                        0.394584
avg_price_per_room               0.199281
arrival_month                    0.078105
no_of_special_requests           0.068036
market_segment_type_Offline      0.059178
no_of_week_nights                0.058022
no_of_weekend_nights             0.045708
no_of_adults                     0.035081
arrival_year                     0.014460
type_of_meal_plan_Not Selected   0.009358
market_segment_type_Corporate    0.007848
room_type_reserved_Room_Type 4   0.007525
required_car_parking_space       0.006583
type_of_meal_plan_Meal Plan 2    0.005962
no_of_children                   0.005305
room_type_reserved_Room_Type 2   0.002136
room_type_reserved_Room_Type 5   0.001225
repeated_guest                   0.000842
room_type_reserved_Room_Type 6   0.000670
no_of_previous_cancellations     0.000091
room_type_reserved_Room_Type 7   0.000000
const                            0.000000
Name: importance, dtype: float64


#using grid search to find the right combination of hyperparameters
#selecting classifier
estimator = DecisionTreeClassifier(random_state=1)

#candidate values: depths 1 through 9 combined with both split criteria
parameters = {'max_depth':np.arange(1,10),
              'criterion': ['gini', 'entropy']}

#using recall score to compare parameter combinations
score = make_scorer(recall_score)

#Grid Search on X_train and y_train/cross-validation is set to 5
search_grid = GridSearchCV(estimator, parameters, scoring=score, cv=5)
search_grid = search_grid.fit(X_train, y_train)

#classifier is equal to the best parameter selection
#with the default refit=True, GridSearchCV has already refit best_estimator_ on all of
#X_train and y_train, so no additional fit call is needed here
estimator = search_grid.best_estimator_

#displaying the selected (already fitted) classifier
estimator

DecisionTreeClassifier(max_depth=9, random_state=1)

DecisionTreeClassifier(max_depth=9, random_state=1)


#confusion matrices for the grid search model (estimator) on the training set and the testing set
#predicting the training rows with the estimator model
X_train_pred = estimator.predict(X_train)

#confusion matrix of y_train against the training predictions
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, annot=True, fmt='g').set(
    title='Training Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap

#predicting the testing rows as well, needed for the performance metric calculations
X_test_pred = estimator.predict(X_test)

#confusion matrix of y_test against the testing predictions
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, annot=True, fmt='g').set(
    title='Testing Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap


#collecting the performance metrics for the model in a dataframe called performance_metrics
#in every list the first value is for the training group and the second is for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping an independent copy in pmd1 to help with model comparison
pmd1 = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#feature importance of every variable in X_train according to the estimator model
#features_imp is indexed by the X_train columns and has one column called importance
features_imp = pd.DataFrame({'importance': estimator.feature_importances_}, index=X_train.columns)
#showing the importances from the most to the least important feature
features_imp['importance'].sort_values(ascending=False)

lead_time                        0.454228
avg_price_per_room               0.158186
no_of_special_requests           0.111881
market_segment_type_Offline      0.096996
arrival_month                    0.047749
no_of_adults                     0.038120
no_of_weekend_nights             0.023899
no_of_week_nights                0.020005
arrival_year                     0.015694
required_car_parking_space       0.010204
market_segment_type_Corporate    0.009913
type_of_meal_plan_Meal Plan 2    0.006678
type_of_meal_plan_Not Selected   0.004248
room_type_reserved_Room_Type 4   0.001133
room_type_reserved_Room_Type 2   0.000659
no_of_children                   0.000407
no_of_previous_cancellations     0.000000
room_type_reserved_Room_Type 5   0.000000
room_type_reserved_Room_Type 6   0.000000
room_type_reserved_Room_Type 7   0.000000
repeated_guest                   0.000000
const                            0.000000
Name: importance, dtype: float64


#performing cost complexity pruning
#selecting classifier and computing its pruning path on the training data
clf = DecisionTreeClassifier(random_state=1)
path = clf.cost_complexity_pruning_path(X_train, y_train)
#absolute value is taken of the alphas
#NOTE(review): presumably this guards against slightly negative alphas - confirm
ccp_alphas = np.abs(path.ccp_alphas)
impurities = path.impurities

#viewing ccp_alphas and impurities as dataframe
pd.DataFrame(path)


#plotting impurities against ccp_alphas, leaving out the last value of each
plt.figure(figsize=(20,5)) #figure size is set to (20,5)
#markers are small circles and the line is drawn as steps
plt.plot(ccp_alphas[:-1], impurities[:-1], marker='o', drawstyle='steps-post')
#figure title and axis titles are set together on the current axes
plt.gca().set(
    title='Training: Total Impurity of Leaves vs. Effective Alpha',
    xlabel='Effective Alpha',
    ylabel='Total Impurity of Leaves',
)
plt.show(); #displaying figure


#training one decision tree per alpha value in ccp_alphas and collecting them in clf1
clf1 = []

for ccp_alpha in ccp_alphas:
    #fit returns the fitted classifier itself, so it can be stored right away
    clf = DecisionTreeClassifier(ccp_alpha=ccp_alpha, random_state=1).fit(X_train, y_train)
    clf1.append(clf)

#reporting the node count of the last tree and the alpha it was trained with
print('Last Tree Node Number:', clf1[-1].tree_.node_count)
print('Last ccp_alpha:', ccp_alphas[-1])

Last Tree Node Number: 1
Last ccp_alpha: 0.07657789477371368


#dropping the very last tree in clf1 and its alpha in ccp_alphas
clf1 = clf1[:-1]
ccp_alphas = ccp_alphas[:-1]

#number of nodes vs. alpha values
nodes_number = [fitted_tree.tree_.node_count for fitted_tree in clf1] #node count of every remaining tree
plt.figure(figsize=(20,5)) #adjusting size of figure
plt.plot(ccp_alphas, nodes_number, marker='o', drawstyle='steps-post')
#figure title and axis titles are set together on the current axes
plt.gca().set(title='Number of Nodes vs. Alpha', xlabel='Alpha', ylabel='Number of Nodes')
plt.show(); #displaying figure

#depth vs. alpha values
depth_values = [fitted_tree.tree_.max_depth for fitted_tree in clf1] #depth of every remaining tree
plt.figure(figsize=(20,5)) #adjusting size of figure
plt.plot(ccp_alphas, depth_values, marker='o', drawstyle='steps-post')
#figure title and axis titles are set together on the current axes
plt.gca().set(title='Depth vs. Alpha', xlabel='Alpha', ylabel='Tree Depth')
plt.show(); #displaying figure


#gathering the training and testing recall of every tree in clf1 in a single pass
training_recall = [] #recall scores on the training set
testing_recall = [] #recall scores on the testing set
for clf in clf1:
    #recall of this tree on the training set
    predictions = clf.predict(X_train)
    train_recall = recall_score(y_train, predictions)
    training_recall.append(train_recall)
    #recall of the same tree on the testing set
    predictions1 = clf.predict(X_test)
    test_recall = recall_score(y_test, predictions1)
    testing_recall.append(test_recall)

#plotting recall vs. alpha values for both the training and testing sets
plt.figure(figsize=(15,5)) #setting size of figure
plt.plot(ccp_alphas, training_recall, marker='o', label='Training', drawstyle='steps-post') #line for training recall
plt.plot(ccp_alphas, testing_recall, marker='o', label='Testing', drawstyle='steps-post') #line for testing recall
plt.legend(loc='upper right') #setting legend in upper right corner
#figure title and axis titles are set together on the current axes
plt.gca().set(title='Recall vs. Alpha', xlabel='Alpha', ylabel='Recall')
plt.show(); #displaying figure


#deriving the optimal model from the highest recall value on the testing set
#NOTE(review): the model is picked using testing recall only, so the testing metrics reported
#for it afterwards are optimistic - consider selecting on a validation split instead
highest_recall = np.asarray(testing_recall).argmax() #position of the highest testing recall (first one on ties)
model = clf1[highest_recall] #the tree trained at that position
print(model) #printing model

DecisionTreeClassifier(ccp_alpha=1.8176530463865078e-05, random_state=1)


#confusion matrices for the selected model on the training set and the testing set
#predicting the training rows with model
X_train_pred = model.predict(X_train)

#confusion matrix of y_train against the training predictions
confusion_mat_train = confusion_matrix(y_train, X_train_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_train, annot=True, fmt='g').set(
    title='Training Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap

#predicting the testing rows as well, needed for the performance metric calculations
X_test_pred = model.predict(X_test)

#confusion matrix of y_test against the testing predictions
confusion_mat_test = confusion_matrix(y_test, X_test_pred)
#annotated heatmap; fmt='g' shows the counts as full numbers instead of scientific notation
sns.heatmap(confusion_mat_test, annot=True, fmt='g').set(
    title='Testing Confusion Matrix', xlabel='Predicted Values', ylabel='Actual Values')
plt.show(); #displaying heatmap


#collecting the performance metrics for the model in a dataframe called performance_metrics
#in every list the first value is for the training group and the second is for the testing group
performance_metrics = pd.DataFrame({
    'Group': ['Training', 'Testing'],
    'Accuracy': [accuracy_score(y_train, X_train_pred), accuracy_score(y_test, X_test_pred)],
    'Recall': [recall_score(y_train, X_train_pred), recall_score(y_test, X_test_pred)],
    'Precision': [precision_score(y_train, X_train_pred), precision_score(y_test, X_test_pred)],
    'F1': [f1_score(y_train, X_train_pred), f1_score(y_test, X_test_pred)],
})
#keeping an independent copy in pmd2 to help with model comparison
pmd2 = performance_metrics.copy()
#displaying performance_metrics
performance_metrics


#feature importance of every variable in X_train according to model
#features_imp is indexed by the X_train columns and has one column called importance
features_imp = pd.DataFrame({'importance': model.feature_importances_}, index=X_train.columns)
#showing the importances from the most to the least important feature
features_imp['importance'].sort_values(ascending=False)

lead_time                        0.394861
avg_price_per_room               0.199127
arrival_month                    0.078010
no_of_special_requests           0.068228
market_segment_type_Offline      0.059344
no_of_week_nights                0.057774
no_of_weekend_nights             0.045663
no_of_adults                     0.035116
arrival_year                     0.014326
type_of_meal_plan_Not Selected   0.009325
market_segment_type_Corporate    0.007871
room_type_reserved_Room_Type 4   0.007486
required_car_parking_space       0.006602
type_of_meal_plan_Meal Plan 2    0.005966
no_of_children                   0.005321
room_type_reserved_Room_Type 2   0.002143
room_type_reserved_Room_Type 5   0.001228
repeated_guest                   0.000845
room_type_reserved_Room_Type 6   0.000672
no_of_previous_cancellations     0.000091
room_type_reserved_Room_Type 7   0.000000
const                            0.000000
Name: importance, dtype: float64


#comparing the metrics saved for each of the three decision tree models
#performance metrics for the first model (dt) - without pruning
pmd


#performance metrics for the pre-pruning model (grid search estimator)
pmd1


#performance metrics for the post-pruning model (cost complexity pruning)
pmd2


#feature importance of every variable in X_train according to the estimator model
#features_imp is indexed by the X_train columns and has one column called importance
features_imp = pd.DataFrame({'importance': estimator.feature_importances_}, index=X_train.columns)
#showing the importances from the most to the least important feature
features_imp['importance'].sort_values(ascending=False)

lead_time                        0.454228
avg_price_per_room               0.158186
no_of_special_requests           0.111881
market_segment_type_Offline      0.096996
arrival_month                    0.047749
no_of_adults                     0.038120
no_of_weekend_nights             0.023899
no_of_week_nights                0.020005
arrival_year                     0.015694
required_car_parking_space       0.010204
market_segment_type_Corporate    0.009913
type_of_meal_plan_Meal Plan 2    0.006678
type_of_meal_plan_Not Selected   0.004248
room_type_reserved_Room_Type 4   0.001133
room_type_reserved_Room_Type 2   0.000659
no_of_children                   0.000407
no_of_previous_cancellations     0.000000
room_type_reserved_Room_Type 5   0.000000
room_type_reserved_Room_Type 6   0.000000
room_type_reserved_Room_Type 7   0.000000
repeated_guest                   0.000000
const                            0.000000
Name: importance, dtype: float64


#assigning features to the list of columns in X_train
features = list(X_train.columns)
plt.figure(figsize=(30,20)) #figure size is set to (30,20)
#plotting the decision tree for estimator model/fontsize is set to 12/node_ids and class_names are shown
tree.plot_tree(estimator, feature_names = features, fontsize=12, filled=True, node_ids=True, class_names=True)
plt.show() #displaying figure


#printing the same tree as indented text rules
#show_weights=True adds the per-class weights to every leaf line
print(
    tree.export_text(
        estimator,
        show_weights=True,
        feature_names=features,
    )
)

|--- lead_time <= 151.50
|   |--- no_of_special_requests <= 0.50
|   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |--- lead_time <= 20.50
|   |   |   |   |--- avg_price_per_room <= 105.95
|   |   |   |   |   |--- avg_price_per_room <= 78.90
|   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 74.59
|   |   |   |   |   |   |   |   |--- lead_time <= 16.50
|   |   |   |   |   |   |   |   |   |--- weights: [536.00, 14.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  16.50
|   |   |   |   |   |   |   |   |   |--- weights: [21.00, 4.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  74.59
|   |   |   |   |   |   |   |   |--- no_of_children <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [103.00, 12.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_children >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |--- lead_time <= 1.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 62.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 10.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  62.50
|   |   |   |   |   |   |   |   |   |--- weights: [10.00, 1.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  1.50
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [92.00, 8.00] class: 0
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |--- avg_price_per_room >  78.90
|   |   |   |   |   |   |--- lead_time <= 2.50
|   |   |   |   |   |   |   |--- arrival_month <= 5.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [36.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [106.00, 21.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  5.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 10.50
|   |   |   |   |   |   |   |   |   |--- weights: [116.00, 3.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  10.50
|   |   |   |   |   |   |   |   |   |--- weights: [50.00, 4.00] class: 0
|   |   |   |   |   |   |--- lead_time >  2.50
|   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [323.00, 88.00] class: 0
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [67.00, 60.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |--- weights: [87.00, 0.00] class: 0
|   |   |   |   |--- avg_price_per_room >  105.95
|   |   |   |   |   |--- lead_time <= 3.50
|   |   |   |   |   |   |--- avg_price_per_room <= 202.67
|   |   |   |   |   |   |   |--- no_of_week_nights <= 4.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |   |   |   |--- weights: [179.00, 32.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |   |   |   |--- weights: [143.00, 8.00] class: 0
|   |   |   |   |   |   |   |--- no_of_week_nights >  4.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 9.00
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 8.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  9.00
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |--- avg_price_per_room >  202.67
|   |   |   |   |   |   |   |--- arrival_month <= 11.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 16.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  11.00
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |--- lead_time >  3.50
|   |   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |   |--- market_segment_type_Corporate <= 0.50
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [91.00, 163.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [17.00, 77.00] class: 1
|   |   |   |   |   |   |   |--- market_segment_type_Corporate >  0.50
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights <= 5.00
|   |   |   |   |   |   |   |   |   |--- weights: [25.00, 2.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights >  5.00
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |--- lead_time <= 9.50
|   |   |   |   |   |   |   |   |   |--- weights: [23.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  9.50
|   |   |   |   |   |   |   |   |   |--- weights: [28.00, 2.00] class: 0
|   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |   |--- weights: [78.00, 65.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |   |--- weights: [28.00, 0.00] class: 0
|   |   |   |--- lead_time >  20.50
|   |   |   |   |--- avg_price_per_room <= 86.38
|   |   |   |   |   |--- market_segment_type_Corporate <= 0.50
|   |   |   |   |   |   |--- avg_price_per_room <= 60.07
|   |   |   |   |   |   |   |--- lead_time <= 84.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 23.17
|   |   |   |   |   |   |   |   |   |--- weights: [35.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  23.17
|   |   |   |   |   |   |   |   |   |--- weights: [36.00, 4.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  84.50
|   |   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [9.00, 10.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [14.00, 1.00] class: 0
|   |   |   |   |   |   |--- avg_price_per_room >  60.07
|   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |--- lead_time <= 62.50
|   |   |   |   |   |   |   |   |   |--- weights: [48.00, 3.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  62.50
|   |   |   |   |   |   |   |   |   |--- weights: [19.00, 36.00] class: 1
|   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 5.50
|   |   |   |   |   |   |   |   |   |--- weights: [201.00, 239.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  5.50
|   |   |   |   |   |   |   |   |   |--- weights: [48.00, 107.00] class: 1
|   |   |   |   |   |--- market_segment_type_Corporate >  0.50
|   |   |   |   |   |   |--- no_of_week_nights <= 3.50
|   |   |   |   |   |   |   |--- lead_time <= 21.50
|   |   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 3.00] class: 1
|   |   |   |   |   |   |   |--- lead_time >  21.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 63.00
|   |   |   |   |   |   |   |   |   |--- weights: [15.00, 5.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  63.00
|   |   |   |   |   |   |   |   |   |--- weights: [165.00, 13.00] class: 0
|   |   |   |   |   |   |--- no_of_week_nights >  3.50
|   |   |   |   |   |   |   |--- arrival_month <= 9.00
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  9.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |--- avg_price_per_room >  86.38
|   |   |   |   |   |--- required_car_parking_space <= 0.50
|   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 <= 0.50
|   |   |   |   |   |   |   |   |--- lead_time <= 86.50
|   |   |   |   |   |   |   |   |   |--- weights: [86.00, 14.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  86.50
|   |   |   |   |   |   |   |   |   |--- weights: [6.00, 7.00] class: 1
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 >  0.50
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 3.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 44.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  3.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 130.12
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [448.00, 786.00] class: 1
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [84.00, 312.00] class: 1
|   |   |   |   |   |   |   |--- avg_price_per_room >  130.12
|   |   |   |   |   |   |   |   |--- arrival_month <= 10.50
|   |   |   |   |   |   |   |   |   |--- weights: [182.00, 716.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  10.50
|   |   |   |   |   |   |   |   |   |--- weights: [25.00, 27.00] class: 1
|   |   |   |   |   |--- required_car_parking_space >  0.50
|   |   |   |   |   |   |--- no_of_weekend_nights <= 3.00
|   |   |   |   |   |   |   |--- weights: [54.00, 0.00] class: 0
|   |   |   |   |   |   |--- no_of_weekend_nights >  3.00
|   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |--- market_segment_type_Offline >  0.50
|   |   |   |--- lead_time <= 90.50
|   |   |   |   |--- no_of_weekend_nights <= 0.50
|   |   |   |   |   |--- avg_price_per_room <= 197.00
|   |   |   |   |   |   |--- weights: [1609.00, 0.00] class: 0
|   |   |   |   |   |--- avg_price_per_room >  197.00
|   |   |   |   |   |   |--- arrival_month <= 10.50
|   |   |   |   |   |   |   |--- weights: [0.00, 16.00] class: 1
|   |   |   |   |   |   |--- arrival_month >  10.50
|   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |--- no_of_weekend_nights >  0.50
|   |   |   |   |   |--- lead_time <= 68.50
|   |   |   |   |   |   |--- lead_time <= 1.50
|   |   |   |   |   |   |   |--- arrival_month <= 2.50
|   |   |   |   |   |   |   |   |--- no_of_adults <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 33.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_adults >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  2.50
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [26.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [9.00, 2.00] class: 0
|   |   |   |   |   |   |--- lead_time >  1.50
|   |   |   |   |   |   |   |--- arrival_month <= 9.50
|   |   |   |   |   |   |   |   |--- lead_time <= 59.50
|   |   |   |   |   |   |   |   |   |--- weights: [443.00, 40.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  59.50
|   |   |   |   |   |   |   |   |   |--- weights: [38.00, 15.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  9.50
|   |   |   |   |   |   |   |   |--- lead_time <= 65.50
|   |   |   |   |   |   |   |   |   |--- weights: [384.00, 7.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  65.50
|   |   |   |   |   |   |   |   |   |--- weights: [16.00, 3.00] class: 0
|   |   |   |   |   |--- lead_time >  68.50
|   |   |   |   |   |   |--- avg_price_per_room <= 99.98
|   |   |   |   |   |   |   |--- arrival_month <= 3.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 62.50
|   |   |   |   |   |   |   |   |   |--- weights: [21.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  62.50
|   |   |   |   |   |   |   |   |   |--- weights: [16.00, 17.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  3.50
|   |   |   |   |   |   |   |   |--- lead_time <= 71.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 2.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  71.50
|   |   |   |   |   |   |   |   |   |--- weights: [97.00, 6.00] class: 0
|   |   |   |   |   |   |--- avg_price_per_room >  99.98
|   |   |   |   |   |   |   |--- no_of_adults <= 1.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 123.25
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 52.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  123.25
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_adults >  1.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 105.20
|   |   |   |   |   |   |   |   |   |--- weights: [4.00, 23.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  105.20
|   |   |   |   |   |   |   |   |   |--- weights: [29.00, 6.00] class: 0
|   |   |   |--- lead_time >  90.50
|   |   |   |   |--- no_of_week_nights <= 2.50
|   |   |   |   |   |--- lead_time <= 116.50
|   |   |   |   |   |   |--- avg_price_per_room <= 75.07
|   |   |   |   |   |   |   |--- avg_price_per_room <= 58.75
|   |   |   |   |   |   |   |   |--- weights: [7.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  58.75
|   |   |   |   |   |   |   |   |--- arrival_month <= 4.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 78.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  4.50
|   |   |   |   |   |   |   |   |   |--- weights: [15.00, 24.00] class: 1
|   |   |   |   |   |   |--- avg_price_per_room >  75.07
|   |   |   |   |   |   |   |--- avg_price_per_room <= 93.58
|   |   |   |   |   |   |   |   |--- arrival_month <= 3.00
|   |   |   |   |   |   |   |   |   |--- weights: [31.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  3.00
|   |   |   |   |   |   |   |   |   |--- weights: [58.00, 27.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  93.58
|   |   |   |   |   |   |   |   |--- arrival_month <= 4.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 34.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  4.50
|   |   |   |   |   |   |   |   |   |--- weights: [85.00, 115.00] class: 1
|   |   |   |   |   |--- lead_time >  116.50
|   |   |   |   |   |   |--- no_of_adults <= 1.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 122.00
|   |   |   |   |   |   |   |   |--- weights: [85.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  122.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |--- no_of_adults >  1.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 81.60
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 64.38
|   |   |   |   |   |   |   |   |   |--- weights: [5.00, 4.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  64.38
|   |   |   |   |   |   |   |   |   |--- weights: [57.00, 2.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  81.60
|   |   |   |   |   |   |   |   |--- arrival_month <= 3.50
|   |   |   |   |   |   |   |   |   |--- weights: [23.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  3.50
|   |   |   |   |   |   |   |   |   |--- weights: [85.00, 76.00] class: 0
|   |   |   |   |--- no_of_week_nights >  2.50
|   |   |   |   |   |--- avg_price_per_room <= 114.92
|   |   |   |   |   |   |--- no_of_week_nights <= 8.50
|   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |--- weights: [88.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 10.50
|   |   |   |   |   |   |   |   |   |--- weights: [223.00, 24.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  10.50
|   |   |   |   |   |   |   |   |   |--- weights: [5.00, 4.00] class: 0
|   |   |   |   |   |   |--- no_of_week_nights >  8.50
|   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |--- avg_price_per_room >  114.92
|   |   |   |   |   |   |--- lead_time <= 116.00
|   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |--- lead_time >  116.00
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 <= 0.50
|   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 >  0.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |--- no_of_special_requests >  0.50
|   |   |--- no_of_special_requests <= 1.50
|   |   |   |--- lead_time <= 8.50
|   |   |   |   |--- no_of_week_nights <= 10.00
|   |   |   |   |   |--- lead_time <= 4.50
|   |   |   |   |   |   |--- avg_price_per_room <= 219.86
|   |   |   |   |   |   |   |--- avg_price_per_room <= 157.64
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 78.71
|   |   |   |   |   |   |   |   |   |--- weights: [230.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  78.71
|   |   |   |   |   |   |   |   |   |--- weights: [583.00, 17.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  157.64
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 158.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  158.50
|   |   |   |   |   |   |   |   |   |--- weights: [84.00, 6.00] class: 0
|   |   |   |   |   |   |--- avg_price_per_room >  219.86
|   |   |   |   |   |   |   |--- arrival_month <= 6.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  6.00
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 237.25
|   |   |   |   |   |   |   |   |   |--- weights: [5.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  237.25
|   |   |   |   |   |   |   |   |   |--- weights: [4.00, 2.00] class: 0
|   |   |   |   |   |--- lead_time >  4.50
|   |   |   |   |   |   |--- room_type_reserved_Room_Type 2 <= 0.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 123.60
|   |   |   |   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |   |   |   |--- weights: [173.00, 15.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |   |   |   |--- weights: [130.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  123.60
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [112.00, 16.00] class: 0
|   |   |   |   |   |   |   |   |--- type_of_meal_plan_Not Selected >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [20.00, 9.00] class: 0
|   |   |   |   |   |   |--- room_type_reserved_Room_Type 2 >  0.50
|   |   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |--- no_of_week_nights >  10.00
|   |   |   |   |   |--- arrival_month <= 1.50
|   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |--- arrival_month >  1.50
|   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |--- lead_time >  8.50
|   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |--- required_car_parking_space <= 0.50
|   |   |   |   |   |   |--- avg_price_per_room <= 118.55
|   |   |   |   |   |   |   |--- no_of_weekend_nights <= 2.50
|   |   |   |   |   |   |   |   |--- lead_time <= 61.50
|   |   |   |   |   |   |   |   |   |--- weights: [1371.00, 213.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  61.50
|   |   |   |   |   |   |   |   |   |--- weights: [986.00, 297.00] class: 0
|   |   |   |   |   |   |   |--- no_of_weekend_nights >  2.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [6.00, 20.00] class: 1
|   |   |   |   |   |   |--- avg_price_per_room >  118.55
|   |   |   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |   |   |--- no_of_adults <= 2.50
|   |   |   |   |   |   |   |   |   |--- weights: [532.00, 141.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_adults >  2.50
|   |   |   |   |   |   |   |   |   |--- weights: [99.00, 54.00] class: 0
|   |   |   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [66.00, 7.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [396.00, 232.00] class: 0
|   |   |   |   |   |--- required_car_parking_space >  0.50
|   |   |   |   |   |   |--- no_of_weekend_nights <= 3.00
|   |   |   |   |   |   |   |--- weights: [193.00, 0.00] class: 0
|   |   |   |   |   |   |--- no_of_weekend_nights >  3.00
|   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |--- lead_time <= 102.50
|   |   |   |   |   |   |--- no_of_children <= 0.50
|   |   |   |   |   |   |   |--- weights: [484.00, 0.00] class: 0
|   |   |   |   |   |   |--- no_of_children >  0.50
|   |   |   |   |   |   |   |--- lead_time <= 91.50
|   |   |   |   |   |   |   |   |--- weights: [40.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  91.50
|   |   |   |   |   |   |   |   |--- arrival_month <= 5.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  5.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |--- lead_time >  102.50
|   |   |   |   |   |   |--- no_of_week_nights <= 2.50
|   |   |   |   |   |   |   |--- lead_time <= 105.00
|   |   |   |   |   |   |   |   |--- arrival_month <= 4.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  4.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 4.00] class: 1
|   |   |   |   |   |   |   |--- lead_time >  105.00
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 83.39
|   |   |   |   |   |   |   |   |   |--- weights: [10.00, 6.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  83.39
|   |   |   |   |   |   |   |   |   |--- weights: [31.00, 3.00] class: 0
|   |   |   |   |   |   |--- no_of_week_nights >  2.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 187.92
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 122.00
|   |   |   |   |   |   |   |   |   |--- weights: [57.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  122.00
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 1.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  187.92
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |--- no_of_special_requests >  1.50
|   |   |   |--- lead_time <= 90.50
|   |   |   |   |--- no_of_week_nights <= 3.50
|   |   |   |   |   |--- weights: [2126.00, 0.00] class: 0
|   |   |   |   |--- no_of_week_nights >  3.50
|   |   |   |   |   |--- no_of_week_nights <= 9.50
|   |   |   |   |   |   |--- no_of_special_requests <= 2.50
|   |   |   |   |   |   |   |--- lead_time <= 6.50
|   |   |   |   |   |   |   |   |--- weights: [43.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  6.50
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [139.00, 32.00] class: 0
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [60.00, 4.00] class: 0
|   |   |   |   |   |   |--- no_of_special_requests >  2.50
|   |   |   |   |   |   |   |--- weights: [70.00, 0.00] class: 0
|   |   |   |   |   |--- no_of_week_nights >  9.50
|   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- lead_time >  90.50
|   |   |   |   |--- avg_price_per_room <= 202.95
|   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |--- arrival_month <= 7.50
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 3.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  7.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 78.20
|   |   |   |   |   |   |   |   |   |--- weights: [11.00, 1.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  78.20
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 1.00] class: 0
|   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |--- lead_time <= 150.50
|   |   |   |   |   |   |   |   |--- no_of_children <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [227.00, 11.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_children >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [45.00, 8.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  150.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |--- no_of_special_requests <= 2.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 90.42
|   |   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |   |--- weights: [15.00, 23.00] class: 1
|   |   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |   |--- weights: [21.00, 5.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  90.42
|   |   |   |   |   |   |   |   |--- no_of_adults <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_adults >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [96.00, 42.00] class: 0
|   |   |   |   |   |   |--- no_of_special_requests >  2.50
|   |   |   |   |   |   |   |--- weights: [52.00, 0.00] class: 0
|   |   |   |   |--- avg_price_per_room >  202.95
|   |   |   |   |   |--- weights: [0.00, 7.00] class: 1
|--- lead_time >  151.50
|   |--- avg_price_per_room <= 100.04
|   |   |--- no_of_special_requests <= 0.50
|   |   |   |--- no_of_adults <= 1.50
|   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |--- avg_price_per_room <= 34.84
|   |   |   |   |   |   |--- no_of_weekend_nights <= 0.50
|   |   |   |   |   |   |   |--- lead_time <= 280.00
|   |   |   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  280.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |--- no_of_weekend_nights >  0.50
|   |   |   |   |   |   |   |--- weights: [11.00, 0.00] class: 0
|   |   |   |   |   |--- avg_price_per_room >  34.84
|   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |--- weights: [0.00, 57.00] class: 1
|   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 84.30
|   |   |   |   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |   |   |   |   |--- avg_price_per_room >  84.30
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |--- lead_time <= 163.50
|   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |--- lead_time <= 160.50
|   |   |   |   |   |   |   |   |--- weights: [4.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  160.50
|   |   |   |   |   |   |   |   |--- weights: [1.00, 1.00] class: 0
|   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |--- weights: [0.00, 15.00] class: 1
|   |   |   |   |   |--- lead_time >  163.50
|   |   |   |   |   |   |--- lead_time <= 341.00
|   |   |   |   |   |   |   |--- lead_time <= 173.00
|   |   |   |   |   |   |   |   |--- lead_time <= 165.50
|   |   |   |   |   |   |   |   |   |--- weights: [62.00, 6.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  165.50
|   |   |   |   |   |   |   |   |   |--- weights: [4.00, 9.00] class: 1
|   |   |   |   |   |   |   |--- lead_time >  173.00
|   |   |   |   |   |   |   |   |--- arrival_month <= 5.50
|   |   |   |   |   |   |   |   |   |--- weights: [9.00, 3.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_month >  5.50
|   |   |   |   |   |   |   |   |   |--- weights: [252.00, 5.00] class: 0
|   |   |   |   |   |   |--- lead_time >  341.00
|   |   |   |   |   |   |   |--- no_of_week_nights <= 4.00
|   |   |   |   |   |   |   |   |--- lead_time <= 402.00
|   |   |   |   |   |   |   |   |   |--- weights: [16.00, 7.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  402.00
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 3.00] class: 1
|   |   |   |   |   |   |   |--- no_of_week_nights >  4.00
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 88.33
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 7.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  88.33
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 1.00] class: 0
|   |   |   |--- no_of_adults >  1.50
|   |   |   |   |--- avg_price_per_room <= 82.47
|   |   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |--- weights: [0.00, 130.00] class: 1
|   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |--- no_of_weekend_nights <= 0.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 76.87
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 6.00] class: 1
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  76.87
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_weekend_nights >  0.50
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 5.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 45.00] class: 1
|   |   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |   |--- lead_time <= 244.00
|   |   |   |   |   |   |   |--- no_of_week_nights <= 1.50
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights <= 1.50
|   |   |   |   |   |   |   |   |   |--- weights: [6.00, 38.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_weekend_nights >  1.50
|   |   |   |   |   |   |   |   |   |--- weights: [24.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_week_nights >  1.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 66.50
|   |   |   |   |   |   |   |   |   |--- weights: [19.00, 9.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  66.50
|   |   |   |   |   |   |   |   |   |--- weights: [123.00, 9.00] class: 0
|   |   |   |   |   |   |--- lead_time >  244.00
|   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |--- arrival_year <= 2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [34.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- arrival_year >  2017.50
|   |   |   |   |   |   |   |   |   |--- weights: [45.00, 198.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |--- weights: [36.00, 0.00] class: 0
|   |   |   |   |--- avg_price_per_room >  82.47
|   |   |   |   |   |--- no_of_adults <= 2.50
|   |   |   |   |   |   |--- lead_time <= 324.50
|   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [10.00, 650.00] class: 1
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [6.00, 7.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 13.00] class: 1
|   |   |   |   |   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [7.00, 0.00] class: 0
|   |   |   |   |   |   |--- lead_time >  324.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 89.00
|   |   |   |   |   |   |   |   |--- weights: [8.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  89.00
|   |   |   |   |   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 4.00] class: 1
|   |   |   |   |   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 5.00] class: 1
|   |   |   |   |   |--- no_of_adults >  2.50
|   |   |   |   |   |   |--- weights: [7.00, 0.00] class: 0
|   |   |--- no_of_special_requests >  0.50
|   |   |   |--- no_of_weekend_nights <= 0.50
|   |   |   |   |--- lead_time <= 180.50
|   |   |   |   |   |--- lead_time <= 159.50
|   |   |   |   |   |   |--- arrival_month <= 8.50
|   |   |   |   |   |   |   |--- weights: [8.00, 0.00] class: 0
|   |   |   |   |   |   |--- arrival_month >  8.50
|   |   |   |   |   |   |   |--- no_of_week_nights <= 2.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 4.00] class: 1
|   |   |   |   |   |   |   |--- no_of_week_nights >  2.50
|   |   |   |   |   |   |   |   |--- lead_time <= 156.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |   |--- lead_time >  156.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |--- lead_time >  159.50
|   |   |   |   |   |   |--- no_of_adults <= 0.50
|   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- no_of_adults >  0.50
|   |   |   |   |   |   |   |--- lead_time <= 178.50
|   |   |   |   |   |   |   |   |--- weights: [44.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  178.50
|   |   |   |   |   |   |   |   |--- avg_price_per_room <= 93.62
|   |   |   |   |   |   |   |   |   |--- weights: [5.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- avg_price_per_room >  93.62
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 2.00] class: 1
|   |   |   |   |--- lead_time >  180.50
|   |   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |   |--- no_of_special_requests <= 2.50
|   |   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 125.00] class: 1
|   |   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |   |--- lead_time <= 300.50
|   |   |   |   |   |   |   |   |   |--- weights: [8.00, 6.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  300.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 5.00] class: 1
|   |   |   |   |   |   |--- no_of_special_requests >  2.50
|   |   |   |   |   |   |   |--- weights: [12.00, 0.00] class: 0
|   |   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |   |--- no_of_adults <= 2.50
|   |   |   |   |   |   |   |--- lead_time <= 356.00
|   |   |   |   |   |   |   |   |--- lead_time <= 302.50
|   |   |   |   |   |   |   |   |   |--- weights: [15.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  302.50
|   |   |   |   |   |   |   |   |   |--- weights: [2.00, 1.00] class: 0
|   |   |   |   |   |   |   |--- lead_time >  356.00
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- no_of_adults >  2.50
|   |   |   |   |   |   |   |--- weights: [0.00, 2.00] class: 1
|   |   |   |--- no_of_weekend_nights >  0.50
|   |   |   |   |--- market_segment_type_Offline <= 0.50
|   |   |   |   |   |--- no_of_week_nights <= 9.50
|   |   |   |   |   |   |--- arrival_month <= 11.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 76.48
|   |   |   |   |   |   |   |   |--- lead_time <= 245.50
|   |   |   |   |   |   |   |   |   |--- weights: [46.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- lead_time >  245.50
|   |   |   |   |   |   |   |   |   |--- weights: [17.00, 2.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  76.48
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 6.50
|   |   |   |   |   |   |   |   |   |--- weights: [241.00, 61.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  6.50
|   |   |   |   |   |   |   |   |   |--- weights: [5.00, 5.00] class: 0
|   |   |   |   |   |   |--- arrival_month >  11.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 55.92
|   |   |   |   |   |   |   |   |--- weights: [5.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  55.92
|   |   |   |   |   |   |   |   |--- no_of_special_requests <= 2.50
|   |   |   |   |   |   |   |   |   |--- weights: [18.00, 21.00] class: 1
|   |   |   |   |   |   |   |   |--- no_of_special_requests >  2.50
|   |   |   |   |   |   |   |   |   |--- weights: [3.00, 0.00] class: 0
|   |   |   |   |   |--- no_of_week_nights >  9.50
|   |   |   |   |   |   |--- room_type_reserved_Room_Type 2 <= 0.50
|   |   |   |   |   |   |   |--- weights: [0.00, 7.00] class: 1
|   |   |   |   |   |   |--- room_type_reserved_Room_Type 2 >  0.50
|   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |--- market_segment_type_Offline >  0.50
|   |   |   |   |   |--- lead_time <= 348.50
|   |   |   |   |   |   |--- no_of_week_nights <= 5.50
|   |   |   |   |   |   |   |--- no_of_special_requests <= 1.50
|   |   |   |   |   |   |   |   |--- weights: [129.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_special_requests >  1.50
|   |   |   |   |   |   |   |   |--- no_of_week_nights <= 3.50
|   |   |   |   |   |   |   |   |   |--- weights: [9.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- no_of_week_nights >  3.50
|   |   |   |   |   |   |   |   |   |--- weights: [4.00, 1.00] class: 0
|   |   |   |   |   |   |--- no_of_week_nights >  5.50
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 <= 0.50
|   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |   |--- type_of_meal_plan_Meal Plan 2 >  0.50
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |--- lead_time >  348.50
|   |   |   |   |   |   |--- lead_time <= 372.50
|   |   |   |   |   |   |   |--- avg_price_per_room <= 58.50
|   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- avg_price_per_room >  58.50
|   |   |   |   |   |   |   |   |--- weights: [6.00, 2.00] class: 0
|   |   |   |   |   |   |--- lead_time >  372.50
|   |   |   |   |   |   |   |--- weights: [1.00, 1.00] class: 0
|   |--- avg_price_per_room >  100.04
|   |   |--- arrival_month <= 11.50
|   |   |   |--- no_of_special_requests <= 2.50
|   |   |   |   |--- weights: [0.00, 2108.00] class: 1
|   |   |   |--- no_of_special_requests >  2.50
|   |   |   |   |--- weights: [31.00, 0.00] class: 0
|   |   |--- arrival_month >  11.50
|   |   |   |--- no_of_special_requests <= 0.50
|   |   |   |   |--- weights: [47.00, 0.00] class: 0
|   |   |   |--- no_of_special_requests >  0.50
|   |   |   |   |--- lead_time <= 289.50
|   |   |   |   |   |--- no_of_special_requests <= 1.50
|   |   |   |   |   |   |--- avg_price_per_room <= 114.59
|   |   |   |   |   |   |   |--- weights: [2.00, 0.00] class: 0
|   |   |   |   |   |   |--- avg_price_per_room >  114.59
|   |   |   |   |   |   |   |--- weights: [0.00, 6.00] class: 1
|   |   |   |   |   |--- no_of_special_requests >  1.50
|   |   |   |   |   |   |--- required_car_parking_space <= 0.50
|   |   |   |   |   |   |   |--- no_of_week_nights <= 4.50
|   |   |   |   |   |   |   |   |--- weights: [7.00, 0.00] class: 0
|   |   |   |   |   |   |   |--- no_of_week_nights >  4.50
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 <= 0.50
|   |   |   |   |   |   |   |   |   |--- weights: [1.00, 0.00] class: 0
|   |   |   |   |   |   |   |   |--- room_type_reserved_Room_Type 4 >  0.50
|   |   |   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |   |   |--- required_car_parking_space >  0.50
|   |   |   |   |   |   |   |--- weights: [0.00, 1.00] class: 1
|   |   |   |   |--- lead_time >  289.50
|   |   |   |   |   |--- weights: [0.00, 7.00] class: 1


# Display the performance-metrics DataFrame `pmd1` (defined outside this part of the file).
# NOTE(review): a bare expression is only rendered when it is the last statement of a
# notebook cell; run as a plain script this line has no visible effect.
pmd1 #viewing dataframe with performance metrics


# NOTE(review): `pm1` is a different name from `pmd1` -- presumably a separate
# performance-metrics table for another model; confirm against where it is defined.
pm1 #viewing dataframe with performance metrics


# NOTE(review): this displays `pmd1` a second time -- presumably repeated for
# side-by-side comparison with `pm1`; confirm it is not an accidental duplicate cell.
pmd1 #viewing dataframe with performance metrics

	Booking_ID	no_of_adults	no_of_children	no_of_weekend_nights	no_of_week_nights	type_of_meal_plan	required_car_parking_space	room_type_reserved	lead_time	arrival_year	arrival_month	arrival_date	market_segment_type	repeated_guest	no_of_previous_cancellations	no_of_previous_bookings_not_canceled	avg_price_per_room	no_of_special_requests	booking_status
count	36275	36275.000000	36275.000000	36275.000000	36275.000000	36275	36275.000000	36275	36275.000000	36275.000000	36275.000000	36275.000000	36275	36275.000000	36275.000000	36275.000000	36275.000000	36275.000000	36275
unique	36275	NaN	NaN	NaN	NaN	4	NaN	7	NaN	NaN	NaN	NaN	5	NaN	NaN	NaN	NaN	NaN	2
top	INN00001	NaN	NaN	NaN	NaN	Meal Plan 1	NaN	Room_Type 1	NaN	NaN	NaN	NaN	Online	NaN	NaN	NaN	NaN	NaN	Not_Canceled
freq	1	NaN	NaN	NaN	NaN	27835	NaN	28130	NaN	NaN	NaN	NaN	23214	NaN	NaN	NaN	NaN	NaN	24390
mean	NaN	1.844962	0.105279	0.810724	2.204300	NaN	0.030986	NaN	85.232557	2017.820427	7.423653	15.596995	NaN	0.025637	0.023349	0.153411	103.423539	0.619655	NaN
std	NaN	0.518715	0.402648	0.870644	1.410905	NaN	0.173281	NaN	85.930817	0.383836	3.069894	8.740447	NaN	0.158053	0.368331	1.754171	35.089424	0.786236	NaN
min	NaN	0.000000	0.000000	0.000000	0.000000	NaN	0.000000	NaN	0.000000	2017.000000	1.000000	1.000000	NaN	0.000000	0.000000	0.000000	0.000000	0.000000	NaN
25%	NaN	2.000000	0.000000	0.000000	1.000000	NaN	0.000000	NaN	17.000000	2018.000000	5.000000	8.000000	NaN	0.000000	0.000000	0.000000	80.300000	0.000000	NaN
50%	NaN	2.000000	0.000000	1.000000	2.000000	NaN	0.000000	NaN	57.000000	2018.000000	8.000000	16.000000	NaN	0.000000	0.000000	0.000000	99.450000	0.000000	NaN
75%	NaN	2.000000	0.000000	2.000000	3.000000	NaN	0.000000	NaN	126.000000	2018.000000	10.000000	23.000000	NaN	0.000000	0.000000	0.000000	120.000000	1.000000	NaN
max	NaN	4.000000	10.000000	7.000000	17.000000	NaN	1.000000	NaN	443.000000	2018.000000	12.000000	31.000000	NaN	1.000000	13.000000	58.000000	540.000000	5.000000	NaN

	Group	Accuracy	Recall	Precision	F1
0	Training	0.805411	0.632548	0.739033	0.681657
1	Testing	0.804649	0.630892	0.729003	0.676408

	Group	Accuracy	Recall	Precision	F1
0	Training	0.792888	0.735621	0.668696	0.700564
1	Testing	0.796012	0.739353	0.666667	0.701131

	Group	Accuracy	Recall	Precision	F1
0	Training	0.801276	0.699390	0.697888	0.698638
1	Testing	0.803639	0.703861	0.693815	0.698802

	Group	Accuracy	Recall	Precision	F1
0	Training	0.805411	0.632548	0.739033	0.681657
1	Testing	0.804649	0.630892	0.729003	0.676408

	Booking_ID	no_of_adults	no_of_weekend_nights	no_of_week_nights	type_of_meal_plan	room_type_reserved	lead_time	arrival_year	arrival_month	arrival_date	market_segment_type	avg_price_per_room	no_of_special_requests	booking_status
36270	INN36271	3	2	6	Meal Plan 1	Room_Type 4	85	2018	8	3	Online	167.80	1	Not_Canceled
36271	INN36272	2	1	3	Meal Plan 1	Room_Type 1	228	2018	10	17	Online	90.95	2	Canceled
36272	INN36273	2	2	6	Meal Plan 1	Room_Type 1	148	2018	7	1	Online	98.39	2	Not_Canceled
36273	INN36274	2	0	3	Not Selected	Room_Type 1	63	2018	4	21	Online	94.50	0	Canceled
36274	INN36275	2	1	2	Meal Plan 1	Room_Type 1	207	2018	12	30	Offline	161.67	0	Not_Canceled

	no_of_adults	no_of_children	no_of_weekend_nights	no_of_week_nights	type_of_meal_plan	required_car_parking_space	room_type_reserved	lead_time	arrival_year	arrival_month	arrival_date	market_segment_type	repeated_guest	no_of_previous_cancellations	no_of_previous_bookings_not_canceled	avg_price_per_room	no_of_special_requests
0	2	0	1	2	Meal Plan 1	0	Room_Type 1	224	2017	10	2	Offline	0	0	0	65.00	0
1	2	0	2	3	Not Selected	0	Room_Type 1	5	2018	11	6	Online	0	0	0	106.68	1
2	1	0	2	1	Meal Plan 1	0	Room_Type 1	1	2018	2	28	Online	0	0	0	60.00	0
3	2	0	0	2	Meal Plan 1	0	Room_Type 1	211	2018	5	20	Online	0	0	0	100.00	0
4	2	0	1	1	Not Selected	0	Room_Type 1	48	2018	4	11	Online	0	0	0	94.50	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
36270	3	0	2	6	Meal Plan 1	0	Room_Type 4	85	2018	8	3	Online	0	0	0	167.80	1
36271	2	0	1	3	Meal Plan 1	0	Room_Type 1	228	2018	10	17	Online	0	0	0	90.95	2
36272	2	0	2	6	Meal Plan 1	0	Room_Type 1	148	2018	7	1	Online	0	0	0	98.39	2
36273	2	0	0	3	Not Selected	0	Room_Type 1	63	2018	4	21	Online	0	0	0	94.50	0
36274	2	0	1	2	Meal Plan 1	0	Room_Type 1	207	2018	12	30	Offline	0	0	0	161.67	0

	variables	values
0	const	39468156.706004
1	no_of_adults	1.348154
2	no_of_children	1.978229
3	no_of_weekend_nights	1.069475
4	no_of_week_nights	1.095667
5	required_car_parking_space	1.039928
6	lead_time	1.394914
7	arrival_year	1.430830
8	arrival_month	1.275673
9	arrival_date	1.006738
10	repeated_guest	1.783516
11	no_of_previous_cancellations	1.395689
12	no_of_previous_bookings_not_canceled	1.651986
13	avg_price_per_room	2.050421
14	no_of_special_requests	1.247278
15	type_of_meal_plan_Meal Plan 2	1.271851
16	type_of_meal_plan_Meal Plan 3	1.025216
17	type_of_meal_plan_Not Selected	1.272183
18	room_type_reserved_Room_Type 2	1.101438
19	room_type_reserved_Room_Type 3	1.003302
20	room_type_reserved_Room_Type 4	1.361515
21	room_type_reserved_Room_Type 5	1.027810
22	room_type_reserved_Room_Type 6	1.973072
23	room_type_reserved_Room_Type 7	1.115123
24	market_segment_type_Complementary	4.500109
25	market_segment_type_Corporate	16.928435
26	market_segment_type_Offline	64.113924
27	market_segment_type_Online	71.176430

	variables	values
0	const	39391371.314593
1	no_of_adults	1.331784
2	no_of_children	1.977350
3	no_of_weekend_nights	1.069039
4	no_of_week_nights	1.095118
5	required_car_parking_space	1.039795
6	lead_time	1.390637
7	arrival_year	1.428376
8	arrival_month	1.274625
9	arrival_date	1.006721
10	repeated_guest	1.780188
11	no_of_previous_cancellations	1.395447
12	no_of_previous_bookings_not_canceled	1.651745
13	avg_price_per_room	2.049595
14	no_of_special_requests	1.242418
15	type_of_meal_plan_Meal Plan 2	1.271497
16	type_of_meal_plan_Meal Plan 3	1.025216
17	type_of_meal_plan_Not Selected	1.270387
18	room_type_reserved_Room_Type 2	1.101271
19	room_type_reserved_Room_Type 3	1.003301
20	room_type_reserved_Room_Type 4	1.356004
21	room_type_reserved_Room_Type 5	1.027810
22	room_type_reserved_Room_Type 6	1.972732
23	room_type_reserved_Room_Type 7	1.115003
24	market_segment_type_Complementary	1.338253
25	market_segment_type_Corporate	1.527769
26	market_segment_type_Offline	1.597418

	odds	percent_change
const	0.000000	-100.000000
no_of_adults	1.114754	11.475363
no_of_children	1.164360	16.436009
no_of_weekend_nights	1.114753	11.475256
no_of_week_nights	1.042636	4.263629
required_car_parking_space	0.203048	-79.695231
lead_time	1.015835	1.583521
arrival_year	1.573235	57.323511
arrival_month	0.958528	-4.147245
repeated_guest	0.064797	-93.520258
no_of_previous_cancellations	1.257157	25.715665
avg_price_per_room	1.019348	1.934790
no_of_special_requests	0.229941	-77.005947
type_of_meal_plan_Meal Plan 2	1.179916	17.991562
type_of_meal_plan_Not Selected	1.330892	33.089244
room_type_reserved_Room_Type 2	0.700461	-29.953888
room_type_reserved_Room_Type 4	0.753830	-24.617006
room_type_reserved_Room_Type 5	0.479403	-52.059666
room_type_reserved_Room_Type 6	0.380991	-61.900934
room_type_reserved_Room_Type 7	0.239033	-76.096691
market_segment_type_Corporate	0.452584	-54.741616
market_segment_type_Offline	0.167504	-83.249628

	Group	Accuracy	Recall	Precision	F1
0	Training	0.993541	0.984694	0.995647	0.990141
1	Testing	0.867132	0.801817	0.790594	0.796166

	Group	Accuracy	Recall	Precision	F1
0	Training	0.873976	0.781777	0.826235	0.803391
1	Testing	0.866397	0.769733	0.808289	0.788540

	ccp_alphas	impurities
0	0.000000	0.008309
1	0.000000	0.008309
2	0.000000	0.008309
3	0.000000	0.008310
4	0.000001	0.008310
...	...	...
1434	0.004543	0.280273
1435	0.006585	0.293443
1436	0.017260	0.310703
1437	0.018160	0.365183
1438	0.076578	0.441761

	Group	Accuracy	Recall	Precision	F1
0	Training	0.993384	0.986249	0.993615	0.989918
1	Testing	0.866765	0.803237	0.788901	0.796005

INN Hotels Project

Context

Objective

Data Description

Importing necessary libraries and data

Data Overview

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Exploratory Data Analysis (EDA)

Univariate Analysis

Observations:

Bivariate Analysis

Observations:

Observations:

Questions

Observations:

Observations:

Observations:

Observations:

Observations:

Observations:

Data Preprocessing

Missing Value Treatment

Observations:

Feature Engineering

Observations:

Outlier Detection and Treatment

Observations:

Preparing Data for Modeling

Observations:

Observations:

Exploratory Data Analysis After Data Preprocessing

Observations:

Observations:

Checking Multicollinearity

Observations:

Observations:

Building a Logistic Regression Model

Observations:

Dropping P-values > 0.05

Reviewing Summary of Model

Observations:

Model Performance Evaluation

At a 0.5 (Default) Threshold

Observations:

Using ROC-AUC to Find Optimal Threshold

Observations:

Using Precision-Recall Curve to Find Optimal Threshold

Observations:

Observations:

Comparing Performance Metrics from the Three Thresholds

Observations:

Choosing the Final Model

Final Model Summary

Performance Metrics at Final Threshold of 0.37

Calculating Odds

Observations:

Building a Decision Tree Model

Viewing Model Performance

Observations:

Observations:

Do we need to prune the tree?

Pre-Pruning

Observations:

Observations:

Post-Pruning

Observations:

Observations:

Choosing the Final Decision Tree

Final Model: Pre-Pruning Decision Tree

Feature Importance

Visual Depiction

Text Depiction

Final Performance Metrics

Observations:

Logistic Regression and Decision Tree Analysis