import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools import add_constant


# Load the raw observations; the file is expected next to the notebook.
raw = pd.read_csv(filepath_or_buffer="healing_hugs.csv")
# Peek at the first rows to sanity-check the load.
raw.head(n=5)


# Show where values are missing before any cleaning is applied.
nan_ax_before = sns.heatmap(raw.isna())
nan_ax_before.set_title("Dataframe NaN's Before Cleaning")

Text(0.5, 1.0, "Dataframe NaN's Before Cleaning")


# Work on a copy so the untouched data stays available in `raw`.
hh = raw.copy()
# "Work_shift" is categorical, so its gaps get a label rather than a number...
hh = hh.fillna(value={"Work_shift": "none"})
# ...and every remaining missing value becomes 0.
hh = hh.fillna(value=0)


# List the distinct values of every column to spot inconsistent labels.
for column_name in hh.columns:
    print(column_name, hh[column_name].unique(), sep="")

Day_of_week['Sun' 'Mon' 'Tue' 'Wed' 'Thu' 'Fri' 'Sat']
Workout[1. 0.]
Work_shift['on call' 'late' 'early' 'off' 'none' 'mid' 'early ']
Healing_hug[0. 1. 2.]
Month[ 8  9 10 11 12  1  2  3  4  5  6  7]
Year[2021 2022 2023 2024]
Pto[0 1]


# Normalise the Work_shift labels.
# The result is assigned back to the column instead of calling
# `replace(..., inplace=True)` on `hh.Work_shift`: an in-place call on a
# selected column is chained assignment, which warns on recent pandas and
# leaves `hh` unchanged once Copy-on-Write is active.
# We don't need the extra spaces in 'early ' (strip first so a padded
# "off " would also be recognised below).
work_shift = hh["Work_shift"].str.strip()
# Work_shift "off" is same as "none", so refine those values
hh["Work_shift"] = work_shift.replace(to_replace="off", value="none")


# Class balance of the response variable.
hh["Healing_hug"].value_counts()

0.0    600
1.0    114
2.0      1
Name: Healing_hug, dtype: int64


# Collapse the value 2 into 1 so Healing_hug is a binary response.
# Assign the result back rather than using `inplace=True` on the selected
# column: that chained in-place call warns on recent pandas and does not
# modify `hh` when Copy-on-Write is enabled.
hh["Healing_hug"] = hh["Healing_hug"].replace(to_replace=2, value=1)


# Re-draw the missing-value map and re-list the distinct values to confirm
# the cleaning steps took effect.
nan_ax_after = sns.heatmap(hh.isna())
nan_ax_after.set_title("Dataframe NaN's After Cleaning")
for column_name in hh.columns:
    print(column_name, hh[column_name].unique(), sep="")

Day_of_week['Sun' 'Mon' 'Tue' 'Wed' 'Thu' 'Fri' 'Sat']
Workout[1. 0.]
Work_shift['on call' 'late' 'early' 'none' 'mid']
Healing_hug[0. 1.]
Month[ 8  9 10 11 12  1  2  3  4  5  6  7]
Year[2021 2022 2023 2024]
Pto[0 1]


# One bar chart per variable, stacked vertically and sharing the
# day-of-week axis.
day_of_week = hh["Day_of_week"]
plotted_columns = ["Workout", "Pto", "Healing_hug"]

fig, axes = plt.subplots(nrows=len(plotted_columns), sharex=True, figsize=(5, 9))
fig.suptitle("Variables versus Day of Week")
fig.subplots_adjust(top=0.95)  # leave room for the figure title
for column_name, ax in zip(plotted_columns, axes):
    sns.barplot(
        x=day_of_week,
        y=hh[column_name],
        ax=ax,
        palette="gist_stern",
    )


# One bar chart per variable, stacked vertically and sharing the
# work-shift axis.
work_shift = hh["Work_shift"]
plotted_columns = ["Workout", "Healing_hug"]

fig, axes = plt.subplots(nrows=len(plotted_columns), sharex=True, figsize=(5, 6))
fig.suptitle("Variables versus Work Shift")
fig.subplots_adjust(top=0.95)  # leave room for the figure title

for column_name, ax in zip(plotted_columns, axes):
    sns.barplot(
        x=work_shift,
        y=hh[column_name],
        ax=ax,
        palette="gist_stern",
    )


# Single bar chart: Healing_hug against the Workout flag.
workout = hh["Workout"]

fig, ax = plt.subplots(nrows=1, sharex=True, figsize=(5, 3))
fig.suptitle("Healing Hug versus Workout")

# The freshly created axes is the current one, so passing it explicitly
# draws on the same target as relying on the implicit current axes.
sns.barplot(
    x=workout,
    y=hh["Healing_hug"],
    ax=ax,
    palette="gist_stern",
)


# One-hot encode both categorical columns. Every level is kept for now:
# hold off on dropping a column until we see multicollinearities.
days = pd.get_dummies(hh["Day_of_week"])
ws = pd.get_dummies(hh["Work_shift"])

# Modelling frame: the remaining original columns followed by the
# indicator columns, with the two source categoricals removed.
hh_log = pd.concat(
    [hh.drop(columns=["Day_of_week", "Work_shift"]), days, ws],
    axis=1,
)


# Pairwise correlations across all encoded columns.
corr_ax = sns.heatmap(hh_log.corr())
corr_ax.set_title("Heatmap of Correlations")

Text(0.5, 1.0, 'Heatmap of Correlations')


# Drop one indicator per encoded categorical ("Sun" for the day, "none" for
# the shift) so the remaining indicators are not perfectly collinear.
hh_log = hh_log.drop(columns=["Sun", "none"])


# Correlations again, now without the dropped reference indicators.
reduced_corr_ax = sns.heatmap(hh_log.corr())
reduced_corr_ax.set_title("Reduced Heatmap of Correlations")

Text(0.5, 1.0, 'Reduced Heatmap of Correlations')


def vif(df, target):
    """Compute the variance inflation factor of every predictor in ``df``.

    Args:
        df: DataFrame holding the predictors and the response column.
            Predictor columns must be numeric or boolean.
        target: Name of the response column; it is left out of the
            calculation.

    Returns:
        A pandas Series of VIF values indexed by column name, covering the
        intercept column added by ``add_constant`` and every predictor.
    """
    predictors = df.loc[:, df.columns != target]
    # Cast to float so a mix of boolean indicator columns (the get_dummies
    # default on newer pandas) and numeric columns yields a numeric matrix
    # instead of an object array, which the regression cannot process.
    X = add_constant(predictors.astype(float))
    design = X.values
    return pd.Series(
        [variance_inflation_factor(design, i) for i in range(design.shape[1])],
        index=X.columns,
    )


# Response column, excluded from the multicollinearity check.
target = "Healing_hug"
vif(df=hh_log, target=target)

const      9.597573e+06
Workout    1.303679e+00
Month      1.151237e+00
Year       1.264833e+00
Pto        1.895963e+00
Fri        4.040642e+00
Mon        3.776184e+00
Sat        2.329812e+00
Thu        3.991869e+00
Tue        4.080095e+00
Wed        4.098180e+00
early      3.910134e+00
late       4.151093e+00
mid        3.101553e+00
on call    1.389997e+00
dtype: float64


# Separate the response from the predictors.
y = hh_log["Healing_hug"]
X = hh_log.drop(columns=["Healing_hug"])

# Hold out 20% of the rows for evaluation; the seed keeps the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=235,
)


# Balanced class weights offset the scarcity of positive Healing_hug rows.
log_model = LogisticRegression(class_weight="balanced")
log_model.fit(X=X_train, y=y_train)

LogisticRegression(class_weight='balanced')

LogisticRegression(class_weight='balanced')


# Score the held-out rows and print per-class precision/recall/F1.
y_preds = log_model.predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_preds)
print(report)

              precision    recall  f1-score   support

         0.0       0.86      0.62      0.72       117
         1.0       0.24      0.54      0.33        26

    accuracy                           0.60       143
   macro avg       0.55      0.58      0.52       143
weighted avg       0.74      0.60      0.65       143


# Plot the fitted coefficients, one horizontal bar per predictor.
feat_importance = log_model.coef_.ravel()
plt.barh(y=X.columns, width=feat_importance)
plt.title("Feature Importance")

Text(0.5, 1.0, 'Feature Importance')


# Second, simpler feature set: no work-shift information, and the day of
# week reduced to a weekday/weekend flag.
hh2 = hh.copy(deep=True)
hh2 = hh2.drop(columns=["Work_shift"])

day_type_by_day = {
    'Sun': 'weekend',
    'Mon': 'weekday',
    'Tue': 'weekday',
    'Wed': 'weekday',
    'Thu': 'weekday',
    'Fri': 'weekday',
    'Sat': 'weekend',
}
# Assign the result back instead of calling `replace(..., inplace=True)` on
# `hh2.Day_of_week`: that chained in-place call warns on recent pandas and
# leaves `hh2` unchanged once Copy-on-Write is active.
hh2["Day_of_week"] = hh2["Day_of_week"].replace(day_type_by_day)


# Encode the weekday/weekend label as indicator columns.
day_types = pd.get_dummies(hh2["Day_of_week"], drop_first=False)

# Append the indicators, then remove the source column and the "weekday"
# indicator (remove one column to avoid multicollinearity).
hh2_log = pd.concat([hh2, day_types], axis=1).drop(
    columns=["Day_of_week", "weekday"]
)


# Correlations for the reduced feature set.
hh2_corr_ax = sns.heatmap(hh2_log.corr())
hh2_corr_ax.set_title("Reduced Multicollinearity Heatmap")

Text(0.5, 1.0, 'Reduced Multicollinearity Heatmap')


# Response column, excluded from the multicollinearity check.
target = "Healing_hug"
vif(df=hh2_log, target=target)

const      8.916061e+06
Workout    1.134841e+00
Month      1.143931e+00
Year       1.175285e+00
Pto        1.019541e+00
weekend    1.114163e+00
dtype: float64


# Separate the response from the predictors of the reduced feature set.
y2 = hh2_log["Healing_hug"]
X2 = hh2_log.drop(columns=["Healing_hug"])

# Same hold-out fraction and seed as the first model.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2,
    y2,
    test_size=0.2,
    random_state=235,
)


# Fit the second model on the reduced feature set and report held-out
# precision/recall/F1 per class.
model2 = LogisticRegression(class_weight="balanced").fit(X2_train, y2_train)
y2_preds = model2.predict(X2_test)
report2 = classification_report(y_true=y2_test, y_pred=y2_preds)
print(report2)

              precision    recall  f1-score   support

         0.0       0.87      0.74      0.80       117
         1.0       0.30      0.50      0.37        26

    accuracy                           0.69       143
   macro avg       0.58      0.62      0.58       143
weighted avg       0.76      0.69      0.72       143


# Plot the second model's coefficients, one horizontal bar per predictor.
feat_importance2 = model2.coef_.ravel()
plt.barh(y=X2.columns, width=feat_importance2)
plt.title("Feature Importance")

Text(0.5, 1.0, 'Feature Importance')

Healing Hugs: Logistic Regression with Exploratory Analysis¶

Summary: This analysis uses data exploration and logistic regression to examine whether healing hugs are associated with variables such as day of week, workouts, work shift, and PTO.

Data exploration¶

Workout vs Day_of_week¶

Pto vs Day_of_week¶

Healing_hug vs Day_of_week

Workout vs Work_shift¶

Healing_hug vs Work_shift¶

Healing_hug vs Workout¶

Logistic Regression¶

Final recommendation to client¶

	Day_of_week	Workout	Work_shift	Healing_hug	Month	Year
0	Sun	1.0	on call	0.0	8	2021
1	Mon	0.0	late	1.0	8	2021
2	Tue	0.0	late	0.0	8	2021
3	Wed	0.0	late	0.0	9	2021
4	Thu	NaN	late	NaN	9	2021