Project: Customer Churn Prediction¶


Description:¶

In this project we will build a model based on tree methods to predict whether a customer will churn or not. We will start by exploring and analyzing the data using visualization libraries such as seaborn and matplotlib, then we will focus on model creation.

PART 1: Importing and Reading Data¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df = pd.read_csv('customer-churn.csv')
In [3]:
df.head()
Out[3]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.50 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes

5 rows × 21 columns

PART 2: Checking and Exploring Data¶

In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7032 entries, 0 to 7031
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7032 non-null   object 
 1   gender            7032 non-null   object 
 2   SeniorCitizen     7032 non-null   int64  
 3   Partner           7032 non-null   object 
 4   Dependents        7032 non-null   object 
 5   tenure            7032 non-null   int64  
 6   PhoneService      7032 non-null   object 
 7   MultipleLines     7032 non-null   object 
 8   InternetService   7032 non-null   object 
 9   OnlineSecurity    7032 non-null   object 
 10  OnlineBackup      7032 non-null   object 
 11  DeviceProtection  7032 non-null   object 
 12  TechSupport       7032 non-null   object 
 13  StreamingTV       7032 non-null   object 
 14  StreamingMovies   7032 non-null   object 
 15  Contract          7032 non-null   object 
 16  PaperlessBilling  7032 non-null   object 
 17  PaymentMethod     7032 non-null   object 
 18  MonthlyCharges    7032 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7032 non-null   object 
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB

We can see that there are no null values

Let's get a statistical summary of the numeric features

In [5]:
df.describe()
Out[5]:
SeniorCitizen tenure MonthlyCharges TotalCharges
count 7032.000000 7032.000000 7032.000000 7032.000000
mean 0.162400 32.421786 64.798208 2283.300441
std 0.368844 24.545260 30.085974 2266.771362
min 0.000000 1.000000 18.250000 18.800000
25% 0.000000 9.000000 35.587500 401.450000
50% 0.000000 29.000000 70.350000 1397.475000
75% 0.000000 55.000000 89.862500 3794.737500
max 1.000000 72.000000 118.750000 8684.800000

Part 3: Exploratory Data Analysis¶

Let's check whether there are any NaN values per column

In [7]:
df.isnull().sum()
Out[7]:
customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

There is no missing data in any of the columns

Let's verify the balance between the churn classes

In [12]:
plt.figure(figsize=(10, 4), dpi=150)
sns.countplot(data=df, x="Churn")
Out[12]:
<AxesSubplot:xlabel='Churn', ylabel='count'>

We can see that the two classes are not balanced: the "No" class is more than double the size of the "Yes" class.
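To quantify the imbalance, a quick check (not one of the original cells) can be run on the label column:

df["Churn"].value_counts(normalize=True)
# In the standard Telco churn data this comes out to roughly 73% "No" vs 27% "Yes"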

Let's explore the distribution of TotalCharges between Churn categories.

In [14]:
sns.violinplot(x="Churn", y="TotalCharges", data=df)
Out[14]:
<AxesSubplot:xlabel='Churn', ylabel='TotalCharges'>

We can see that most of the customers who churn have a TotalCharges between 0 and 2000.

Let's explore TotalCharges per Contract type

In [20]:
plt.figure(figsize=(10,4), dpi=100)
sns.boxplot(x="Contract", y="TotalCharges", data=df, hue="Churn")
plt.legend(loc=(1.1, 0.5));

We can clearly see that, for the One year and Two year contract types, customers who churn have higher average TotalCharges than customers who do not, while for the Month-to-month contract type the two groups are roughly equal.

Let's explore the correlation between our features and the class label (Churn).
Since most of our features are categorical, we first need to convert them to dummy variables. We will leave out the customerID feature: it is unique to each customer, so it carries no meaning in terms of correlation.

In [21]:
corr_df = pd.get_dummies(df[['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
                             'MultipleLines', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
                             'TechSupport', 'InternetService', 'StreamingTV', 'StreamingMovies',
                             'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']]).corr()
In [32]:
corr_df.iloc[1:-1]["Churn_Yes"]
Out[32]:
gender_Female                              0.008545
gender_Male                               -0.008545
Partner_No                                 0.149982
Partner_Yes                               -0.149982
Dependents_No                              0.163128
Dependents_Yes                            -0.163128
PhoneService_No                           -0.011691
PhoneService_Yes                           0.011691
MultipleLines_No                          -0.032654
MultipleLines_No phone service            -0.011691
MultipleLines_Yes                          0.040033
OnlineSecurity_No                          0.342235
OnlineSecurity_No internet service        -0.227578
OnlineSecurity_Yes                        -0.171270
OnlineBackup_No                            0.267595
OnlineBackup_No internet service          -0.227578
OnlineBackup_Yes                          -0.082307
DeviceProtection_No                        0.252056
DeviceProtection_No internet service      -0.227578
DeviceProtection_Yes                      -0.066193
TechSupport_No                             0.336877
TechSupport_No internet service           -0.227578
TechSupport_Yes                           -0.164716
InternetService_DSL                       -0.124141
InternetService_Fiber optic                0.307463
InternetService_No                        -0.227578
StreamingTV_No                             0.128435
StreamingTV_No internet service           -0.227578
StreamingTV_Yes                            0.063254
StreamingMovies_No                         0.130920
StreamingMovies_No internet service       -0.227578
StreamingMovies_Yes                        0.060860
Contract_Month-to-month                    0.404565
Contract_One year                         -0.178225
Contract_Two year                         -0.301552
PaperlessBilling_No                       -0.191454
PaperlessBilling_Yes                       0.191454
PaymentMethod_Bank transfer (automatic)   -0.118136
PaymentMethod_Credit card (automatic)     -0.134687
PaymentMethod_Electronic check             0.301455
PaymentMethod_Mailed check                -0.090773
Churn_No                                  -1.000000
Name: Churn_Yes, dtype: float64
In [46]:
plt.figure(figsize=(10, 4), dpi=150)
churn_sorted = corr_df.sort_values("Churn_Yes").iloc[1:-1]
sns.barplot(x=churn_sorted.index, y=churn_sorted["Churn_Yes"].values)
plt.xticks(rotation=90);

We can see that the features with the strongest positive correlation are "Contract_Month-to-month", "OnlineSecurity_No", and "TechSupport_No", while the feature with the most negative correlation is "Contract_Two year", meaning a customer who signs a two-year contract is much less likely to churn.
Note also that no single feature has a strong correlation with the class label.
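The same ranking can be read off programmatically. A minimal sketch, dropping the label's own dummy columns and the numeric SeniorCitizen column before sorting:

churn_corr = corr_df["Churn_Yes"].drop(["Churn_Yes", "Churn_No", "SeniorCitizen"])
print(churn_corr.sort_values(ascending=False).head())  # strongest positive correlations
print(churn_corr.sort_values().head())                 # strongest negative correlations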

Part 4: Churn Analysis¶

In this section we will focus on segmenting customers based on tenure, creating "cohorts" that allow us to examine the differences between cohort segments.

In [47]:
df["Contract"].unique()
Out[47]:
array(['Month-to-month', 'One year', 'Two year'], dtype=object)

Let's display the distribution of the tenure feature (tenure represents the number of months the customer has been, or was, a customer)

In [61]:
plt.figure(figsize=(10, 4), dpi=100)
sns.histplot(x="tenure", data=df, bins=70);

We can see that the distribution is bimodal: the largest groups of customers have either a very short tenure (a month or two) or a very long one (around 70 months or more).

Let's get into more detail and display tenure per Contract type and Churn category

In [67]:
sns.displot(x="tenure", bins=70, row="Churn", col="Contract", data=df);

It is clear that customers who sign a two-year contract are much less likely to churn

Let's display a scatter plot of the TotalCharges versus MonthlyCharges with hue of Churn

In [74]:
plt.figure(figsize=(10, 4), dpi=200)
sns.scatterplot(x="MonthlyCharges", y="TotalCharges", hue="Churn", data=df, alpha=.4);

We can see that customers with high monthly charges are more likely to churn, suggesting they may be looking for cheaper services.

Creating cohort based on tenure¶

Let's first treat each tenure (month) as its own cohort

In [89]:
no_churn = df.groupby(["Churn", "tenure"]).count().loc["No"]
yes_churn = df.groupby(["Churn", "tenure"]).count().loc["Yes"]
yes_churn_rate = 100 * yes_churn / (no_churn + yes_churn)
yes_churn_rate["customerID"]
Out[89]:
tenure
1     61.990212
2     51.680672
3     47.000000
4     47.159091
5     48.120301
        ...    
68     9.000000
69     8.421053
70     9.243697
71     3.529412
72     1.657459
Name: customerID, Length: 72, dtype: float64
In [104]:
plt.figure(figsize=(10, 4), dpi=100)
plt.plot(sorted(df["tenure"].value_counts().index), yes_churn_rate["customerID"].values)
Out[104]:
[<matplotlib.lines.Line2D at 0x1b6bcdebeb0>]

We can clearly see that the churn rate drops steadily as tenure increases: over 60% of first-month customers churn, while fewer than 10% of customers with 70 or more months of tenure do.
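Since yes_churn_rate is already indexed by tenure, the same plot can be produced directly from the Series; a sketch (the xlabel/ylabel keywords assume pandas ≥ 1.0):

yes_churn_rate["customerID"].plot(figsize=(10, 4), xlabel="tenure (months)", ylabel="churn rate (%)");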

Cohort Groups:¶

Based on the tenure feature we will create four separate cohorts:

  • '0-12 Months'
  • '12-24 Months'
  • '24-48 Months'
  • 'Over 48 Months'
In [105]:
def tenure_cohort(tenure):
    if tenure <= 12: return '0-12 Months'
    if tenure <= 24: return '12-24 Months'
    if tenure <= 48: return '24-48 Months'
    return 'Over 48 Months'
In [106]:
df["Tenure Cohort"] = df["tenure"].apply(tenure_cohort)
In [107]:
df["Tenure Cohort"]
Out[107]:
0          0-12 Months
1         24-48 Months
2          0-12 Months
3         24-48 Months
4          0-12 Months
             ...      
7027      12-24 Months
7028    Over 48 Months
7029       0-12 Months
7030       0-12 Months
7031    Over 48 Months
Name: Tenure Cohort, Length: 7032, dtype: object
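An equivalent vectorized alternative is pd.cut; a sketch where the bin edges mirror the function above (right-inclusive bins, and tenure starts at 1):

cohort_labels = ['0-12 Months', '12-24 Months', '24-48 Months', 'Over 48 Months']
cohorts = pd.cut(df["tenure"], bins=[0, 12, 24, 48, np.inf], labels=cohort_labels)
# cohorts matches df["Tenure Cohort"] label-for-label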

Let's now explore MonthlyCharges versus TotalCharges, with a hue of Tenure Cohort

In [110]:
plt.figure(figsize=(10, 4), dpi=100)
sns.scatterplot(x="MonthlyCharges", y="TotalCharges", hue="Tenure Cohort", alpha=.5, data=df)
Out[110]:
<AxesSubplot:xlabel='MonthlyCharges', ylabel='TotalCharges'>

We can see that the longer the tenure cohort, the higher the accumulated TotalCharges at any given level of MonthlyCharges.

Let's visualize the Churn count per Tenure Cohort

In [114]:
plt.figure(figsize=(10, 4), dpi=100)
sns.countplot(x="Tenure Cohort", hue="Churn", data=df)
Out[114]:
<AxesSubplot:xlabel='Tenure Cohort', ylabel='count'>

It is clear that customers are less likely to churn the longer they have stayed. In the 0-12 month cohort, the number of churners is about the same as the number of non-churners.
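The per-cohort churn rates can be quantified directly; a quick sketch:

pd.crosstab(df["Tenure Cohort"], df["Churn"], normalize="index")
# each row sums to 1, giving the churn share within that cohort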

Let's create a grid of Count plots showing counts per Tenure Cohort, separated out by Contract Type and colored by the Churn hue

In [118]:
sns.catplot(x="Tenure Cohort",col="Contract", kind="count", hue="Churn", data=df)
Out[118]:
<seaborn.axisgrid.FacetGrid at 0x1b6be791ac0>

We can confirm here too that customers are less likely to churn when they sign a long-term contract.

Part 5: Predictive Modeling¶

In this section we will explore three different tree-based methods:

  • Decision Trees
  • Random Forest
  • AdaBoost

Let's first separate our data

In [129]:
X = df.drop(["customerID", "Churn"], axis=1)
# We drop customerID since it is just a unique key identifying each customer
y = df["Churn"]
# Churn is the label

Now we need to convert our categorical data into numeric by creating dummy variables

In [122]:
X_obj = pd.get_dummies(X.select_dtypes(include="object"))
X_non_obj = X.select_dtypes(exclude="object")
In [136]:
X = pd.concat([X_obj, X_non_obj], axis=1)
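Note that the select/encode/concat steps above are equivalent to a single pd.get_dummies call on the whole frame, since numeric columns pass through unchanged (only the column order differs):

# Equivalent one-liner (sketch):
# X = pd.get_dummies(df.drop(["customerID", "Churn"], axis=1))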

Let's split our data into training and test data

In [133]:
from sklearn.model_selection import train_test_split, GridSearchCV
In [138]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

Decision Tree Performance¶

In [132]:
from sklearn.tree import DecisionTreeClassifier
In [147]:
dt = DecisionTreeClassifier(random_state=101, max_features="log2", max_depth=7)  # these values were found earlier using GridSearchCV
dt.fit(X_train, y_train)
Out[147]:
DecisionTreeClassifier(max_depth=7, max_features='log2', random_state=101)
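The search that produced these values is not shown in the notebook; a minimal sketch of what it could look like (the grid below is an assumption, not the original one):

# Hypothetical grid; the original search is not shown
dt_grid = GridSearchCV(DecisionTreeClassifier(random_state=101),
                       param_grid={"max_depth": np.arange(2, 12),
                                   "max_features": ["sqrt", "log2", None]})
dt_grid.fit(X_train, y_train)
dt_grid.best_params_  # e.g. {'max_depth': 7, 'max_features': 'log2'}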
In [148]:
preds = dt.predict(X_test)
In [156]:
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
In [150]:
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

          No       0.88      0.86      0.87       557
         Yes       0.51      0.56      0.53       147

    accuracy                           0.80       704
   macro avg       0.70      0.71      0.70       704
weighted avg       0.80      0.80      0.80       704

In [157]:
ConfusionMatrixDisplay.from_predictions(y_test, preds)
Out[157]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b6ce53f460>

We got an overall accuracy of 0.80, but poor results for the "Yes" class because of the imbalanced data.
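One common mitigation for the imbalance, not applied in this notebook, is to re-weight the classes; a minimal sketch:

# Sketch only: class_weight="balanced" up-weights the minority "Yes" class
dt_balanced = DecisionTreeClassifier(random_state=101, max_depth=7,
                                     max_features="log2", class_weight="balanced")
dt_balanced.fit(X_train, y_train)
print(classification_report(y_test, dt_balanced.predict(X_test)))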

Let's visualize the feature importances of our model

In [176]:
fi = pd.DataFrame(index=X_train.columns, columns=["Feature Importance"],
                  data=dt.feature_importances_).sort_values("Feature Importance")
fi
Out[176]:
Feature Importance
TechSupport_No internet service 0.000000
OnlineBackup_No internet service 0.000000
OnlineSecurity_Yes 0.000000
StreamingTV_No internet service 0.000000
InternetService_No 0.000000
StreamingTV_Yes 0.000000
StreamingMovies_No 0.000000
DeviceProtection_No 0.000000
MultipleLines_No phone service 0.000000
StreamingMovies_No internet service 0.000000
StreamingTV_No 0.000205
PhoneService_No 0.000338
PaperlessBilling_Yes 0.000346
OnlineBackup_No 0.000376
SeniorCitizen 0.000561
Partner_Yes 0.000628
TechSupport_Yes 0.000691
Dependents_No 0.000809
DeviceProtection_Yes 0.000859
Tenure Cohort_24-48 Months 0.000915
PaymentMethod_Mailed check 0.001600
Partner_No 0.001707
gender_Female 0.001930
gender_Male 0.002238
StreamingMovies_Yes 0.002368
PaymentMethod_Bank transfer (automatic) 0.002534
MultipleLines_No 0.002775
PhoneService_Yes 0.002813
InternetService_Fiber optic 0.003303
OnlineBackup_Yes 0.005074
Dependents_Yes 0.006631
Tenure Cohort_12-24 Months 0.007046
MultipleLines_Yes 0.007685
Contract_One year 0.010213
PaymentMethod_Credit card (automatic) 0.011581
PaymentMethod_Electronic check 0.011629
DeviceProtection_No internet service 0.012838
PaperlessBilling_No 0.017618
Tenure Cohort_Over 48 Months 0.019423
Contract_Month-to-month 0.020610
Contract_Two year 0.023526
tenure 0.028138
TechSupport_No 0.031651
InternetService_DSL 0.034066
TotalCharges 0.076058
MonthlyCharges 0.076934
OnlineSecurity_No 0.121761
OnlineSecurity_No internet service 0.130077
Tenure Cohort_0-12 Months 0.320447
In [172]:
plt.figure(figsize=(10, 4), dpi=150)
sns.barplot(x=fi.index, y="Feature Importance", data=fi)
plt.xticks(rotation = 90);

So we can clearly see that the "Tenure Cohort_0-12 Months" feature is by far the most important for predicting whether a customer churns.

Random Forest Performance:¶

In [177]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
In [178]:
param_grid = {"n_estimators": [64, 100, 110, 124], 
              "max_depth": np.arange(1, 20), 
              "max_features": ["log2", "sqrt", "auto"], 
              "max_samples": [.3],
             "random_state": [101]}
In [179]:
grid = GridSearchCV(estimator=rf, param_grid=param_grid)
grid.fit(X_train, y_train)
Out[179]:
GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19]),
                         'max_features': ['log2', 'sqrt', 'auto'],
                         'max_samples': [0.3],
                         'n_estimators': [64, 100, 110, 124]})
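Before taking the refitted best estimator, the winning combination can be inspected:

grid.best_params_  # the parameter set with the best mean cross-validated score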
In [181]:
grid_rf = grid.best_estimator_
preds = grid_rf.predict(X_test)
In [182]:
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

          No       0.88      0.92      0.90       557
         Yes       0.62      0.51      0.56       147

    accuracy                           0.83       704
   macro avg       0.75      0.71      0.73       704
weighted avg       0.82      0.83      0.83       704

In [184]:
ConfusionMatrixDisplay.from_predictions(y_test, preds)
Out[184]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b6ce6d0c40>

Using a random forest with hyperparameters tuned by GridSearchCV, we increased the overall accuracy from 0.80 to 0.83; more importantly, the precision on the "Yes" class improved from 0.51 to 0.62.

AdaBoost Performance:¶

In [185]:
from sklearn.ensemble import AdaBoostClassifier
In [186]:
ab = AdaBoostClassifier()
In [189]:
param_grid = {'random_state': [101], "n_estimators":[64, 80, 100,110, 124]}
grid = GridSearchCV(estimator=ab, param_grid=param_grid)
grid.fit(X_train, y_train)
Out[189]:
GridSearchCV(estimator=AdaBoostClassifier(),
             param_grid={'n_estimators': [64, 80, 100, 110, 124],
                         'random_state': [101]})
In [190]:
ab = grid.best_estimator_
preds = ab.predict(X_test)
In [191]:
print(classification_report(y_test, preds))
              precision    recall  f1-score   support

          No       0.88      0.92      0.90       557
         Yes       0.62      0.51      0.56       147

    accuracy                           0.83       704
   macro avg       0.75      0.71      0.73       704
weighted avg       0.82      0.83      0.83       704

In [192]:
ConfusionMatrixDisplay.from_predictions(y_test, preds)
Out[192]:
<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1b6ce646fd0>

We can see that using AdaBoost did not increase the accuracy; it is the same as with the Random Forest.
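As a final check, the three fitted models can be compared side by side on the test set; a short sketch using the variables defined above:

from sklearn.metrics import accuracy_score

for name, model in [("Decision Tree", dt), ("Random Forest", grid_rf), ("AdaBoost", ab)]:
    print(f"{name:>13}: {accuracy_score(y_test, model.predict(X_test)):.3f}")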