import pandas as pd
import seaborn as sns


df = pd.read_csv('https://raw.githubusercontent.com/HuntsW/DataScienceClass/main/telco_customer_churn_dataset.csv')

df


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB


df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors="coerce")


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7043 non-null   object 
dtypes: float64(2), int64(2), object(17)
memory usage: 1.1+ MB


binary_columns = ["Partner", "PhoneService", "Dependents", "PaperlessBilling", "Churn"]

for column in binary_columns:
  df[column] = df[column].map({"No":0, "Yes":1})


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   int64  
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7043 non-null   int64  
dtypes: float64(2), int64(7), object(12)
memory usage: 1.1+ MB


df.head()


df["MultipleLines"].unique()

array(['No phone service', 'No', 'Yes'], dtype=object)


df["MultipleLines"] = df["MultipleLines"].map({"No":0, "Yes":1, "No phone service":0})


df["MultipleLines"].unique()

array([0, 1])


df.head()


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   int64  
 4   Dependents        7043 non-null   int64  
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   int64  
 7   MultipleLines     7043 non-null   int64  
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   int64  
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7032 non-null   float64
 20  Churn             7043 non-null   int64  
dtypes: float64(2), int64(8), object(11)
memory usage: 1.1+ MB


df["OnlineSecurity"].unique()

array(['No', 'Yes', 'No internet service'], dtype=object)


df["OnlineSecurity"] = df["OnlineSecurity"].map({"No":0, "Yes":1, "No internet service":0})


df["OnlineSecurity"].unique()

array([0, 1])


df["InternetService"].unique()

array(['DSL', 'Fiber optic', 'No'], dtype=object)


df = pd.get_dummies(data=df, columns=["InternetService"])


df.head()


df["gender"] = df["gender"].map({"Female":0, "Male":1})


internet_columns = ['DeviceProtection', 'TechSupport', 'OnlineBackup', 'StreamingTV', 'StreamingMovies']

for column in internet_columns:
  df[column] = df[column].map({"No":0, "Yes":1, "No internet service":0})


df.head()


df.describe()


df = df.dropna()


df.describe()


df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7042
Data columns (total 23 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   customerID                   7032 non-null   object 
 1   gender                       7032 non-null   int64  
 2   SeniorCitizen                7032 non-null   int64  
 3   Partner                      7032 non-null   int64  
 4   Dependents                   7032 non-null   int64  
 5   tenure                       7032 non-null   int64  
 6   PhoneService                 7032 non-null   int64  
 7   MultipleLines                7032 non-null   int64  
 8   OnlineSecurity               7032 non-null   int64  
 9   OnlineBackup                 7032 non-null   int64  
 10  DeviceProtection             7032 non-null   int64  
 11  TechSupport                  7032 non-null   int64  
 12  StreamingTV                  7032 non-null   int64  
 13  StreamingMovies              7032 non-null   int64  
 14  Contract                     7032 non-null   object 
 15  PaperlessBilling             7032 non-null   int64  
 16  PaymentMethod                7032 non-null   object 
 17  MonthlyCharges               7032 non-null   float64
 18  TotalCharges                 7032 non-null   float64
 19  Churn                        7032 non-null   int64  
 20  InternetService_DSL          7032 non-null   uint8  
 21  InternetService_Fiber optic  7032 non-null   uint8  
 22  InternetService_No           7032 non-null   uint8  
dtypes: float64(2), int64(15), object(3), uint8(3)
memory usage: 1.1+ MB


# Displaying correlation coefficients in a heatmap
sns.set(rc={'figure.figsize':(15,12)})

corr = df.corr()
sns.heatmap(corr, annot=True, fmt=".2f")

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8d503f050>


from matplotlib import pyplot as plt


main, subplots = plt.subplots(1, 3, figsize=(16,6))
sns.countplot(data=df, x="gender", hue="Churn", ax=subplots[0])
sns.countplot(data=df, x="SeniorCitizen", hue="Churn", ax=subplots[1])
sns.countplot(data=df, x="Partner", hue="Churn", ax=subplots[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8d19f1290>


main, subplots = plt.subplots(1, 3, figsize=(16,6))
sns.countplot(data=df, x="PhoneService", hue="Churn", ax=subplots[0])
sns.countplot(data=df, x="MultipleLines", hue="Churn", ax=subplots[1])
sns.countplot(data=df, x="DeviceProtection", hue="Churn", ax=subplots[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8d004cd90>


main, subplots = plt.subplots(1, 3, figsize=(16,6))
sns.countplot(data=df, x="OnlineSecurity", hue="Churn", ax=subplots[0])
sns.countplot(data=df, x="OnlineBackup", hue="Churn", ax=subplots[1])
sns.countplot(data=df, x="TechSupport", hue="Churn", ax=subplots[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8cff0e990>


main, subplots = plt.subplots(1, 3, figsize=(16,6))
sns.countplot(data=df, x="StreamingTV", hue="Churn", ax=subplots[0])
sns.countplot(data=df, x="StreamingMovies", hue="Churn", ax=subplots[1])
sns.countplot(data=df, x="PaperlessBilling", hue="Churn", ax=subplots[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8d1ec3110>


main, subplots = plt.subplots(1, 3, figsize=(16,6))
sns.boxplot(data=df, y="tenure", x="Churn", ax=subplots[0])
sns.boxplot(data=df, y="MonthlyCharges", x="Churn", ax=subplots[1])
sns.boxplot(data=df, y="TotalCharges", x="Churn", ax=subplots[2])

<matplotlib.axes._subplots.AxesSubplot at 0x7ff8cfd24d50>


columns = ['SeniorCitizen', 'Partner', 'PhoneService', 'DeviceProtection', 'OnlineSecurity',
           'OnlineBackup', 'TechSupport', 'PaperlessBilling', 'tenure', 'MonthlyCharges', 'TotalCharges']


train_data = df[columns]
train_labels = df['Churn']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data, train_labels, test_size=0.3, random_state=1)


from sklearn import tree

model = tree.DecisionTreeClassifier(max_depth = 3)
model.fit(X_train, X_test)

DecisionTreeClassifier(max_depth=3)


y_pred = model.predict(X_test)


from sklearn import metrics
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.795260663507109


import graphviz
columns =  list(train_data.columns)

dot_data = tree.export_graphviz(model, out_file=None, feature_names=columns, class_names=['No', 'Yes'], filled=True, rounded=True)


graph = graphviz.Source(dot_data)
graph


df_predict = train_data.iloc[0:0].copy()
df_predict.describe()
# copy only the table structure not the data


df_predict = df_predict.append({
    "SeniorCitizen":0,
    "Partner":0,
    "PhoneService":1,
    "DeviceProtection":1,
    "OnlineSecurity":0,
    "OnlineBackup":0,
    "TechSupport":1,
    "PaperlessBilling":1,
    "tenure":12,
    "MonthlyCharges":50,
    "TotalCharges":70
}, ignore_index=True)
# add a new record to df
df_predict


model.predict(df_predict)
# testing our model on the single synthethic data point we created

array([0])

	customerID	gender	Partner	tenure	PhoneService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn	InternetService_DSL	InternetService_Fiber optic
0	7590-VHVEG	0	1	1	0	0	1	0	0	Month-to-month	1	Electronic check	29.85	29.85	0	1	0
1	5575-GNVDE	1	0	34	1	1	0	1	0	One year	0	Mailed check	56.95	1889.50	0	1	0
2	3668-QPYBK	1	0	2	1	1	1	0	0	Month-to-month	1	Mailed check	53.85	108.15	1	1	0
3	7795-CFOCW	1	0	45	0	1	0	1	1	One year	0	Bank transfer (automatic)	42.30	1840.75	0	1	0
4	9237-HQITU	0	0	2	1	0	0	0	0	Month-to-month	1	Electronic check	70.70	151.65	1	0	1

	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	PaperlessBilling	MonthlyCharges	TotalCharges	Churn	InternetService_DSL	InternetService_Fiber optic	InternetService_No
count	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7043.000000	7032.000000	7043.000000	7043.000000	7043.000000	7043.000000
mean	0.504756	0.162147	0.483033	0.299588	32.371149	0.903166	0.421837	0.286668	0.344881	0.343888	0.290217	0.384353	0.387903	0.592219	64.761692	2283.300441	0.265370	0.343746	0.439585	0.216669
std	0.500013	0.368612	0.499748	0.458110	24.559481	0.295752	0.493888	0.452237	0.475363	0.475038	0.453895	0.486477	0.487307	0.491457	30.090047	2266.771362	0.441561	0.474991	0.496372	0.412004
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	18.250000	18.800000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	9.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	35.500000	401.450000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.000000	0.000000	0.000000	29.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	70.350000	1397.475000	0.000000	0.000000	0.000000	0.000000
75%	1.000000	0.000000	1.000000	1.000000	55.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	89.850000	3794.737500	1.000000	1.000000	1.000000	0.000000
max	1.000000	1.000000	1.000000	1.000000	72.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	118.750000	8684.800000	1.000000	1.000000	1.000000	1.000000

	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	PaperlessBilling	MonthlyCharges	TotalCharges	Churn	InternetService_DSL	InternetService_Fiber optic	InternetService_No
count	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000	7032.000000
mean	0.504693	0.162400	0.482509	0.298493	32.421786	0.903299	0.421928	0.286547	0.344852	0.343857	0.290102	0.384386	0.388367	0.592719	64.798208	2283.300441	0.265785	0.343572	0.440273	0.216155
std	0.500014	0.368844	0.499729	0.457629	24.545260	0.295571	0.493902	0.452180	0.475354	0.475028	0.453842	0.486484	0.487414	0.491363	30.085974	2266.771362	0.441782	0.474934	0.496455	0.411650
min	0.000000	0.000000	0.000000	0.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	18.250000	18.800000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	9.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	35.587500	401.450000	0.000000	0.000000	0.000000	0.000000
50%	1.000000	0.000000	0.000000	0.000000	29.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	70.350000	1397.475000	0.000000	0.000000	0.000000	0.000000
75%	1.000000	0.000000	1.000000	1.000000	55.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	89.862500	3794.737500	1.000000	1.000000	1.000000	0.000000
max	1.000000	1.000000	1.000000	1.000000	72.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	118.750000	8684.800000	1.000000	1.000000	1.000000	1.000000

	SeniorCitizen	Partner	PhoneService	DeviceProtection	OnlineSecurity	OnlineBackup	TechSupport	PaperlessBilling	tenure	MonthlyCharges	TotalCharges
count	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
mean	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
std	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
min	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
50%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
75%	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
max	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

Churn Analysis¶

Data Cleaning¶

Exploratory Data Analysis (EDA)¶

Modelling¶

	customerID	gender	SeniorCitizen	Partner	Dependents	tenure	PhoneService	MultipleLines	InternetService	OnlineSecurity	OnlineBackup	DeviceProtection	TechSupport	StreamingTV	StreamingMovies	Contract	PaperlessBilling	PaymentMethod	MonthlyCharges	TotalCharges	Churn
0	7590-VHVEG	Female	0	Yes	No	1	No	No phone service	DSL	No	Yes	No	No	No	No	Month-to-month	Yes	Electronic check	29.85	29.85	No
1	5575-GNVDE	Male	0	No	No	34	Yes	No	DSL	Yes	No	Yes	No	No	No	One year	No	Mailed check	56.95	1889.5	No
2	3668-QPYBK	Male	0	No	No	2	Yes	No	DSL	Yes	Yes	No	No	No	No	Month-to-month	Yes	Mailed check	53.85	108.15	Yes
3	7795-CFOCW	Male	0	No	No	45	No	No phone service	DSL	Yes	No	Yes	Yes	No	No	One year	No	Bank transfer (automatic)	42.30	1840.75	No
4	9237-HQITU	Female	0	No	No	2	Yes	No	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Electronic check	70.70	151.65	Yes
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
7038	6840-RESVB	Male	0	Yes	Yes	24	Yes	Yes	DSL	Yes	No	Yes	Yes	Yes	Yes	One year	Yes	Mailed check	84.80	1990.5	No
7039	2234-XADUH	Female	0	Yes	Yes	72	Yes	Yes	Fiber optic	No	Yes	Yes	No	Yes	Yes	One year	Yes	Credit card (automatic)	103.20	7362.9	No
7040	4801-JZAZL	Female	0	Yes	Yes	11	No	No phone service	DSL	Yes	No	No	No	No	No	Month-to-month	Yes	Electronic check	29.60	346.45	No
7041	8361-LTMKD	Male	1	Yes	No	4	Yes	Yes	Fiber optic	No	No	No	No	No	No	Month-to-month	Yes	Mailed check	74.40	306.6	Yes
7042	3186-AJIEK	Male	0	No	No	66	Yes	No	Fiber optic	Yes	No	Yes	Yes	Yes	Yes	Two year	Yes	Bank transfer (automatic)	105.65	6844.5	No