Are you answering the right churn questions?



How to structure customer churn problems with classification and survival analysis

Nicklas Ankarstad
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, confusion_matrix, balanced_accuracy_score, accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import xgboost as xgb
from interpret.glassbox import ExplainableBoostingClassifier, LogisticRegression, ClassificationTree, DecisionListClassifier
from interpret import show
from interpret.perf import ROC
from lifelines import CoxPHFitter, KaplanMeierFitter, WeibullFitter, LogNormalFitter, GeneralizedGammaFitter
from lifelines.utils import find_best_parametric_model
from lifelines.plotting import qq_plot
shap.initjs()
df = pd.read_csv('Telco-Customer-Churn.csv')
df.head()
df.isna().sum()
## Blank strings in TotalCharges become NaN after the cast; fill them with 0 before modeling
df['TotalCharges'] = df['TotalCharges'].replace(" ", np.nan).astype('float64')
df['TotalCharges'] = df['TotalCharges'].fillna(0)
df.dtypes
df.nunique()

Target Variable

## 1 = Churned , 0 = Not Churned
le = preprocessing.LabelEncoder()
le.fit(df['Churn'])
df['Churn'] = le.transform(df['Churn'])
sns.countplot(x="Churn", data=df)
Class Balance Plot
df.groupby('Churn').count()['customerID']/ df.shape[0]
sns.pointplot(x="Contract", y="Churn", hue="OnlineSecurity", data=df);
sns.boxplot(x="Churn", y="TotalCharges", data=df)
for each in ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
             'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
             'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
             'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
             'PaymentMethod']:
    le.fit(df[each])
    df[each] = le.transform(df[each])
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a mask for the upper triangle
corr = df.corr()
mask = np.triu(np.ones_like(corr, dtype=bool))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Plot correlation matrix
sns.heatmap(corr, cmap =cmap,linewidths=.5, mask=mask)
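The heatmap is dense, so sorting the correlations against the target makes the strongest relationships easier to read. A small convenience sketch, reusing the corr matrix computed above:

# Rank features by absolute correlation with the encoded Churn target
corr['Churn'].drop('Churn').abs().sort_values(ascending=False).head(10)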
## Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Churn','customerID'],axis =1), df['Churn'], test_size=0.25, random_state=42)

Logistic Regression Model

seed = 1255
lr = LogisticRegression(random_state=seed, max_iter = 1000,penalty='l1', solver='liblinear')
lr_features = ['PhoneService', 'PaperlessBilling', 'TotalCharges', 'Contract', 'InternetService']
lr.fit(X_train[lr_features], y_train)
lr_global = lr.explain_global(name='Logistic Regression')
show(lr_global)
lr_perf = ROC(lr.predict_proba).explain_perf(X_test[lr_features], y_test, name='Logistic Regression')
show(lr_perf)
## Predict each class, using the same five features the model was fit on
y_pred = lr.predict(X_test[lr_features])
## Predict the probabilities of each class
y_pred_proba = lr.predict_proba(X_test[lr_features])
# Accuracy
LR_accuracy_score = accuracy_score(y_test, y_pred)
# ROC AUC
LR_roc_auc_score = roc_auc_score(y_test,y_pred_proba[:, 1])
# Balanced Accuracy
LR_balanced_accuracy_score = balanced_accuracy_score(y_test, y_pred)
data = pd.DataFrame(columns= ['metric','value'])
data['value'] = [LR_accuracy_score,LR_roc_auc_score,LR_balanced_accuracy_score]
data['metric'] = ['Accuracy Score','ROC AUC','Balanced_Accuracy']
# plot horizontal barplot
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.barplot(x="value", y="metric", data=data)
ax.set(title='Logistic Regression Model Performance') # title barplot
# label each bar in barplot
for p in ax.patches:
    height = p.get_height()  # height of each horizontal bar is the same
    width = p.get_width()    # width of the bar is the metric value
    # add a percentage label at the end of each bar
    ax.text(x=width,                    # x-coordinate: the end of the bar
            y=p.get_y() + height / 2,   # y-coordinate: the middle of the bar
            s='{:.0%}'.format(width),   # label formatted as a whole percentage
            va='center')                # vertical alignment (va) centered
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))
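confusion_matrix is imported above but never used; looking at it shows where the logistic model errs beyond a single accuracy number. A minimal sketch (the heatmap styling choices are illustrative):

# Confusion matrix for the logistic model, laid out as [[TN, FP], [FN, TP]]
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=target_names, yticklabels=target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()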

Explainable Boosting Machine (EBM)

seed =100
ebm = ExplainableBoostingClassifier(random_state=seed,interactions=100, n_estimators = 400, max_tree_splits = 10, n_jobs =3)
ebm.fit(X_train,y_train)
ebm_global = ebm.explain_global(name='EBM')
show(ebm_global)
EBM Global Variable Importance
ebm_local = ebm.explain_local(X_test[:50], y_test[:50], name='EBM')
show(ebm_local)
EBM Local Explanations for Individual Customers
ebm_perf = ROC(ebm.predict_proba).explain_perf(X_test, y_test, name='EBM')
show(ebm_perf)
y_pred = ebm.predict(X_test)
## Predict the probabilities of each class
y_pred_proba = ebm.predict_proba(X_test)
# Traditional Accuracy
EBM_accuracy_score = accuracy_score(y_test, y_pred)
# ROC AUC
EBM_roc_auc_score = roc_auc_score(y_test,y_pred_proba[:, 1])
#Balanced Accuracy
EBM_balanced_accuracy_score = balanced_accuracy_score(y_test, y_pred)
data = pd.DataFrame(columns= ['metric','value'])
data['value'] = [EBM_accuracy_score, EBM_roc_auc_score, EBM_balanced_accuracy_score]
data['metric'] = ['Accuracy Score','ROC AUC','Balanced_Accuracy']
## From: https://medium.com/swlh/quick-guide-to-labelling-data-for-common-seaborn-plots-736e10bf14a9
# plot horizontal barplot
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.barplot(x="value", y="metric", data=data)
ax.set(title='EBM Model Performance') # title barplot
# label each bar in barplot
for p in ax.patches:
    height = p.get_height()  # height of each horizontal bar is the same
    width = p.get_width()    # width of the bar is the metric value
    # add a percentage label at the end of each bar
    ax.text(x=width,                    # x-coordinate: the end of the bar
            y=p.get_y() + height / 2,   # y-coordinate: the middle of the bar
            s='{:.0%}'.format(width),   # label formatted as a whole percentage
            va='center')                # vertical alignment (va) centered
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))
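With both performance explanations in hand, interpret's dashboard can render them side by side; passing a list to show (a documented interpret feature) puts the logistic and EBM ROC curves on one screen for comparison:

# Compare both classifiers in a single interpret dashboard
show([lr_perf, ebm_perf])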

XGBoost

## XGBoost has its own data matrix format that helps speed up computation.
d_train = xgb.DMatrix(X_train, label=y_train)
d_test = xgb.DMatrix(X_test, label=y_test)
## Set up parameters.
params = {
    "eta": 0.005,
    "objective": "binary:logistic",
    "eval_metric": "auc",
    "scale_pos_weight": 3
}
model = xgb.train(params, d_train,
num_boost_round = 5000, verbose_eval=100)
xgb.plot_importance(model)
plt.title("xgboost.plot_importance(model)")
plt.show()
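Training a fixed 5,000 rounds at a small eta is a guess; a hedged alternative lets a held-out set pick the round count via early stopping (the early_stopping_rounds value is illustrative, and ideally the watch set would be a separate validation split rather than the test set):

# Sketch: stop adding trees once test AUC stops improving for 100 rounds
model_es = xgb.train(params, d_train,
                     num_boost_round=5000,
                     evals=[(d_test, "test")],
                     early_stopping_rounds=100,
                     verbose_eval=500)
print("best iteration:", model_es.best_iteration)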
## Workaround for the shap "'utf-8' codec can't decode byte 0xff" issue with xgboost models
## https://github.com/slundberg/shap/issues/1215
model_bytearray = model.save_raw()[4:]
def myfun(self=None):
    return model_bytearray
model.save_raw = myfun
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)
shap.summary_plot(shap_values, X_train)
SHAP Summary Plot
shap.force_plot(explainer.expected_value, shap_values[0,:], X_train.iloc[0,:])
SHAP individual explanation for one customer
shap.dependence_plot("TotalCharges", shap_values, X_train, show=False)
plt.xlim(80, 225)
plt.show()
SHAP Partial Dependence Plot
## binary:logistic outputs probabilities, so predict once and threshold at 0.5 for class labels
y_pred_proba = model.predict(d_test)
y_pred = (y_pred_proba > 0.5).astype(int)
## Traditional Accuracy
XGB_accuracy_score = accuracy_score(y_test, y_pred)
#ROC AUC
XGB_roc_auc_score = roc_auc_score(y_test,y_pred_proba)
## Balanced Accuracy
XGB_balanced_accuracy_score = balanced_accuracy_score(y_test, y_pred)
data = pd.DataFrame(columns= ['metric','value'])
data['value'] = [XGB_accuracy_score,XGB_roc_auc_score,XGB_balanced_accuracy_score]
data['metric'] = ['Accuracy Score','ROC AUC','Balanced_Accuracy']
# plot horizontal barplot
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.barplot(x="value", y="metric", data=data)
ax.set(title='XGBoost Model Performance') # title barplot
# label each bar in barplot
for p in ax.patches:
    height = p.get_height()  # height of each horizontal bar is the same
    width = p.get_width()    # width of the bar is the metric value
    # add a percentage label at the end of each bar
    ax.text(x=width,                    # x-coordinate: the end of the bar
            y=p.get_y() + height / 2,   # y-coordinate: the middle of the bar
            s='{:.0%}'.format(width),   # label formatted as a whole percentage
            va='center')                # vertical alignment (va) centered
target_names = ['class 0', 'class 1']
print(classification_report(y_test, y_pred, target_names=target_names))
XGBoost Classification Report

Kaplan-Meier

## Assign the target variables
T = X_train["tenure"]
E = y_train
T_test = X_test["tenure"]
E_test = y_test
## Boolean churn flags (used for scoring with concordance_index_censored from the sksurv library)
y_train_boolean = y_train.astype(bool)
y_test_boolean = y_test.astype(bool)
kmf = KaplanMeierFitter()
kmf.fit(T, event_observed=E)
kmf.plot()
Kaplan-Meier Curve
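A single pooled curve can hide segment differences; a hedged sketch below overlays one Kaplan-Meier curve per (label-encoded) Contract level so retention can be compared across contract types:

# One Kaplan-Meier curve per contract type (values are the encoded Contract levels)
ax = plt.subplot(111)
for contract_level, grouped in X_train.assign(churned=E).groupby('Contract'):
    kmf_seg = KaplanMeierFitter()
    kmf_seg.fit(grouped['tenure'], event_observed=grouped['churned'],
                label='Contract=%d' % contract_level)
    kmf_seg.plot(ax=ax)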
fig, axes = plt.subplots(2, 2, figsize=(9, 9))
timeline = np.linspace(0, T.max(), 100)  # tenure is measured in months, so span the observed range
wf = WeibullFitter().fit(T, E, label="Weibull", timeline=timeline)
lnf = LogNormalFitter().fit(T, E, label="Log Normal", timeline=timeline)
# plot what we just fit, along with the KMF estimate
kmf.plot_cumulative_density(ax=axes[0][0], ci_show=False)
wf.plot_cumulative_density(ax=axes[0][0], ci_show=False)
qq_plot(wf, ax=axes[0][1])
kmf.plot_cumulative_density(ax=axes[1][0], ci_show=False)
lnf.plot_cumulative_density(ax=axes[1][0], ci_show=False)
qq_plot(lnf, ax=axes[1][1])
best_model, best_aic_ = find_best_parametric_model(T, E)
best_model.plot_hazard()
Best model based on AIC

Cox’s proportional hazards model

X_train['Churn'] = y_train
cph = CoxPHFitter()
cph.fit(X_train, duration_col='tenure', event_col='Churn')
cph.print_summary()
cph.plot()
cph.plot_covariate_groups('Contract', [0, 1, 2], cmap='coolwarm')
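Once fit, the Cox model can score individual customers directly through lifelines' prediction API; a minimal sketch (the five-row slice is arbitrary, and lifelines selects the covariate columns it needs by name):

# Per-customer survival curves and median predicted tenure for a few training rows
sample = X_train.iloc[:5]
cph.predict_survival_function(sample).plot()
print(cph.predict_median(sample))  # tenure at which each profile's survival drops below 50%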

XGBoost with a Cox objective

X_train['Churn'] = y_train
## Mark censored (non-churned) customers with a negative sign, as survival:cox expects
X_train['Churn'] = X_train['Churn'].replace(0, -1)
## Create target variable
y_cox_train = X_train['Churn'] * X_train['tenure']
## Drop churn and tenure to avoid leakage
X_train.drop(['Churn', 'tenure'], axis = 1, inplace =True)
## Convert these to xgb data matrix
xgb_train = xgb.DMatrix(X_train,label=y_cox_train)
X_test['Churn'] = y_test
## Mark censored (non-churned) customers with a negative sign
X_test['Churn'] = X_test['Churn'].replace(0, -1)
## Create target variable
y_cox_test = X_test['Churn'] * X_test['tenure']
## Drop churn and tenure to avoid leakage
X_test.drop(['Churn', 'tenure'], axis=1, inplace=True)
## Convert to an xgb data matrix
xgb_test = xgb.DMatrix(X_test, label=y_cox_test)
# use validation set to choose # of trees
params = {
    "eta": 0.002,
    "max_depth": 3,
    "objective": "survival:cox",
    "subsample": 0.5
}
model_train = xgb.train(params, xgb_train, 10000, evals = [(xgb_test, "test")], verbose_eval=1000)
def c_statistic_harrell(pred, labels):
    ## Harrell's C-statistic: share of comparable pairs the model orders correctly
    total = 0
    matches = 0
    for i in range(len(labels)):
        for j in range(len(labels)):
            if labels[j] > 0 and abs(labels[i]) > labels[j]:
                total += 1
                if pred[j] > pred[i]:
                    matches += 1
    return matches / total
# see how well we can order people by survival
c_statistic_harrell(model_train.predict(xgb_test, ntree_limit=5000), y_cox_test.array)
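For survival:cox, xgboost's predict returns relative risk scores (hazard ratios) rather than calibrated probabilities, so they are best used to rank customers; a hedged sketch pulling the highest-risk test customers:

# Rank test customers by predicted relative risk (higher = expected to churn sooner)
risk = model_train.predict(xgb_test, ntree_limit=5000)
top_risk = X_test.assign(risk=risk).sort_values('risk', ascending=False)
print(top_risk.head(10))  # the ten customers the model flags as most at risk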
shap_values = shap.TreeExplainer(model_train).shap_values(X_train)
shap.summary_plot(shap_values, X_train)
SHAP Summary Plot
