# development.py
import pandas as pd
import numpy as np
pd.set_option('display.float_format', lambda x: '%.3f' % x)
#pd.reset_option('display.float_format')
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from joblib import dump
# Load raw data from CSV file
raw_df = pd.read_csv("Telco-Customer-Churn.csv")
raw_df.columns
###############################################################################
########################## Exploratory Data Analysis ##########################
###############################################################################
########################### Minor data preprocessing ##########################
# Drop customerID (identifier column, not predictive)
raw_df.drop('customerID', axis=1, inplace=True)
#_______________________
# Replace blank (whitespace-only) values with 0 in the TotalCharges column
raw_df['TotalCharges'] = raw_df['TotalCharges'].replace(' ', 0)
# Convert TotalCharges-column to float
raw_df['TotalCharges'] = raw_df['TotalCharges'].astype(float)
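# Alternative (a sketch, not part of the original pipeline): pd.to_numeric with
# errors='coerce' handles any non-numeric value, not just the blank strings:
# raw_df['TotalCharges'] = pd.to_numeric(raw_df['TotalCharges'], errors='coerce').fillna(0)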
#_______________________
# Convert SeniorCitizen-column from 0/1 to Yes/No
raw_df['SeniorCitizen'] = raw_df['SeniorCitizen'].map({0: 'No', 1: 'Yes'})
############################## Data Exploration ###############################
# Check balance of the target variable
raw_df['Churn'].value_counts()
raw_df['Churn'].value_counts(normalize=True) * 100
#____________________________ Categorical Columns _____________________________
# List of columns to create pie charts for
categorical_columns = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
                       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']
# Loop to create pie charts for categorical columns
for column in categorical_columns:
    # Calculate value counts
    value_counts = raw_df[column].value_counts()
    # Plot pie chart
    plt.figure(figsize=(8, 6))
    plt.pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=90,
            colors=plt.cm.Paired.colors, textprops={'fontsize': 16})
    plt.title(f'Pie Chart for {column}', fontsize=20)
    plt.axis('equal')  # Equal aspect ratio ensures that the pie is drawn as a circle.
    plt.show()
#_____________________________ Continuous Columns _____________________________
# Check descriptive statistics for continuous columns
descriptive_stats = raw_df[['tenure','MonthlyCharges', 'TotalCharges']].describe()
print(descriptive_stats)
# List of columns relevant to plot
continuous_columns = ['tenure','MonthlyCharges', 'TotalCharges']
# Create histograms for each column
plt.figure(figsize=(15, 20))
for i, column in enumerate(continuous_columns, 1):
    plt.subplot(len(continuous_columns), 1, i)
    plt.hist(raw_df[column], bins=50, edgecolor='k')
    plt.title(f'Histogram of {column}', fontsize=20)
    plt.xlabel(column)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
###############################################################################
##################### Feature Engineering / Preprocessing #####################
###############################################################################
#_________________________ Convert binary columns _____________________________
# Function to perform binary encoding on columns
def convert_column_to_binary(raw_df, column, one_value):
    raw_df[column] = (raw_df[column] == one_value).astype(int)
# Dict with all binary columns and the corresponding values that should be encoded as 1
binary_columns = {'gender': 'Male',
                  'Partner': 'Yes',
                  'Dependents': 'Yes',
                  'PhoneService': 'Yes',
                  'PaperlessBilling': 'Yes',
                  'Churn': 'Yes',
                  'SeniorCitizen': 'Yes'}
# Loop through binary columns
for column, one_value in binary_columns.items():
    convert_column_to_binary(raw_df, column, one_value)
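# Optional sanity check (illustrative addition, not in the original script):
# every mapped column should now contain only 0/1 values.
# assert all(set(raw_df[col].unique()) <= {0, 1} for col in binary_columns)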
#_____________________ One-hot-encode multinomial columns _____________________
# Count number of each category in a column to determine which to exclude
raw_df['PaymentMethod'].value_counts()
# Function to one-hot-encode columns and create dummy variables
def get_dummy_variables(raw_df, column, drop_value):
    # One-hot-encode the column
    raw_df = pd.get_dummies(raw_df, columns=[column])
    # Remove one dummy variable to avoid multicollinearity
    raw_df = raw_df.drop([drop_value], axis=1)
    return raw_df
# Dict with all multinomial columns and the category which should be removed
dummy_columns = {'MultipleLines': 'MultipleLines_No phone service',
                 'InternetService': 'InternetService_No',
                 'OnlineSecurity': 'OnlineSecurity_No internet service',
                 'OnlineBackup': 'OnlineBackup_No internet service',
                 'DeviceProtection': 'DeviceProtection_No internet service',
                 'TechSupport': 'TechSupport_No internet service',
                 'StreamingMovies': 'StreamingMovies_No internet service',
                 'StreamingTV': 'StreamingTV_No internet service',
                 'Contract': 'Contract_One year',
                 'PaymentMethod': 'PaymentMethod_Credit card (automatic)'}
# Loop through multinomial columns to apply one-hot encoding
for column, drop_value in dummy_columns.items():
    raw_df = get_dummy_variables(raw_df, column, drop_value)
# Convert all boolean columns to binary (1/0)
bool_columns = raw_df.select_dtypes(include='bool').columns
raw_df[bool_columns] = raw_df[bool_columns].astype(int)
###############################################################################
######################## Outliers and Multicollinearity #######################
###############################################################################
#_______________________ Check and handle outliers ____________________________
# Check descriptive statistics for continuous columns
descriptive_stats = raw_df[['tenure','MonthlyCharges', 'TotalCharges']].describe()
print(descriptive_stats)
# List of columns relevant to plot
continuous_columns = ['tenure','MonthlyCharges', 'TotalCharges']
# Create histograms for each column
plt.figure(figsize=(15, 20))
for i, column in enumerate(continuous_columns, 1):
    plt.subplot(len(continuous_columns), 1, i)
    plt.hist(raw_df[column], bins=50, edgecolor='k')
    plt.title(f'Histogram of {column}')
    plt.xlabel(column)
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
# Cap outliers at the 90th percentile (one-sided winsorization)
for column in ['tenure', 'MonthlyCharges', 'TotalCharges']:
    threshold = raw_df[column].quantile(0.90)
    raw_df.loc[raw_df[column] > threshold, column] = threshold
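# An IQR-based alternative (a sketch; the 1.5*IQR fence is a common convention,
# not something the original script uses):
# for column in ['tenure', 'MonthlyCharges', 'TotalCharges']:
#     q1, q3 = raw_df[column].quantile([0.25, 0.75])
#     upper_fence = q3 + 1.5 * (q3 - q1)
#     raw_df.loc[raw_df[column] > upper_fence, column] = upper_fence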
# Check descriptive statistics for continuous columns
descriptive_stats = raw_df[['tenure','MonthlyCharges', 'TotalCharges']].describe()
print(descriptive_stats)
#_____________________ Check and handle multicollinearity _____________________
# Plot correlation heatmap
plt.figure(figsize=(14, 10))
sns.heatmap(raw_df.drop(columns=['Churn']).corr(method='pearson'),
            vmin=-1, vmax=1, annot=True, cmap='coolwarm')
plt.title('Correlation Heatmap', fontsize=14, weight='bold');
# Calculate the correlation matrix
correlation_matrix = raw_df.drop(columns=['Churn']).corr(method='pearson')
# Create a mask to get only the upper triangle of the matrix
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
# Apply the mask to the correlation matrix
upper_triangle = correlation_matrix.where(mask)
# Find all pairs with correlation greater than 0.7
high_correlation_pairs = []
for column in upper_triangle.columns:
    for row in upper_triangle.index:
        if upper_triangle.loc[row, column] > 0.7 and row != column:
            high_correlation_pairs.append((row, column, upper_triangle.loc[row, column]))
# Print highly correlated pairs
print(high_correlation_pairs)
# Drop one column from each highly correlated pair
raw_df.drop(['TotalCharges', 'InternetService_Fiber optic'], axis=1, inplace=True)
###############################################################################
############################## Logistic Regression ############################
###############################################################################
# Copy dataframe
ml_df = raw_df.copy()
# Separate features and target
X = ml_df.drop('Churn', axis=1)
y = ml_df['Churn']
# Identify continuous columns
continuous_cols = ['tenure', 'MonthlyCharges']
# Standardize the continuous variables
scaler = StandardScaler()
X[continuous_cols] = scaler.fit_transform(X[continuous_cols])
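# Note: fitting the scaler on the full dataset before splitting leaks test-set
# statistics into training. A leakage-free variant (sketch) would split first,
# then fit on the training portion only:
# scaler = StandardScaler().fit(X_train[continuous_cols])
# X_train[continuous_cols] = scaler.transform(X_train[continuous_cols])
# X_test[continuous_cols] = scaler.transform(X_test[continuous_cols])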
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)
#__________________________ Initial model creation ____________________________
# Initialize LogisticRegression model
model = LogisticRegression(max_iter=500,
                           class_weight='balanced')  # Reweight classes to offset the imbalanced target
# Fit the model on X_train and y_train
model.fit(X_train, y_train)
# Predict on test data
y_preds = model.predict(X_test)
# Confusion matrix
cm = confusion_matrix(y_test, y_preds)
# Plot the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['retained', 'churned'])
disp.plot()
# Print classification report
print(classification_report(y_test, y_preds, target_names=['retained', 'churned']))
#_________________ Hyperparameter tuning with cross-validation ________________
# Define the hyperparameter grid
param_grid = [
    {
        'C': [0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 10, 20, 30, 50, 80, 100],  # Regularization strength
        'penalty': ['l1', 'l2'],  # Regularization type
        'solver': ['liblinear']  # Solver that supports L1 and L2 regularization
    },
    {
        'C': [0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 10, 20, 30, 50, 80, 100],  # Regularization strength
        'penalty': ['l2'],  # Regularization type
        'solver': ['lbfgs', 'newton-cg', 'newton-cholesky', 'sag']  # Solvers that support only L2 regularization
    },
    {
        'C': [0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 10, 20, 30, 50, 80, 100],  # Regularization strength
        'penalty': ['elasticnet'],  # Regularization type
        'solver': ['saga'],  # Solver that supports ElasticNet regularization
        'l1_ratio': [0, 0.5, 1]  # ElasticNet mixing parameter
    },
    {
        'C': [0.01, 0.1, 0.15, 0.2, 0.3, 0.4, 0.5, 10, 20, 30, 50, 80, 100],  # Regularization strength
        'penalty': ['l1', 'l2'],  # Regularization type
        'solver': ['saga']  # Solver that supports both L1 and L2 regularization
    }
]
# Initialize Logistic Regression model
model = LogisticRegression(max_iter=2000, class_weight='balanced')
# Set up GridSearchCV
grid_search = GridSearchCV(model, param_grid, cv=StratifiedKFold(n_splits=5), scoring='recall', n_jobs=-1)
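# Recall is used as the scoring metric to prioritize catching churners
# (i.e., minimizing false negatives), at the cost of more false positives.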
# Fit GridSearchCV on the training data
grid_search.fit(X_train, y_train)
# Get the best model
best_model = grid_search.best_estimator_
# Print best hyperparameters
print(f"Best Hyperparameters: {grid_search.best_params_}")
# Perform cross-validation on the whole dataset with the best model
cv_scores = cross_val_score(best_model, X, y, cv=StratifiedKFold(n_splits=5), scoring='recall')
# Print cross-validation scores
print(f"Cross-Validation Recall Scores: {cv_scores}")
print(f"Average Cross-Validation Recall Score: {cv_scores.mean()}")
#___________________________ Final model evaluation ___________________________
# Predict on test data using the best model
final_y_preds = best_model.predict(X_test)
# Confusion matrix for final model
final_cm = confusion_matrix(y_test, final_y_preds)
# Plot the confusion matrix
final_disp = ConfusionMatrixDisplay(confusion_matrix=final_cm, display_labels=['retained', 'churned'])
final_disp.plot()
# Print classification report
print("Final Model Classification Report:")
print(classification_report(y_test, final_y_preds, target_names=['retained', 'churned']))
#___________________________ Intercept and coefficients _______________________
# Calculate the baseline odds from the model intercept
odds = np.exp(best_model.intercept_)
# Convert odds to the probability of churn when all predictors are 0
# (for the standardized continuous columns, 0 corresponds to their mean)
probability = odds / (1 + odds)
print(probability)
# Get coefficients for all the variables
model_coefficients = pd.DataFrame(best_model.coef_[0], index=X.columns)
model_coefficients = model_coefficients.sort_values(by=0, ascending=False)
print(model_coefficients)
# Plot coefficients
plt.figure(figsize=(10, 8))
model_coefficients[0].plot(kind='barh', color='blue')
plt.xlabel('Coefficient Value')
plt.ylabel('Predictor Variable')
plt.title('Logistic Regression Coefficients')
# Keep only the coefficients that are exactly zero (features effectively dropped, e.g. by L1 regularization)
null_coefficients = model_coefficients.loc[model_coefficients[0] == 0]
print(null_coefficients)
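# Illustrative addition (not in the original script): exponentiating the
# coefficients gives odds ratios, i.e. the multiplicative change in churn odds
# per one-unit increase in each predictor.
odds_ratios = np.exp(model_coefficients[0]).sort_values(ascending=False)
print(odds_ratios)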
# Save the model to a joblib file
dump(best_model, 'best_model.joblib')
# Save the scaler to a joblib file
dump(scaler, 'scaler.joblib')
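# Example of reloading the saved artifacts later (a sketch; same file names as above):
# from joblib import load
# best_model = load('best_model.joblib')
# scaler = load('scaler.joblib')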