In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
In [68]:
# Load the CSV file (the dataset contains no datetime columns, so no date parsing is needed)
data = pd.read_csv("Telco-Customer-Churn.csv")

# Tag the DataFrame with a display name used by the helper functions below
data._name = 'data'
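
Note: assigning a private attribute like _name works, but pandas can warn about it and the attribute does not survive copies. A more robust alternative (a sketch only; the rest of this notebook keeps the _name convention) is the attrs dict:

In [ ]:
# Sketch: store metadata in df.attrs instead of a private attribute;
# attrs avoids the pandas attribute warning and survives more operations
data.attrs['name'] = 'data'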

Data overview and cleansing

In [69]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df):
    """
    Drop duplicate rows, report on them, and return per-column null percentages
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates == 0:
        print(f"\033[1mThere are no duplicates in the {df._name} DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicate rows in the {df._name} DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nList of null values for each column in percents:\033[0m \n")
    return (df.isnull().sum()/rows)*100

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outliers detection in {df._name}: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Drop constant columns and print a summary of the cleaned dataset
    """
    print(f"\033[1mCleaning of {df._name} DataFrame completed!\033[0m")
    # Drop columns that carry no information (a single unique value)
    for column in df.columns:
        if df[column].nunique() == 1:
            df.drop([column], inplace=True, axis=1)
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [70]:
df_info(data)
First 5 rows of the data:

customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
None
Description of the data:

count mean std min 25% 50% 75% max
SeniorCitizen 7043.0 0.162147 0.368612 0.00 0.0 0.00 0.00 1.00
tenure 7043.0 32.371149 24.559481 0.00 9.0 29.00 55.00 72.00
MonthlyCharges 7043.0 64.761692 30.090047 18.25 35.5 70.35 89.85 118.75
In [71]:
# Convert the TotalCharges column to a float type; blank strings become NaN,
# which is the source of the TotalCharges nulls reported below
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Convert the SeniorCitizen column from 0/1 to Yes/No
data['SeniorCitizen'] = data['SeniorCitizen'].map({0: 'No', 1: 'Yes'})
In [72]:
df_dupl_null(data)
There are no duplicates in the data DataFrame.
 
List of null values for each column as percentages: 

Out[72]:
customerID          0.000000
gender              0.000000
SeniorCitizen       0.000000
Partner             0.000000
Dependents          0.000000
tenure              0.000000
PhoneService        0.000000
MultipleLines       0.000000
InternetService     0.000000
OnlineSecurity      0.000000
OnlineBackup        0.000000
DeviceProtection    0.000000
TechSupport         0.000000
StreamingTV         0.000000
StreamingMovies     0.000000
Contract            0.000000
PaperlessBilling    0.000000
PaymentMethod       0.000000
MonthlyCharges      0.000000
TotalCharges        0.156183
Churn               0.000000
dtype: float64
In [73]:
df_outliers(data)
 Outlier detection in data:  
Out[73]:
column_name num_of_outliers pctg_of_outliers
0 tenure 0 0.0
1 MonthlyCharges 0 0.0
2 TotalCharges 0 0.0
In [74]:
df_summary(data)
Cleaning of data DataFrame completed!
7043 rows and 21 columns remaining.
- customerID: 7043 unique values
- gender: 2 unique values
- SeniorCitizen: 2 unique values
- Partner: 2 unique values
- Dependents: 2 unique values
- tenure: 73 unique values
- PhoneService: 2 unique values
- MultipleLines: 3 unique values
- InternetService: 3 unique values
- OnlineSecurity: 3 unique values
- OnlineBackup: 3 unique values
- DeviceProtection: 3 unique values
- TechSupport: 3 unique values
- StreamingTV: 3 unique values
- StreamingMovies: 3 unique values
- Contract: 3 unique values
- PaperlessBilling: 2 unique values
- PaymentMethod: 4 unique values
- MonthlyCharges: 1585 unique values
- TotalCharges: 6530 unique values
- Churn: 2 unique values

EDA

In [75]:
# Global figure settings
plt.rcParams['figure.figsize'] = (10, 15)
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [78]:
sns.histplot(x='Churn', data=data, hue='Contract', palette='plasma', stat="percent", multiple='dodge', 
             shrink=0.9)
plt.title('Distribution of Contract Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Contract Type', labels=data['Contract'].unique())
plt.subplots_adjust(right=1.2) # Make room for the legend
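
A caveat on relabeling seaborn's legend through plt.legend: the labels are only correct if their order matches the handle order seaborn used. A safer variant (a sketch over the same data) pins the hue order explicitly and keeps seaborn's own legend:

In [ ]:
# Sketch: fix the category order up front so legend labels cannot drift
order = list(data['Contract'].unique())
ax = sns.histplot(x='Churn', data=data, hue='Contract', hue_order=order,
                  palette='plasma', stat='percent', multiple='dodge', shrink=0.9)
sns.move_legend(ax, 'upper right', title='Contract Type')  # requires seaborn >= 0.11.2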
In [79]:
sns.histplot(x='Churn', data=data, hue='PaymentMethod', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Payment Method by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')

# Format the labels: drop the redundant ' (automatic)' suffix
labels = data['PaymentMethod'].str.replace(' (automatic)', '', regex=False).unique()

plt.legend(title='Payment Method', labels=labels)
plt.subplots_adjust(right=1.4)
In [80]:
sns.histplot(x='Churn', data=data, hue='InternetService', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Internet Service Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Internet Type', labels=data['InternetService'].unique())
plt.subplots_adjust(right=1.4) # Make room for the legend
In [81]:
colors = {'No': '#8A2BE2', 'Yes': '#FF6347'}  # BlueViolet and Tomato

sns.displot(x='tenure', data=data, hue='Churn', palette=colors, kind='kde', height=6, aspect=1.5)

# Get mean values for each churn type
tenure_mean = data.groupby('Churn')['tenure'].mean()
# Add mean value lines
for churn_type, color in colors.items():
    plt.axvline(x=tenure_mean[churn_type], color=color, linestyle='--', 
                alpha=0.6, linewidth=2, label=f'Mean Tenure ({churn_type}): {tenure_mean[churn_type]:.1f}')

plt.title('Distribution of Customer Tenure by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Tenure (months)')
plt.ylabel('Density')

plt.grid(axis='both', alpha=0.6, linestyle='--')

# Add annotations for mean values
for churn_type, color in colors.items():
    plt.annotate(f'Mean: {tenure_mean[churn_type]:.1f}', 
                xy=(tenure_mean[churn_type]-1, 0.0001),
                xytext=(0, 10),
                textcoords='offset points',
                color=color,
                fontweight='bold',
                ha='right')

plt.tight_layout()
In [82]:
sns.histplot(x='Churn', data=data, hue='SeniorCitizen', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Senior Citizen Status by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Senior Citizen Status', labels=data['SeniorCitizen'].unique())
Out[82]:
<matplotlib.legend.Legend at 0x175fcc98cd0>
In [83]:
sns.histplot(x='Churn', data=data, hue='gender', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Gender by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Gender', labels=data['gender'].unique())
Out[83]:
<matplotlib.legend.Legend at 0x175fcd854c0>
In [84]:
sns.histplot(x='Churn', data=data, hue='Partner', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Partner Status by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Does the customer have a partner?', labels=data['Partner'].unique())
plt.subplots_adjust(right=1.2) # Make room for the legend

Feature Engineering: preparing the data for model training

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
In [6]:
# Drop the customerID column; identifiers carry no predictive signal
data.drop('customerID', axis=1, inplace=True)

# Convert the target variable 'Churn' to binary type
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Group the columns by type so each group gets the appropriate transformation
binary_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
               'PhoneService', 'PaperlessBilling']
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod', 'MultipleLines', 
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                   'StreamingTV', 'StreamingMovies']
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Split the data into train and test sets
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
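
Because stratify=y was passed, both splits keep the original churn rate; a quick check (illustrative, not part of the original run):

In [ ]:
# Churn is mapped to 0/1 above, so the mean is the churn rate (~0.265 in both splits)
print(f"Train churn rate: {y_train.mean():.3f}")
print(f"Test churn rate:  {y_test.mean():.3f}")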
In [7]:
# Create features
def create_engineered_features(X):
    """Create engineered features outside the pipeline"""
    if not hasattr(X, 'columns'):
        raise ValueError("Input must be a pandas DataFrame")
    
    X_new = X.copy()
    
    # Average monthly charge per month of tenure (+1 avoids division by zero when tenure == 0)
    X_new['revenue_per_tenure'] = X_new['MonthlyCharges'] / (X_new['tenure'] + 1)
    
    # Add tenure group as categorical feature
    tenure_bins = [-1, 12, 24, 36, 48, 60, 72]
    tenure_labels = [0, 1, 2, 3, 4, 5]
    X_new['tenure_group'] = pd.cut(X_new['tenure'], bins=tenure_bins, labels=tenure_labels)
    
    return X_new

# Apply feature engineering before the pipeline so the ColumnTransformer sees the new columns
X_train_with_features = create_engineered_features(X_train)
X_test_with_features = create_engineered_features(X_test)
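
The FunctionTransformer imported earlier offers an alternative: wrapping create_engineered_features turns it into a pipeline stage (a sketch; this notebook keeps the manual pre-pipeline call):

In [ ]:
# Sketch: the same engineering step expressed as a transformer
feature_engineer = FunctionTransformer(create_engineered_features)
# equivalent to create_engineered_features(X_train):
X_train_check = feature_engineer.fit_transform(X_train)
# it could then be prepended to the pipeline, e.g.
# Pipeline([('features', feature_engineer), ('preprocessor', preprocessor), ...])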
In [8]:
# PREPROCESSING PIPELINES

# Handle binary variables (OneHotEncoder with drop='first' yields a single 0/1 column each)
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Handle categorical variables
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Handle numerical variables
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Updated column lists
numerical_cols_updated = numerical_cols + ['revenue_per_tenure']  # Add new numerical feature
categorical_cols_updated = categorical_cols + ['tenure_group']    # Add new categorical feature

# Combine all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', binary_transformer, binary_cols),
        ('categorical', categorical_transformer, categorical_cols_updated),
        ('numerical', numerical_transformer, numerical_cols_updated)
    ],
    remainder='passthrough'  # All columns are covered by the transformers above; kept as a safeguard
)

# Create main pipeline with feature selection and PCA
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95))
])
In [9]:
# Apply pipeline to training data
X_train_processed = pipeline.fit_transform(X_train_with_features, y_train)
X_test_processed = pipeline.transform(X_test_with_features)

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

# Print summary of data dimensions
print(f"Original training data dimensions: {X_train.shape}")
print(f"Processed training data dimensions: {X_train_processed.shape}")
print(f"Balanced training data dimensions: {X_train_balanced.shape}")
print(f"Original class distribution: {pd.Series(y_train).value_counts(normalize=True)}")
print(f"Balanced class distribution: {pd.Series(y_train_balanced).value_counts(normalize=True)}")
Original training data dimensions: (5634, 19)
Processed training data dimensions: (5634, 6)
Balanced training data dimensions: (8278, 6)
Original class distribution: 0    0.734647
1    0.265353
Name: Churn, dtype: float64
Balanced class distribution: 0    0.5
1    0.5
Name: Churn, dtype: float64
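
One caveat: the grid searches later in this notebook cross-validate on the SMOTE-balanced array, so synthetic samples can end up in validation folds. A common remedy (a sketch, assuming imblearn's pipeline, which applies samplers only while fitting) is to fold SMOTE into the pipeline itself:

In [ ]:
# Sketch: an imblearn pipeline resamples inside each CV training fold only
from imblearn.pipeline import Pipeline as ImbPipeline

balanced_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
    # a classifier would typically follow here as the final step
])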
In [ ]:
# Data is now ready for model training
# X_train_balanced, y_train_balanced - balanced data for training
# X_test_processed, y_test - test data for evaluation
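
To see which features actually survived SelectKBest, the fitted pipeline can be inspected (a sketch; assumes scikit-learn >= 1.1 so every step supports get_feature_names_out):

In [ ]:
# Sketch: map the selection mask back to post-encoding feature names
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
mask = pipeline.named_steps['feature_selection'].get_support()
print("Features kept by SelectKBest:", list(feature_names[mask]))
print("PCA components retained:", pipeline.named_steps['pca'].n_components_)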

Train Models

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
In [20]:
def evaluate_model(y_test, y_pred, model_name):
    """
    Comprehensive model evaluation function that displays:
    - Confusion matrix (raw numbers)
    - Classification report (precision, recall, F1-score)
    - Overall accuracy score
    
    Parameters:
    y_test (array): True labels from test set
    y_pred (array): Predicted labels from model
    model_name (str): Name of the model for display purposes
    """
    print(f"📊 Performance of {model_name}")
    
    # Display confusion matrix - shows true vs predicted classifications
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Display detailed classification metrics for each class
    print(f"\nClassification Report:\n")
    print(classification_report(y_test, y_pred))
    
    # Display the overall accuracy as a proportion
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    
    print("-" * 40)

Logistic Regression

In [30]:
# Initialize and train Logistic Regression model
# max_iter=1000 ensures the algorithm has enough iterations to converge
log_model = LogisticRegression(max_iter=1000, penalty='l2')
log_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained logistic regression model
y_pred_log = log_model.predict(X_test_processed)
y_pred_log_proba = log_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_log, "Logistic Regression")
📊 Performance of Logistic Regression

Confusion Matrix:
[[750 285]
 [ 92 282]]

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.72      0.80      1035
           1       0.50      0.75      0.60       374

    accuracy                           0.73      1409
   macro avg       0.69      0.74      0.70      1409
weighted avg       0.79      0.73      0.75      1409

Accuracy: 0.73
----------------------------------------

Random Forest

In [29]:
# Initialize and train Random Forest model
# random_state=42 ensures reproducible results across runs
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained random forest model
y_pred_rf = rf_model.predict(X_test_processed)
y_pred_rf_proba = rf_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_rf, "Random Forest")
📊 Performance of Random Forest

Confusion Matrix:
[[821 214]
 [149 225]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.51      0.60      0.55       374

    accuracy                           0.74      1409
   macro avg       0.68      0.70      0.69      1409
weighted avg       0.76      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------

XGBoost

In [28]:
# Initialize and train XGBoost model
# random_state=42 ensures reproducible results across runs;
# the deprecated use_label_encoder flag is omitted, since recent XGBoost versions ignore it
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained XGBoost model
y_pred_xgb = xgb_model.predict(X_test_processed)
y_pred_xgb_proba = xgb_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_xgb, "XGBoost")
📊 Performance of XGBoost

Confusion Matrix:
[[778 257]
 [104 270]]

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      1035
           1       0.51      0.72      0.60       374

    accuracy                           0.74      1409
   macro avg       0.70      0.74      0.71      1409
weighted avg       0.78      0.74      0.76      1409

Accuracy: 0.74
----------------------------------------

Compare Models

In [38]:
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score, precision_score, recall_score
In [34]:
roc_log = roc_auc_score(y_test, y_pred_log_proba)
roc_rf = roc_auc_score(y_test, y_pred_rf_proba)
roc_xgb = roc_auc_score(y_test, y_pred_xgb_proba)

print(f"Logistic Regression ROC-AUC: {roc_log:.3f}")
print(f"Random Forest ROC-AUC: {roc_rf:.3f}")
print(f"XGBoost ROC-AUC: {roc_xgb:.3f}")

print("Logistic Reggresion has the highest, hence, the best ROC-AUC score.")
Logistic Regression ROC-AUC: 0.829
Random Forest ROC-AUC: 0.781
XGBoost ROC-AUC: 0.809
Logistic Regression has the highest ROC-AUC score, making it the best performer by this metric.
In [36]:
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb_proba)


plt.figure(figsize=(10, 6))
plt.plot(fpr_log, tpr_log, label=f"Logistic Regression (AUC = {roc_log:.2f})")
plt.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC = {roc_rf:.2f})")
plt.plot(fpr_xgb, tpr_xgb, label=f"XGBoost (AUC = {roc_xgb:.2f})")
plt.plot([0, 1], [0, 1], "k--")  # diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()
In [37]:
prec_log, recall_log, _ = precision_recall_curve(y_test, y_pred_log_proba)
prec_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_rf_proba)
prec_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_pred_xgb_proba)


plt.figure(figsize=(8, 6))
plt.plot(recall_log, prec_log, label="Logistic Regression")
plt.plot(recall_rf, prec_rf, label="Random Forest")
plt.plot(recall_xgb, prec_xgb, label="XGBoost")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()
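
The precision-recall curves also point to a practical follow-up: predict uses a fixed 0.5 cutoff, but a threshold can be chosen from the curve instead (an illustrative sketch for the logistic model):

In [ ]:
# Sketch: pick the probability threshold that maximizes F1 on the PR curve
prec, rec, thr = precision_recall_curve(y_test, y_pred_log_proba)
f1_scores = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
best_threshold = thr[np.argmax(f1_scores)]
print(f"Threshold maximizing F1: {best_threshold:.2f} (F1 = {f1_scores.max():.3f})")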
In [39]:
summary = {
    "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
    "Accuracy": [accuracy_score(y_test, y_pred_log), accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_xgb)],
    "Precision": [precision_score(y_test, y_pred_log), precision_score(y_test, y_pred_rf), precision_score(y_test, y_pred_xgb)],
    "Recall": [recall_score(y_test, y_pred_log), recall_score(y_test, y_pred_rf), recall_score(y_test, y_pred_xgb)],
    "F1 Score": [f1_score(y_test, y_pred_log), f1_score(y_test, y_pred_rf), f1_score(y_test, y_pred_xgb)],
    "ROC-AUC": [roc_log, roc_rf, roc_xgb]
}

results_df = pd.DataFrame(summary)
print(results_df)
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0  Logistic Regression  0.732434   0.497354  0.754011  0.599362  0.829067
1        Random Forest  0.742370   0.512528  0.601604  0.553506  0.781343
2              XGBoost  0.743790   0.512334  0.721925  0.599334  0.809206

Hyperparameter Tuning

In [40]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
In [41]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 8]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='f1')
grid_rf.fit(X_train_balanced, y_train_balanced)

print("Best parameters:", grid_rf.best_params_)
best_rf = grid_rf.best_estimator_

y_pred_best_rf = best_rf.predict(X_test_processed)
evaluate_model(y_test, y_pred_best_rf, "Tuned Random Forest")
print("No significant changes in performance.")
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}
📊 Performance of Tuned Random Forest

Confusion Matrix:
[[817 218]
 [139 235]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.52      0.63      0.57       374

    accuracy                           0.75      1409
   macro avg       0.69      0.71      0.69      1409
weighted avg       0.77      0.75      0.75      1409

Accuracy: 0.75
----------------------------------------
No significant changes in performance.
In [42]:
param_grid_xgb = {
    'n_estimators': [300, 400, 500],             
    'max_depth': [11, 13, 15],                 
    'learning_rate': [0.01, 0.1, 0.2],     
    'subsample': [0.4, 0.6, 0.8],                 
    'colsample_bytree': [0.6, 0.8, 1]            
}

xgb = XGBClassifier(eval_metric='logloss', random_state=42)

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train_balanced, y_train_balanced)

print("Best Parameters:", grid_xgb.best_params_)

best_xgb = grid_xgb.best_estimator_

y_pred_best_xgb = best_xgb.predict(X_test_processed)
y_proba_best_xgb = best_xgb.predict_proba(X_test_processed)[:, 1]

evaluate_model(y_test, y_pred_best_xgb, "Tuned XGBoost")

print("No significant changes in performance")
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 400, 'subsample': 0.8}
📊 Performance of Tuned XGBoost

Confusion Matrix:
[[795 240]
 [124 250]]

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.77      0.81      1035
           1       0.51      0.67      0.58       374

    accuracy                           0.74      1409
   macro avg       0.69      0.72      0.70      1409
weighted avg       0.77      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------
No significant changes in performance.
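
The exhaustive grid above required 729 fits; a cheaper alternative (a sketch, sampling the same grid rather than enumerating it) is RandomizedSearchCV:

In [ ]:
# Sketch: randomized search samples candidates instead of trying all 243
from sklearn.model_selection import RandomizedSearchCV

rand_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=30,           # 30 sampled candidates instead of 243
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
# rand_xgb.fit(X_train_balanced, y_train_balanced)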
In [ ]: