In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
In [68]:
# Load the CSV file (the dataset contains no datetime columns, so no date parsing is needed)
data = pd.read_csv("Telco-Customer-Churn.csv")

# Tag the DataFrame with a display name used by the helper functions below
data._name = 'data'
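
Note: assigning a private attribute like _name works, but pandas can warn about it and the attribute does not survive copies. A more robust alternative (a sketch only; the rest of this notebook keeps the _name convention) is the attrs dict:

In [ ]:
# Sketch: store metadata in df.attrs instead of a private attribute;
# attrs avoids the pandas attribute warning and survives more operations
data.attrs['name'] = 'data'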

Data overview and cleansing

In [69]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df):
    """
    Drop duplicate rows, report on them, and return per-column null percentages
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates == 0:
        print(f"\033[1mThere are no duplicates in the {df._name} DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicate rows in the {df._name} DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nList of null values for each column in percents:\033[0m \n")
    return (df.isnull().sum()/rows)*100

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outliers detection in {df._name}: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Drop constant columns and print a summary of the cleaned dataset
    """
    print(f"\033[1mCleaning of {df._name} DataFrame completed!\033[0m")
    # Drop columns that carry no information (a single unique value)
    for column in df.columns:
        if df[column].nunique() == 1:
            df.drop([column], inplace=True, axis=1)
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [70]:
df_info(data)
First 5 rows of the data:

customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 
 17  PaymentMethod     7043 non-null   object 
 18  MonthlyCharges    7043 non-null   float64
 19  TotalCharges      7043 non-null   object 
 20  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
None
Description of the data:

count mean std min 25% 50% 75% max
SeniorCitizen 7043.0 0.162147 0.368612 0.00 0.0 0.00 0.00 1.00
tenure 7043.0 32.371149 24.559481 0.00 9.0 29.00 55.00 72.00
MonthlyCharges 7043.0 64.761692 30.090047 18.25 35.5 70.35 89.85 118.75
In [71]:
# Convert the TotalCharges column to a float type; blank strings become NaN,
# which is the source of the TotalCharges nulls reported below
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')

# Convert the SeniorCitizen column from 0/1 to Yes/No
data['SeniorCitizen'] = data['SeniorCitizen'].map({0: 'No', 1: 'Yes'})
In [72]:
df_dupl_null(data)
There are no duplicates in the data DataFrame.
 
List of null values for each column as percentages: 

Out[72]:
customerID          0.000000
gender              0.000000
SeniorCitizen       0.000000
Partner             0.000000
Dependents          0.000000
tenure              0.000000
PhoneService        0.000000
MultipleLines       0.000000
InternetService     0.000000
OnlineSecurity      0.000000
OnlineBackup        0.000000
DeviceProtection    0.000000
TechSupport         0.000000
StreamingTV         0.000000
StreamingMovies     0.000000
Contract            0.000000
PaperlessBilling    0.000000
PaymentMethod       0.000000
MonthlyCharges      0.000000
TotalCharges        0.156183
Churn               0.000000
dtype: float64
In [73]:
df_outliers(data)
 Outlier detection in data:  
Out[73]:
column_name num_of_outliers pctg_of_outliers
0 tenure 0 0.0
1 MonthlyCharges 0 0.0
2 TotalCharges 0 0.0
In [74]:
df_summary(data)
Cleaning of data DataFrame completed!
7043 rows and 21 columns remaining.
- customerID: 7043 unique values
- gender: 2 unique values
- SeniorCitizen: 2 unique values
- Partner: 2 unique values
- Dependents: 2 unique values
- tenure: 73 unique values
- PhoneService: 2 unique values
- MultipleLines: 3 unique values
- InternetService: 3 unique values
- OnlineSecurity: 3 unique values
- OnlineBackup: 3 unique values
- DeviceProtection: 3 unique values
- TechSupport: 3 unique values
- StreamingTV: 3 unique values
- StreamingMovies: 3 unique values
- Contract: 3 unique values
- PaperlessBilling: 2 unique values
- PaymentMethod: 4 unique values
- MonthlyCharges: 1585 unique values
- TotalCharges: 6530 unique values
- Churn: 2 unique values

EDA

In [75]:
# Global figure settings
plt.rcParams['figure.figsize'] = (10, 15)
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [78]:
sns.histplot(x='Churn', data=data, hue='Contract', palette='plasma', stat="percent", multiple='dodge', 
             shrink=0.9)
plt.title('Distribution of Contract Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Contract Type', labels=data['Contract'].unique())
plt.subplots_adjust(right=1.2) # Make room for the legend
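
A caveat on relabeling seaborn's legend through plt.legend: the labels are only correct if their order matches the handle order seaborn used. A safer variant (a sketch over the same data) pins the hue order explicitly and keeps seaborn's own legend:

In [ ]:
# Sketch: fix the category order up front so legend labels cannot drift
order = list(data['Contract'].unique())
ax = sns.histplot(x='Churn', data=data, hue='Contract', hue_order=order,
                  palette='plasma', stat='percent', multiple='dodge', shrink=0.9)
sns.move_legend(ax, 'upper right', title='Contract Type')  # requires seaborn >= 0.11.2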
In [79]:
sns.histplot(x='Churn', data=data, hue='PaymentMethod', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Payment Method by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')

# Format the labels: drop the redundant ' (automatic)' suffix
labels = data['PaymentMethod'].str.replace(' (automatic)', '', regex=False).unique()

plt.legend(title='Payment Method', labels=labels)
plt.subplots_adjust(right=1.4)
In [80]:
sns.histplot(x='Churn', data=data, hue='InternetService', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Internet Service Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Internet Type', labels=data['InternetService'].unique())
plt.subplots_adjust(right=1.4) # Make room for the legend
In [81]:
colors = {'No': '#8A2BE2', 'Yes': '#FF6347'}  # BlueViolet and Tomato

sns.displot(x='tenure', data=data, hue='Churn', palette=colors, kind='kde', height=6, aspect=1.5)

# Get mean values for each churn type
tenure_mean = data.groupby('Churn')['tenure'].mean()
# Add mean value lines
for churn_type, color in colors.items():
    plt.axvline(x=tenure_mean[churn_type], color=color, linestyle='--', 
                alpha=0.6, linewidth=2, label=f'Mean Tenure ({churn_type}): {tenure_mean[churn_type]:.1f}')

plt.title('Distribution of Customer Tenure by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Tenure (months)')
plt.ylabel('Density')

plt.grid(axis='both', alpha=0.6, linestyle='--')

# Add annotations for mean values
for churn_type, color in colors.items():
    plt.annotate(f'Mean: {tenure_mean[churn_type]:.1f}', 
                xy=(tenure_mean[churn_type]-1, 0.0001),
                xytext=(0, 10),
                textcoords='offset points',
                color=color,
                fontweight='bold',
                ha='right')

plt.tight_layout()
In [82]:
sns.histplot(x='Churn', data=data, hue='SeniorCitizen', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Senior Citizen Status by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Senior Citizen Status', labels=data['SeniorCitizen'].unique())
Out[82]:
<matplotlib.legend.Legend at 0x175fcc98cd0>
In [83]:
sns.histplot(x='Churn', data=data, hue='gender', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Gender by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Gender', labels=data['gender'].unique())
Out[83]:
<matplotlib.legend.Legend at 0x175fcd854c0>
In [84]:
sns.histplot(x='Churn', data=data, hue='Partner', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Partner Status by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Does the customer have a partner?', labels=data['Partner'].unique())
plt.subplots_adjust(right=1.2) # Make room for the legend

Feature Engineering: preparing the data for model training

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
In [6]:
# Drop the customerID column; identifiers carry no predictive signal
data.drop('customerID', axis=1, inplace=True)

# Convert the target variable 'Churn' to binary type
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})

# Group the columns by type so each group gets the appropriate transformation
binary_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 
               'PhoneService', 'PaperlessBilling']
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod', 'MultipleLines', 
                   'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 
                   'StreamingTV', 'StreamingMovies']
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']

# Split the data into train and test sets
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
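
Because stratify=y was passed, both splits keep the original churn rate; a quick check (illustrative, not part of the original run):

In [ ]:
# Churn is mapped to 0/1 above, so the mean is the churn rate (~0.265 in both splits)
print(f"Train churn rate: {y_train.mean():.3f}")
print(f"Test churn rate:  {y_test.mean():.3f}")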
In [7]:
# Create features
def create_engineered_features(X):
    """Create engineered features outside the pipeline"""
    if not hasattr(X, 'columns'):
        raise ValueError("Input must be a pandas DataFrame")
    
    X_new = X.copy()
    
    # Average monthly charge per month of tenure (+1 avoids division by zero when tenure == 0)
    X_new['revenue_per_tenure'] = X_new['MonthlyCharges'] / (X_new['tenure'] + 1)
    
    # Add tenure group as categorical feature
    tenure_bins = [-1, 12, 24, 36, 48, 60, 72]
    tenure_labels = [0, 1, 2, 3, 4, 5]
    X_new['tenure_group'] = pd.cut(X_new['tenure'], bins=tenure_bins, labels=tenure_labels)
    
    return X_new

# Apply feature engineering before the pipeline so the ColumnTransformer sees the new columns
X_train_with_features = create_engineered_features(X_train)
X_test_with_features = create_engineered_features(X_test)
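
The FunctionTransformer imported earlier offers an alternative: wrapping create_engineered_features turns it into a pipeline stage (a sketch; this notebook keeps the manual pre-pipeline call):

In [ ]:
# Sketch: the same engineering step expressed as a transformer
feature_engineer = FunctionTransformer(create_engineered_features)
# equivalent to create_engineered_features(X_train):
X_train_check = feature_engineer.fit_transform(X_train)
# it could then be prepended to the pipeline, e.g.
# Pipeline([('features', feature_engineer), ('preprocessor', preprocessor), ...])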
In [8]:
# PREPROCESSING PIPELINES

# Handle binary variables (OneHotEncoder with drop='first' yields a single 0/1 column each)
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Handle categorical variables
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])

# Handle numerical variables
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Updated column lists
numerical_cols_updated = numerical_cols + ['revenue_per_tenure']  # Add new numerical feature
categorical_cols_updated = categorical_cols + ['tenure_group']    # Add new categorical feature

# Combine all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', binary_transformer, binary_cols),
        ('categorical', categorical_transformer, categorical_cols_updated),
        ('numerical', numerical_transformer, numerical_cols_updated)
    ],
    remainder='passthrough'  # All columns are covered by the transformers above; kept as a safeguard
)

# Create main pipeline with feature selection and PCA
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95))
])
In [9]:
# Apply pipeline to training data
X_train_processed = pipeline.fit_transform(X_train_with_features, y_train)
X_test_processed = pipeline.transform(X_test_with_features)

# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)

# Print summary of data dimensions
print(f"Original training data dimensions: {X_train.shape}")
print(f"Processed training data dimensions: {X_train_processed.shape}")
print(f"Balanced training data dimensions: {X_train_balanced.shape}")
print(f"Original class distribution: {pd.Series(y_train).value_counts(normalize=True)}")
print(f"Balanced class distribution: {pd.Series(y_train_balanced).value_counts(normalize=True)}")
Original training data dimensions: (5634, 19)
Processed training data dimensions: (5634, 6)
Balanced training data dimensions: (8278, 6)
Original class distribution: 0    0.734647
1    0.265353
Name: Churn, dtype: float64
Balanced class distribution: 0    0.5
1    0.5
Name: Churn, dtype: float64
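
One caveat: the grid searches later in this notebook cross-validate on the SMOTE-balanced array, so synthetic samples can end up in validation folds. A common remedy (a sketch, assuming imblearn's pipeline, which applies samplers only while fitting) is to fold SMOTE into the pipeline itself:

In [ ]:
# Sketch: an imblearn pipeline resamples inside each CV training fold only
from imblearn.pipeline import Pipeline as ImbPipeline

balanced_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
    # a classifier would typically follow here as the final step
])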
In [ ]:
# Data is now ready for model training
# X_train_balanced, y_train_balanced - balanced data for training
# X_test_processed, y_test - test data for evaluation
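
To see which features actually survived SelectKBest, the fitted pipeline can be inspected (a sketch; assumes scikit-learn >= 1.1 so every step supports get_feature_names_out):

In [ ]:
# Sketch: map the selection mask back to post-encoding feature names
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
mask = pipeline.named_steps['feature_selection'].get_support()
print("Features kept by SelectKBest:", list(feature_names[mask]))
print("PCA components retained:", pipeline.named_steps['pca'].n_components_)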

Train Models

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
In [20]:
def evaluate_model(y_test, y_pred, model_name):
    """
    Comprehensive model evaluation function that displays:
    - Confusion matrix (raw numbers)
    - Classification report (precision, recall, F1-score)
    - Overall accuracy score
    
    Parameters:
    y_test (array): True labels from test set
    y_pred (array): Predicted labels from model
    model_name (str): Name of the model for display purposes
    """
    print(f"📊 Performance of {model_name}")
    
    # Display confusion matrix - shows true vs predicted classifications
    print(f"\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    
    # Display detailed classification metrics for each class
    print(f"\nClassification Report:\n")
    print(classification_report(y_test, y_pred))
    
    # Display the overall accuracy as a proportion
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    
    print("-" * 40)

Logistic Regression

In [30]:
# Initialize and train Logistic Regression model
# max_iter=1000 ensures the algorithm has enough iterations to converge
log_model = LogisticRegression(max_iter=1000, penalty='l2')
log_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained logistic regression model
y_pred_log = log_model.predict(X_test_processed)
y_pred_log_proba = log_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_log, "Logistic Regression")
📊 Performance of Logistic Regression

Confusion Matrix:
[[750 285]
 [ 92 282]]

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.72      0.80      1035
           1       0.50      0.75      0.60       374

    accuracy                           0.73      1409
   macro avg       0.69      0.74      0.70      1409
weighted avg       0.79      0.73      0.75      1409

Accuracy: 0.73
----------------------------------------

Random Forest

In [29]:
# Initialize and train Random Forest model
# random_state=42 ensures reproducible results across runs
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained random forest model
y_pred_rf = rf_model.predict(X_test_processed)
y_pred_rf_proba = rf_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_rf, "Random Forest")
📊 Performance of Random Forest

Confusion Matrix:
[[821 214]
 [149 225]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.51      0.60      0.55       374

    accuracy                           0.74      1409
   macro avg       0.68      0.70      0.69      1409
weighted avg       0.76      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------

XGBoost

In [28]:
# Initialize and train XGBoost model
# random_state=42 ensures reproducible results across runs;
# the deprecated use_label_encoder flag is omitted, since recent XGBoost versions ignore it
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Generate predictions using the trained XGBoost model
y_pred_xgb = xgb_model.predict(X_test_processed)
y_pred_xgb_proba = xgb_model.predict_proba(X_test_processed)[:, 1]

# Evaluate model
evaluate_model(y_test, y_pred_xgb, "XGBoost")
📊 Performance of XGBoost

Confusion Matrix:
[[778 257]
 [104 270]]

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      1035
           1       0.51      0.72      0.60       374

    accuracy                           0.74      1409
   macro avg       0.70      0.74      0.71      1409
weighted avg       0.78      0.74      0.76      1409

Accuracy: 0.74
----------------------------------------

Compare Models

In [38]:
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score, precision_score, recall_score
In [34]:
roc_log = roc_auc_score(y_test, y_pred_log_proba)
roc_rf = roc_auc_score(y_test, y_pred_rf_proba)
roc_xgb = roc_auc_score(y_test, y_pred_xgb_proba)

print(f"Logistic Regression ROC-AUC: {roc_log:.3f}")
print(f"Random Forest ROC-AUC: {roc_rf:.3f}")
print(f"XGBoost ROC-AUC: {roc_xgb:.3f}")

print("Logistic Reggresion has the highest, hence, the best ROC-AUC score.")
Logistic Regression ROC-AUC: 0.829
Random Forest ROC-AUC: 0.781
XGBoost ROC-AUC: 0.809
Logistic Regression has the highest ROC-AUC score, making it the best performer by this metric.
In [36]:
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb_proba)


plt.figure(figsize=(10, 6))
plt.plot(fpr_log, tpr_log, label=f"Logistic Regression (AUC = {roc_log:.2f})")
plt.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC = {roc_rf:.2f})")
plt.plot(fpr_xgb, tpr_xgb, label=f"XGBoost (AUC = {roc_xgb:.2f})")
plt.plot([0, 1], [0, 1], "k--")  # diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()
In [37]:
prec_log, recall_log, _ = precision_recall_curve(y_test, y_pred_log_proba)
prec_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_rf_proba)
prec_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_pred_xgb_proba)


plt.figure(figsize=(8, 6))
plt.plot(recall_log, prec_log, label="Logistic Regression")
plt.plot(recall_rf, prec_rf, label="Random Forest")
plt.plot(recall_xgb, prec_xgb, label="XGBoost")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()
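
The precision-recall curves also point to a practical follow-up: predict uses a fixed 0.5 cutoff, but a threshold can be chosen from the curve instead (an illustrative sketch for the logistic model):

In [ ]:
# Sketch: pick the probability threshold that maximizes F1 on the PR curve
prec, rec, thr = precision_recall_curve(y_test, y_pred_log_proba)
f1_scores = 2 * prec[:-1] * rec[:-1] / (prec[:-1] + rec[:-1] + 1e-12)
best_threshold = thr[np.argmax(f1_scores)]
print(f"Threshold maximizing F1: {best_threshold:.2f} (F1 = {f1_scores.max():.3f})")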
In [39]:
summary = {
    "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
    "Accuracy": [accuracy_score(y_test, y_pred_log), accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_xgb)],
    "Precision": [precision_score(y_test, y_pred_log), precision_score(y_test, y_pred_rf), precision_score(y_test, y_pred_xgb)],
    "Recall": [recall_score(y_test, y_pred_log), recall_score(y_test, y_pred_rf), recall_score(y_test, y_pred_xgb)],
    "F1 Score": [f1_score(y_test, y_pred_log), f1_score(y_test, y_pred_rf), f1_score(y_test, y_pred_xgb)],
    "ROC-AUC": [roc_log, roc_rf, roc_xgb]
}

results_df = pd.DataFrame(summary)
print(results_df)
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0  Logistic Regression  0.732434   0.497354  0.754011  0.599362  0.829067
1        Random Forest  0.742370   0.512528  0.601604  0.553506  0.781343
2              XGBoost  0.743790   0.512334  0.721925  0.599334  0.809206

Hyperparameter Tuning

In [40]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
In [41]:
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 8]
}

grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='f1')
grid_rf.fit(X_train_balanced, y_train_balanced)

print("Best parameters:", grid_rf.best_params_)
best_rf = grid_rf.best_estimator_

y_pred_best_rf = best_rf.predict(X_test_processed)
evaluate_model(y_test, y_pred_best_rf, "Tuned Random Forest")
print("No significant changes in performance.")
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}
📊 Performance of Tuned Random Forest

Confusion Matrix:
[[817 218]
 [139 235]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.52      0.63      0.57       374

    accuracy                           0.75      1409
   macro avg       0.69      0.71      0.69      1409
weighted avg       0.77      0.75      0.75      1409

Accuracy: 0.75
----------------------------------------
No significant changes in performance.
In [42]:
param_grid_xgb = {
    'n_estimators': [300, 400, 500],             
    'max_depth': [11, 13, 15],                 
    'learning_rate': [0.01, 0.1, 0.2],     
    'subsample': [0.4, 0.6, 0.8],                 
    'colsample_bytree': [0.6, 0.8, 1]            
}

xgb = XGBClassifier(eval_metric='logloss', random_state=42)

grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)

grid_xgb.fit(X_train_balanced, y_train_balanced)

print("Best Parameters:", grid_xgb.best_params_)

best_xgb = grid_xgb.best_estimator_

y_pred_best_xgb = best_xgb.predict(X_test_processed)
y_proba_best_xgb = best_xgb.predict_proba(X_test_processed)[:, 1]

evaluate_model(y_test, y_pred_best_xgb, "Tuned XGBoost")

print("No significant changes in performance")
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 400, 'subsample': 0.8}
📊 Performance of Tuned XGBoost

Confusion Matrix:
[[795 240]
 [124 250]]

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.77      0.81      1035
           1       0.51      0.67      0.58       374

    accuracy                           0.74      1409
   macro avg       0.69      0.72      0.70      1409
weighted avg       0.77      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------
No significant changes in performance.
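
The exhaustive grid above required 729 fits; a cheaper alternative (a sketch, sampling the same grid rather than enumerating it) is RandomizedSearchCV:

In [ ]:
# Sketch: randomized search samples candidates instead of trying all 243
from sklearn.model_selection import RandomizedSearchCV

rand_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=30,           # 30 sampled candidates instead of 243
    scoring='accuracy',
    cv=3,
    random_state=42,
    n_jobs=-1,
)
# rand_xgb.fit(X_train_balanced, y_train_balanced)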
In [ ]: