import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
# Load the CSV file
data = pd.read_csv("Telco-Customer-Churn.csv")  # the dataset has no date columns, so parse_dates is not needed
data._name = 'data'  # ad hoc attribute used by the helper functions below for display
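Setting a private attribute like `_name` works, but it can be silently dropped by pandas operations; a sketch of a more durable alternative uses the public `DataFrame.attrs` dict (the helpers below would then read `df.attrs['name']` instead of `df._name`):
# Alternative sketch: keep the display name in pandas' public metadata slot.
# Note: attrs is still experimental and its propagation varies across pandas versions.
data.attrs['name'] = 'data'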
def df_info(df):
    """Display basic information about the dataset."""
    pd.set_option('display.max_columns', None)  # Show all columns of the dataset
    print("\033[1mFirst 5 rows of the data:\n\033[0m")
    display(df.head(5))
    print("\033[1mInfo about the data:\n\033[0m")
    df.info()  # info() prints directly; wrapping it in display() would only show None
    print("\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)
def df_dupl_null(df):
    """
    Handle duplicates, report them, and return the percentage of null values per column.
    """
    # Check for duplicate rows
    duplicates = df[df.duplicated()].shape[0]
    if duplicates == 0:
        print(f"\033[1mThere are no duplicates in the {df._name} DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicate rows in the {df._name} DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    # Check null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print("\033[1m\nList of null values for each column, in percent:\033[0m\n")
    return (df.isnull().sum() / rows) * 100
def df_outliers(df):
    """
    Detect outliers in the numeric columns of df using the 1.5 * IQR rule
    and return the counts as a DataFrame.
    """
    print(f"\033[1mOutlier detection in {df._name}:\033[0m")
    # Get numeric columns, excluding any column with 'id' in its name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns
                    if 'id' not in col.lower()]
    outliers_list = []
    # IQR-based outlier check for each numeric column
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
    rows = df.shape[0]
    # Collect the results in a DataFrame; round after converting to percent
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )
    return outliers_df
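For reference, the same 1.5 * IQR rule can be written in vectorized form; a minimal sketch, assuming it runs after the type conversions below so that all numeric columns are detected:
# Vectorized IQR outlier counts per numeric column (excluding id-like columns)
num = data.select_dtypes(include='number').loc[:, lambda d: ~d.columns.str.lower().str.contains('id')]
q1, q3 = num.quantile(0.25), num.quantile(0.75)
iqr = q3 - q1
outlier_mask = (num < q1 - 1.5 * iqr) | (num > q3 + 1.5 * iqr)
print(outlier_mask.sum())  # outliers per column; should match the counts from df_outliers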
def df_summary(df):
    """
    Print a summary of the cleaned dataset and drop constant columns
    (a column with a single unique value carries no information).
    """
    print(f"\033[1mCleaning of {df._name} DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        if df[column].nunique() == 1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
df_info(data)
First 5 rows of the data:
 | customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 7590-VHVEG | Female | 0 | Yes | No | 1 | No | No phone service | DSL | No | Yes | No | No | No | No | Month-to-month | Yes | Electronic check | 29.85 | 29.85 | No |
1 | 5575-GNVDE | Male | 0 | No | No | 34 | Yes | No | DSL | Yes | No | Yes | No | No | No | One year | No | Mailed check | 56.95 | 1889.5 | No |
2 | 3668-QPYBK | Male | 0 | No | No | 2 | Yes | No | DSL | Yes | Yes | No | No | No | No | Month-to-month | Yes | Mailed check | 53.85 | 108.15 | Yes |
3 | 7795-CFOCW | Male | 0 | No | No | 45 | No | No phone service | DSL | Yes | No | Yes | Yes | No | No | One year | No | Bank transfer (automatic) | 42.30 | 1840.75 | No |
4 | 9237-HQITU | Female | 0 | No | No | 2 | Yes | No | Fiber optic | No | No | No | No | No | No | Month-to-month | Yes | Electronic check | 70.70 | 151.65 | Yes |
Info about the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 customerID 7043 non-null object
1 gender 7043 non-null object
2 SeniorCitizen 7043 non-null int64
3 Partner 7043 non-null object
4 Dependents 7043 non-null object
5 tenure 7043 non-null int64
6 PhoneService 7043 non-null object
7 MultipleLines 7043 non-null object
8 InternetService 7043 non-null object
9 OnlineSecurity 7043 non-null object
10 OnlineBackup 7043 non-null object
11 DeviceProtection 7043 non-null object
12 TechSupport 7043 non-null object
13 StreamingTV 7043 non-null object
14 StreamingMovies 7043 non-null object
15 Contract 7043 non-null object
16 PaperlessBilling 7043 non-null object
17 PaymentMethod 7043 non-null object
18 MonthlyCharges 7043 non-null float64
19 TotalCharges 7043 non-null object
20 Churn 7043 non-null object
dtypes: float64(1), int64(2), object(18)
memory usage: 1.1+ MB
Description of the data:
 | count | mean | std | min | 25% | 50% | 75% | max |
---|---|---|---|---|---|---|---|---|
SeniorCitizen | 7043.0 | 0.162147 | 0.368612 | 0.00 | 0.0 | 0.00 | 0.00 | 1.00 |
tenure | 7043.0 | 32.371149 | 24.559481 | 0.00 | 9.0 | 29.00 | 55.00 | 72.00 |
MonthlyCharges | 7043.0 | 64.761692 | 30.090047 | 18.25 | 35.5 | 70.35 | 89.85 | 118.75 |
# Convert the TotalCharges column to a float type
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Convert the SeniorCitizen column to Yes/No values
def convert(value):
    if value == 0:
        return 'No'
    else:
        return 'Yes'
data['SeniorCitizen'] = data['SeniorCitizen'].apply(convert)
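An equivalent one-liner uses Series.map; a quick sketch on a throwaway Series shows it gives the same result as apply(convert):
# data['SeniorCitizen'] = data['SeniorCitizen'].map({0: 'No', 1: 'Yes'})  # equivalent alternative
print(pd.Series([0, 1, 0]).map({0: 'No', 1: 'Yes'}).tolist())  # ['No', 'Yes', 'No']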
df_dupl_null(data)
There are no duplicates in the data DataFrame.

List of null values for each column, in percent:

customerID          0.000000
gender              0.000000
SeniorCitizen       0.000000
Partner             0.000000
Dependents          0.000000
tenure              0.000000
PhoneService        0.000000
MultipleLines       0.000000
InternetService     0.000000
OnlineSecurity      0.000000
OnlineBackup        0.000000
DeviceProtection    0.000000
TechSupport         0.000000
StreamingTV         0.000000
StreamingMovies     0.000000
Contract            0.000000
PaperlessBilling    0.000000
PaymentMethod       0.000000
MonthlyCharges      0.000000
TotalCharges        0.156183
Churn               0.000000
dtype: float64
df_outliers(data)
Outlier detection in data:
 | column_name | num_of_outliers | pctg_of_outliers |
---|---|---|---|
0 | tenure | 0 | 0.0 |
1 | MonthlyCharges | 0 | 0.0 |
2 | TotalCharges | 0 | 0.0 |
df_summary(data)
Cleaning of data DataFrame completed!
7043 rows and 21 columns remaining.
- customerID: 7043 unique values
- gender: 2 unique values
- SeniorCitizen: 2 unique values
- Partner: 2 unique values
- Dependents: 2 unique values
- tenure: 73 unique values
- PhoneService: 2 unique values
- MultipleLines: 3 unique values
- InternetService: 3 unique values
- OnlineSecurity: 3 unique values
- OnlineBackup: 3 unique values
- DeviceProtection: 3 unique values
- TechSupport: 3 unique values
- StreamingTV: 3 unique values
- StreamingMovies: 3 unique values
- Contract: 3 unique values
- PaperlessBilling: 2 unique values
- PaymentMethod: 4 unique values
- MonthlyCharges: 1585 unique values
- TotalCharges: 6530 unique values
- Churn: 2 unique values
# Global figure settings (the stray plt.figure call only produced an empty figure)
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
sns.histplot(x='Churn', data=data, hue='Contract', palette='plasma', stat="percent", multiple='dodge',
             shrink=0.9)
plt.title('Distribution of the Contract Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
# Note: the label order passed to plt.legend must match seaborn's hue order,
# otherwise the legend entries are silently mislabeled
plt.legend(title='Contract Type', labels=data['Contract'].unique())
plt.subplots_adjust(right=1.2)  # Make room for the legend
sns.histplot(x='Churn', data=data, hue='PaymentMethod', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of the Payment Method by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
# Format the labels: drop the " (automatic)" suffix
def cut_string(s):
    if ' (automatic)' in s:
        return s.replace(' (automatic)', '')
    else:
        return s
labels = data['PaymentMethod'].apply(cut_string).unique()
plt.legend(title='Payment Method', labels=labels)
plt.subplots_adjust(right=1.4)
sns.histplot(x='Churn', data=data, hue='InternetService', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of the Internet Service Type by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Internet Type', labels=data['InternetService'].unique())
plt.subplots_adjust(right=1.4) # Make room for the legend
colors = {'No': '#8A2BE2', 'Yes': '#FF6347'} # BlueViolet and Tomato
sns.displot(x='tenure', data=data, hue='Churn', palette=colors, kind='kde', height=6, aspect=1.5)
# Get mean values for each churn type
tenure_mean = data.groupby('Churn')['tenure'].mean()
# Add mean value lines
for churn_type, color in colors.items():
    plt.axvline(x=tenure_mean[churn_type], color=color, linestyle='--',
                alpha=0.6, linewidth=2, label=f'Mean Tenure ({churn_type}): {tenure_mean[churn_type]:.1f}')
plt.title('Distribution of Customer Tenure by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Tenure (months)')
plt.ylabel('Density')
plt.grid(axis='both', alpha=0.6, linestyle='--')
# Add annotations for mean values
for churn_type, color in colors.items():
    plt.annotate(f'Mean: {tenure_mean[churn_type]:.1f}',
                 xy=(tenure_mean[churn_type] - 1, 0.0001),
                 xytext=(0, 10),
                 textcoords='offset points',
                 color=color,
                 fontweight='bold',
                 ha='right')
plt.tight_layout()
sns.histplot(x='Churn', data=data, hue='SeniorCitizen', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of Senior Citizen Status by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Senior Citizen Status', labels=data['SeniorCitizen'].unique())
sns.histplot(x='Churn', data=data, hue='gender', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of the Gender by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Gender', labels=data['gender'].unique())
sns.histplot(x='Churn', data=data, hue='Partner', palette='plasma', stat="percent", multiple='dodge', shrink=0.7)
plt.title('Distribution of the Partnership by Churn Status', fontsize=18, fontweight='bold')
plt.xlabel('Churn')
plt.ylabel('Percentage')
plt.legend(title='Does the customer have a partner?', labels=data['Partner'].unique())
plt.subplots_adjust(right=1.2) # Make room for the legend
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, TransformerMixin
# Drop the customerID column, which is irrelevant for model training
data.drop('customerID', axis=1, inplace=True)
# Convert the target variable 'Churn' to binary type
data['Churn'] = data['Churn'].map({'Yes': 1, 'No': 0})
# Split columns on different categories, to apply proper transformations
binary_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents',
               'PhoneService', 'PaperlessBilling']
categorical_cols = ['InternetService', 'Contract', 'PaymentMethod', 'MultipleLines',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
                    'StreamingTV', 'StreamingMovies']
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
# Split the data into train and test sets
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Feature engineering
def create_engineered_features(X):
    """Create engineered features outside the pipeline."""
    if not hasattr(X, 'columns'):
        raise ValueError("Input must be a pandas DataFrame")
    X_new = X.copy()
    # Add revenue-per-tenure feature; +1 avoids division by zero for brand-new customers
    X_new['revenue_per_tenure'] = X_new['MonthlyCharges'] / (X_new['tenure'] + 1)
    # Add tenure group as a categorical feature (12-month buckets)
    tenure_bins = [-1, 12, 24, 36, 48, 60, 72]
    tenure_labels = [0, 1, 2, 3, 4, 5]
    X_new['tenure_group'] = pd.cut(X_new['tenure'], bins=tenure_bins, labels=tenure_labels)
    return X_new
# Apply feature engineering before the pipeline so the ColumnTransformer sees the new columns
X_train_with_features = create_engineered_features(X_train)
X_test_with_features = create_engineered_features(X_test)
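A quick standalone sanity check of the bin edges: pd.cut intervals are open on the left, so the leading edge of -1 is what keeps tenure 0 inside group 0 instead of becoming NaN:
print(pd.cut(pd.Series([0, 12, 13, 72]), bins=[-1, 12, 24, 36, 48, 60, 72], labels=list(range(6))).tolist())
# -> [0, 0, 1, 5]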
# PREPROCESSING PIPELINES
# Handle binary variables: impute, then one-hot encode (drop='first' keeps one column per binary feature)
binary_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])
# Handle categorical variables
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first', sparse_output=False))
])
# Handle numerical variables
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])
# Updated column lists
numerical_cols_updated = numerical_cols + ['revenue_per_tenure'] # Add new numerical feature
categorical_cols_updated = categorical_cols + ['tenure_group'] # Add new categorical feature
# Combine all transformers
preprocessor = ColumnTransformer(
    transformers=[
        ('binary', binary_transformer, binary_cols),
        ('categorical', categorical_transformer, categorical_cols_updated),
        ('numerical', numerical_transformer, numerical_cols_updated)
    ],
    remainder='passthrough'  # every column is already covered above; kept as a safeguard
)
# Create main pipeline with feature selection and PCA
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95))
])
# Apply pipeline to training data
X_train_processed = pipeline.fit_transform(X_train_with_features, y_train)
X_test_processed = pipeline.transform(X_test_with_features)
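Once the pipeline is fitted, it is worth checking which encoded features SelectKBest actually kept before PCA mixes them together; a sketch using the scikit-learn >= 1.0 introspection API:
# Names of the encoded features that survived feature selection
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()
kept = pipeline.named_steps['feature_selection'].get_support()
print(feature_names[kept])  # the 15 inputs that feed the PCA step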
# Apply SMOTE for class balancing
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_processed, y_train)
# Print summary of data dimensions
print(f"Original training data dimensions: {X_train.shape}")
print(f"Processed training data dimensions: {X_train_processed.shape}")
print(f"Balanced training data dimensions: {X_train_balanced.shape}")
print(f"Original class distribution: {pd.Series(y_train).value_counts(normalize=True)}")
print(f"Balanced class distribution: {pd.Series(y_train_balanced).value_counts(normalize=True)}")
Original training data dimensions: (5634, 19)
Processed training data dimensions: (5634, 6)
Balanced training data dimensions: (8278, 6)
Original class distribution: 0    0.734647
1    0.265353
Name: Churn, dtype: float64
Balanced class distribution: 0    0.5
1    0.5
Name: Churn, dtype: float64
# Data is now ready for model training
# X_train_balanced, y_train_balanced - balanced data for training
# X_test_processed, y_test - test data for evaluation
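Note that SMOTE is fitted on the processed training split only, so no synthetic points leak into the test set. If cross-validation were added later, a sketch with imblearn's own Pipeline would keep the resampling inside each fold automatically (samplers run during fit only):
from imblearn.pipeline import Pipeline as ImbPipeline
resampling_pipeline = ImbPipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectKBest(f_classif, k=15)),
    ('pca', PCA(n_components=0.95)),
    ('smote', SMOTE(random_state=42)),
])
# fit_resample applies the transformers, then resamples with the final step
X_bal, y_bal = resampling_pipeline.fit_resample(X_train_with_features, y_train)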
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from xgboost import XGBClassifier
def evaluate_model(y_test, y_pred, model_name):
    """
    Comprehensive model evaluation that displays:
    - Confusion matrix (raw counts)
    - Classification report (precision, recall, F1-score)
    - Overall accuracy score

    Parameters:
        y_test (array): True labels from the test set
        y_pred (array): Predicted labels from the model
        model_name (str): Name of the model, for display purposes
    """
    print(f"📊 Performance of {model_name}")
    # Confusion matrix: true vs. predicted classifications
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    # Detailed classification metrics for each class
    print("\nClassification Report:\n")
    print(classification_report(y_test, y_pred))
    # Overall accuracy
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
    print("-" * 40)
# Initialize and train Logistic Regression model
# max_iter=1000 ensures the algorithm has enough iterations to converge
log_model = LogisticRegression(max_iter=1000, penalty='l2')
log_model.fit(X_train_balanced, y_train_balanced)
# Generate predictions using the trained logistic regression model
y_pred_log = log_model.predict(X_test_processed)
y_pred_log_proba = log_model.predict_proba(X_test_processed)[:, 1]
# Evaluate model
evaluate_model(y_test, y_pred_log, "Logistic Regression")
📊 Performance of Logistic Regression

Confusion Matrix:
[[750 285]
 [ 92 282]]

Classification Report:

              precision    recall  f1-score   support

           0       0.89      0.72      0.80      1035
           1       0.50      0.75      0.60       374

    accuracy                           0.73      1409
   macro avg       0.69      0.74      0.70      1409
weighted avg       0.79      0.73      0.75      1409

Accuracy: 0.73
----------------------------------------
# Initialize and train Random Forest model
# random_state=42 ensures reproducible results across runs
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_balanced, y_train_balanced)
# Generate predictions using the trained random forest model
y_pred_rf = rf_model.predict(X_test_processed)
y_pred_rf_proba = rf_model.predict_proba(X_test_processed)[:, 1]
# Evaluate model
evaluate_model(y_test, y_pred_rf, "Random Forest")
📊 Performance of Random Forest

Confusion Matrix:
[[821 214]
 [149 225]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.51      0.60      0.55       374

    accuracy                           0.74      1409
   macro avg       0.68      0.70      0.69      1409
weighted avg       0.76      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------
# Initialize and train XGBoost model
# random_state=42 ensures reproducible results across runs
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)  # use_label_encoder dropped: recent XGBoost ignores it
xgb_model.fit(X_train_balanced, y_train_balanced)
# Generate predictions using the trained XGBoost model
y_pred_xgb = xgb_model.predict(X_test_processed)
y_pred_xgb_proba = xgb_model.predict_proba(X_test_processed)[:, 1]
# Evaluate model
evaluate_model(y_test, y_pred_xgb, "XGBoost")
📊 Performance of XGBoost

Confusion Matrix:
[[778 257]
 [104 270]]

Classification Report:

              precision    recall  f1-score   support

           0       0.88      0.75      0.81      1035
           1       0.51      0.72      0.60       374

    accuracy                           0.74      1409
   macro avg       0.70      0.74      0.71      1409
weighted avg       0.78      0.74      0.76      1409

Accuracy: 0.74
----------------------------------------
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score, precision_score, recall_score
roc_log = roc_auc_score(y_test, y_pred_log_proba)
roc_rf = roc_auc_score(y_test, y_pred_rf_proba)
roc_xgb = roc_auc_score(y_test, y_pred_xgb_proba)
print(f"Logistic Regression ROC-AUC: {roc_log:.3f}")
print(f"Random Forest ROC-AUC: {roc_rf:.3f}")
print(f"XGBoost ROC-AUC: {roc_xgb:.3f}")
print("Logistic Reggresion has the highest, hence, the best ROC-AUC score.")
Logistic Regression ROC-AUC: 0.829 Random Forest ROC-AUC: 0.781 XGBoost ROC-AUC: 0.809 Logistic Reggresion has the highest, hence, the best ROC_AUC score.
fpr_log, tpr_log, _ = roc_curve(y_test, y_pred_log_proba)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_rf_proba)
fpr_xgb, tpr_xgb, _ = roc_curve(y_test, y_pred_xgb_proba)
plt.figure(figsize=(10, 6))
plt.plot(fpr_log, tpr_log, label=f"Logistic Regression (AUC = {roc_log:.2f})")
plt.plot(fpr_rf, tpr_rf, label=f"Random Forest (AUC = {roc_rf:.2f})")
plt.plot(fpr_xgb, tpr_xgb, label=f"XGBoost (AUC = {roc_xgb:.2f})")
plt.plot([0, 1], [0, 1], "k--") # diagonal line
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend()
plt.grid()
plt.show()
prec_log, recall_log, _ = precision_recall_curve(y_test, y_pred_log_proba)
prec_rf, recall_rf, _ = precision_recall_curve(y_test, y_pred_rf_proba)
prec_xgb, recall_xgb, _ = precision_recall_curve(y_test, y_pred_xgb_proba)
plt.figure(figsize=(8, 6))
plt.plot(recall_log, prec_log, label="Logistic Regression")
plt.plot(recall_rf, prec_rf, label="Random Forest")
plt.plot(recall_xgb, prec_xgb, label="XGBoost")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision-Recall Curve")
plt.legend()
plt.grid()
plt.show()
summary = {
    "Model": ["Logistic Regression", "Random Forest", "XGBoost"],
    "Accuracy": [accuracy_score(y_test, y_pred_log), accuracy_score(y_test, y_pred_rf), accuracy_score(y_test, y_pred_xgb)],
    "Precision": [precision_score(y_test, y_pred_log), precision_score(y_test, y_pred_rf), precision_score(y_test, y_pred_xgb)],
    "Recall": [recall_score(y_test, y_pred_log), recall_score(y_test, y_pred_rf), recall_score(y_test, y_pred_xgb)],
    "F1 Score": [f1_score(y_test, y_pred_log), f1_score(y_test, y_pred_rf), f1_score(y_test, y_pred_xgb)],
    "ROC-AUC": [roc_log, roc_rf, roc_xgb]
}
results_df = pd.DataFrame(summary)  # pandas is already imported above
print(results_df)
                 Model  Accuracy  Precision    Recall  F1 Score   ROC-AUC
0  Logistic Regression  0.732434   0.497354  0.754011  0.599362  0.829067
1        Random Forest  0.742370   0.512528  0.601604  0.553506  0.781343
2              XGBoost  0.743790   0.512334  0.721925  0.599334  0.809206
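For a quicker read of the comparison, the table can be sorted by the threshold-independent metric:
print(results_df.sort_values('ROC-AUC', ascending=False).round(3))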
from sklearn.model_selection import GridSearchCV  # XGBClassifier is already imported above
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 8]
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='f1')
grid_rf.fit(X_train_balanced, y_train_balanced)
print("Best parameters:", grid_rf.best_params_)
best_rf = grid_rf.best_estimator_
y_pred_best_rf = best_rf.predict(X_test_processed)
evaluate_model(y_test, y_pred_best_rf, "Tuned Random Forest")
print("No significant changes in performance.")
Best parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 300}
📊 Performance of Tuned Random Forest

Confusion Matrix:
[[817 218]
 [139 235]]

Classification Report:

              precision    recall  f1-score   support

           0       0.85      0.79      0.82      1035
           1       0.52      0.63      0.57       374

    accuracy                           0.75      1409
   macro avg       0.69      0.71      0.69      1409
weighted avg       0.77      0.75      0.75      1409

Accuracy: 0.75
----------------------------------------
No significant changes in performance.
param_grid_xgb = {
    'n_estimators': [300, 400, 500],
    'max_depth': [11, 13, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.4, 0.6, 0.8],
    'colsample_bytree': [0.6, 0.8, 1]
}
xgb = XGBClassifier(eval_metric='logloss', random_state=42)  # use_label_encoder dropped: recent XGBoost ignores it
grid_xgb = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid_xgb,
    scoring='accuracy',
    cv=3,
    verbose=1,
    n_jobs=-1
)
grid_xgb.fit(X_train_balanced, y_train_balanced)
print("Best Parameters:", grid_xgb.best_params_)
best_xgb = grid_xgb.best_estimator_
y_pred_best_xgb = best_xgb.predict(X_test_processed)
y_proba_best_xgb = best_xgb.predict_proba(X_test_processed)[:, 1]
evaluate_model(y_test, y_pred_best_xgb, "Tuned XGBoost")
print("No significant changes in performance")
Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best Parameters: {'colsample_bytree': 1, 'learning_rate': 0.01, 'max_depth': 15, 'n_estimators': 400, 'subsample': 0.8}
📊 Performance of Tuned XGBoost

Confusion Matrix:
[[795 240]
 [124 250]]

Classification Report:

              precision    recall  f1-score   support

           0       0.87      0.77      0.81      1035
           1       0.51      0.67      0.58       374

    accuracy                           0.74      1409
   macro avg       0.69      0.72      0.70      1409
weighted avg       0.77      0.74      0.75      1409

Accuracy: 0.74
----------------------------------------
No significant changes in performance.
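The grid above costs 729 fits; a hedged alternative is RandomizedSearchCV, which samples a fixed budget of candidates from the same space (the n_iter budget and the f1 scoring choice below are assumptions, not settings from the original run):
from sklearn.model_selection import RandomizedSearchCV
rand_xgb = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_grid_xgb,
    n_iter=50,      # assumed budget: 50 of the 243 combinations
    scoring='f1',   # assumption: F1 tracks the minority (churn) class better than accuracy
    cv=3,
    random_state=42,
    n_jobs=-1,
)
rand_xgb.fit(X_train_balanced, y_train_balanced)
print("Best parameters:", rand_xgb.best_params_)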