In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load CSV file
data = pd.read_csv("StudentsPerformance.csv")

Info about the data and cleaning

In [3]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df, drop=True):
    """
    Handling the duplicates and print info about them and null values 
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates==0:
        print(f"\033[1mThere are no duplicates in the DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicates in the DataFrame.\033[0m")
        if drop==True:
            df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
            print("Duplicates dropped!")
        else:
            print('Duplicates kept.')
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nList of null values for each column in percents:\033[0m \n")
    return (df.isnull().sum()/rows)*100

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outliers detection: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out/rows*100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Print a summary of the cleaned dataset and drop columns with a single unique value
    """
    print(f"\033[1mCleaning of the DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        if df[column].nunique()==1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [4]:
df_info(data)
display(df_dupl_null(data))
display(df_outliers(data))
df_summary(data)
5 head rows of the data:

gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
Description of the data:

count mean std min 25% 50% 75% max
math score 1000.0 66.089 15.163080 0.0 57.00 66.0 77.0 100.0
reading score 1000.0 69.169 14.600192 17.0 59.00 70.0 79.0 100.0
writing score 1000.0 68.054 15.195657 10.0 57.75 69.0 79.0 100.0
There are no duplicates in the DataFrame.
 
List of null values for each column (% of rows): 

 Outliers detection:  
Cleaning of the DataFrame completed!
1000 rows and 8 columns remaining.
- gender: 2 unique values
- race/ethnicity: 5 unique values
- parental level of education: 6 unique values
- lunch: 2 unique values
- test preparation course: 2 unique values
- math score: 81 unique values
- reading score: 72 unique values
- writing score: 77 unique values

EDA

In [5]:
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [6]:
# Setting numerical labels for features
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Gender: female -> 0, male -> 1
le.fit(['female', 'male'])
data['gender'] = le.transform(data['gender'])

# Race/ethnicity: group A -> 0, group B -> 1, ..., group E -> 4
le.fit(['group A', 'group B', 'group C', 'group D', 'group E'])
data['race/ethnicity'] = le.transform(data['race/ethnicity'])

# Parental level of education: 
# Custom mapping
education_mapping = {
    'some high school': 0,
    'high school': 0,
    "associate's degree": 1,
    'some college': 2,
    "bachelor's degree": 3,
    "master's degree": 4
}
data['parental level of education'] = data['parental level of education'].map(education_mapping)

# Lunch: free/reduced -> 0, standard -> 1
le.fit(['free/reduced', 'standard'])
data['lunch'] = le.transform(data['lunch'])

# Test preparation course: completed -> 0, none -> 1 (LabelEncoder orders labels alphabetically)
le.fit(['none', 'completed'])
data['test preparation course'] = le.transform(data['test preparation course'])
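
Since LabelEncoder assigns integer codes by sorting the class labels alphabetically, the short sketch below (not part of the original notebook) re-fits the encoder per column and prints each learned mapping so the encodings above can be double-checked.

# Hedged sanity check: print the mapping LabelEncoder actually learns for each column.
for col, labels in [('gender', ['female', 'male']),
                    ('lunch', ['free/reduced', 'standard']),
                    ('test preparation course', ['none', 'completed'])]:
    le.fit(labels)
    mapping = dict(zip(le.classes_, map(int, le.transform(le.classes_))))
    print(f"{col}: {mapping}")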
In [7]:
# Plotting different features impact on the scores
scores = ['math score', 'reading score', 'writing score']
features = list(data.columns)
for s in scores:
    features.remove(s)
    
fig, axes = plt.subplots(len(features), len(scores), figsize=(15, 15))

for i in range(len(features)):
    for j in range(len(scores)):
        sns.barplot(x=data[features[i]], y=data[scores[j]], ax=axes[i, j])
    
plt.tight_layout()

Data Preprocessing

In [8]:
# Leave the education as ordered categorical variable
# Race -> dummy variables columns

race_dummies = pd.get_dummies(data['race/ethnicity'], prefix='race', drop_first=True)
data = pd.concat([data, race_dummies], axis=1)
data.drop('race/ethnicity', axis=1, inplace=True)
In [9]:
# Adding the average score as the target variable
data['avg_score'] = ((data['math score'] + data['writing score'] + data['reading score']) / 3).round(0)
data.head()
Out[9]:
gender parental level of education lunch test preparation course math score reading score writing score race_1 race_2 race_3 race_4 avg_score
0 0 3 1 1 72 72 74 1 0 0 0 73.0
1 0 2 1 0 69 90 88 0 1 0 0 82.0
2 0 4 1 1 90 95 93 1 0 0 0 93.0
3 1 1 0 1 47 57 44 0 0 0 0 49.0
4 1 2 1 1 76 78 75 0 1 0 0 76.0
In [10]:
# Defining the target and predictors
y = data['avg_score']
X = data.drop(scores+['avg_score'], axis=1)
In [11]:
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)

print(f"\nPCA transformed data shape: {X_pca.shape}")
print("\nExplained variance ratio:")
for i, var_ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {var_ratio:.4f}")
print(f"\nCumulative explained variance ratio:")
cumulative_var = np.cumsum(pca.explained_variance_ratio_)
for i, cum_var in enumerate(cumulative_var):
    print(f"PC1 to PC{i+1}: {cum_var:.4f}")
PCA transformed data shape: (1000, 6)

Explained variance ratio:
PC1: 0.5251
PC2: 0.1017
PC3: 0.0844
PC4: 0.0801
PC5: 0.0779
PC6: 0.0721

Cumulative explained variance ratio:
PC1 to PC1: 0.5251
PC1 to PC2: 0.6269
PC1 to PC3: 0.7113
PC1 to PC4: 0.7914
PC1 to PC5: 0.8693
PC1 to PC6: 0.9414
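
One caveat worth noting: the PCA above runs on the raw encoded features, and PCA is driven by variance, so columns on larger scales dominate the components. The snippet below is a minimal sketch (a hypothetical variant, not part of the original analysis) that standardizes with the already-imported StandardScaler before projecting, to compare how many components the 90% variance target keeps.

# Hypothetical variant: scale first, then apply PCA with the same 90% variance target.
scaled_pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.9))])
X_pca_scaled = scaled_pca.fit_transform(X)
print(f"Components kept after scaling: {X_pca_scaled.shape[1]}")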

Models Cross-Validation

In [12]:
random_state=0

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=random_state),
    'Lasso': Lasso(random_state=random_state),
    'Random Forest': RandomForestRegressor(random_state=random_state),
    'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(random_state=random_state)
}

results = {}

for model_name, model in models.items():
        
    # Full pipeline: prep + model
    original_full = Pipeline([('model', model)])
    pca_full = Pipeline([('pca', PCA(n_components=0.9)), ('model', model)])

    # Cross-validation of both pipelines
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_original = cross_val_score(original_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    cv_pca = cross_val_score(pca_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')

    # Saving the results
    results[model_name] = {'original': {'mean': abs(cv_original.mean()), 'std': cv_original.std()},
        'pca': {'mean': abs(cv_pca.mean()), 'std': cv_pca.std()}
    }        
In [13]:
print("CROSS-VALIDATION R**2 RESULTS")
print("="*50)
    
for model_name, score in results.items():
    print(f"\n{model_name}:")
    print(f"  Original Features: {score['original']['mean']:.4f} (+/- {score['original']['std']*2:.4f})")
    print(f"  PCA Features:      {score['pca']['mean']:.4f} (+/- {score['pca']['std']*2:.4f})")
CROSS-VALIDATION RMSE RESULTS
==================================================

Linear Regression:
  Original Features: 12.5904 (+/- 0.9023)
  PCA Features:      12.6062 (+/- 0.9551)

Ridge:
  Original Features: 12.5894 (+/- 0.9002)
  PCA Features:      12.6060 (+/- 0.9543)

Lasso:
  Original Features: 13.1953 (+/- 0.9041)
  PCA Features:      13.0963 (+/- 0.7956)

Random Forest:
  Original Features: 14.0103 (+/- 0.9976)
  PCA Features:      14.0136 (+/- 1.0095)

Gradient Boosting:
  Original Features: 12.8588 (+/- 0.9938)
  PCA Features:      13.3864 (+/- 1.0887)

SVR:
  Original Features: 12.8800 (+/- 0.9832)
  PCA Features:      12.8800 (+/- 1.0417)

XGBoost:
  Original Features: 14.3389 (+/- 1.1102)
  PCA Features:      14.3834 (+/- 1.2352)
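
For easier side-by-side reading, the per-model scores can be collected into one table; a small sketch (not from the original notebook) follows.

# Sketch: flatten the nested results dict into a single DataFrame of mean RMSE values.
summary = pd.DataFrame({
    name: {'original_rmse': res['original']['mean'], 'pca_rmse': res['pca']['mean']}
    for name, res in results.items()
}).T.round(4)
print(summary.sort_values('original_rmse'))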

Tuning

In [14]:
# RandomForest only, no pca
pipeline = Pipeline([('model', RandomForestRegressor(random_state=0))])

# Define parameter grid
param_grid = {
    'model__n_estimators': [210, 220, 230],
    'model__max_depth': [3, 4],
    'model__min_samples_split': [2, 3],
    'model__min_samples_leaf': [5, 7, 9]
}

cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'

search = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring=scoring,
        n_jobs=-1, verbose=1, return_train_score=True
    )

# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)

# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_

scoring_metrics = ['r2', 'neg_root_mean_squared_error']

cv_results = cross_validate(best_pipeline, X, y, cv=cv, 
                           scoring=scoring_metrics, return_train_score=True)


# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}

"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("RANDOM FOREST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)

print(f"Best CV Score: {results['best_score']:.4f}")
print()

print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()

print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
============================================================
RANDOM FOREST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.1909

BEST PARAMETERS:
  model__max_depth: 3
  model__min_samples_leaf: 5
  model__min_samples_split: 2
  model__n_estimators: 210

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.1909 (±0.0345)
  Neg_root_mean_squared_error: 12.7862 (±0.5451)

In [15]:
# XGBoost only, no pca
xgb_pipeline = Pipeline([('model', XGBRegressor(random_state=0, eval_metric='rmse'))])

# Define parameter grid
xgb_param_grid = {
    'model__n_estimators': [90, 100, 110],
    'model__max_depth': [1, 2], 
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.7, 0.6],
    'model__reg_alpha': [0.1], 
    'model__reg_lambda': [0.3, 0.2],
}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'

search = GridSearchCV(
        xgb_pipeline, xgb_param_grid, cv=cv, scoring=scoring,
        n_jobs=-1, verbose=1, return_train_score=True
    )

# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)

# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_

scoring_metrics = ['r2', 'neg_root_mean_squared_error']

cv_results = cross_validate(best_pipeline, X, y, cv=cv, 
                           scoring=scoring_metrics, return_train_score=True)


# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}

"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("XGBOOST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)

print(f"Best CV Score: {results['best_score']:.4f}")
print()

print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()

print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
============================================================
XGBOOST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.2191

BEST PARAMETERS:
  model__learning_rate: 0.1
  model__max_depth: 1
  model__n_estimators: 100
  model__reg_alpha: 0.1
  model__reg_lambda: 0.2
  model__subsample: 0.7

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.2191 (±0.0325)
  Neg_root_mean_squared_error: 12.5590 (±0.4586)
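
As a possible follow-up (a sketch, not part of the original notebook), the tuned XGBoost pipeline can be refit on the full data to inspect which predictors it relies on most.

# Sketch: refit the best pipeline and print XGBoost's feature importances per predictor.
best_pipeline.fit(X, y)
importances = pd.Series(
    best_pipeline.named_steps['model'].feature_importances_,
    index=X.columns
).sort_values(ascending=False)
print(importances)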

In [ ]: