In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np

import scipy.stats as stats

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR

import warnings
warnings.filterwarnings('ignore')
In [2]:
# Load CSV file
data = pd.read_csv("StudentsPerformance.csv")

Info about the data and cleaning

In [3]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df, drop=True):
    """
    Handling the duplicates and print info about them and null values 
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates==0:
        print(f"\033[1mThere are no duplicates in the DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicates in the DataFrame.\033[0m")
        if drop==True:
            df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
            print("Duplicates dropped!")
        else:
            print('Duplicates kept.')
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nList of null values for each column in percents:\033[0m \n")
    return (df.isnull().sum()/rows)*100

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outliers detection: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out/rows*100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Print a summary of the cleaned dataset and drop columns with a single unique value
    """
    print(f"\033[1mCleaning of the DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        if df[column].nunique()==1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [4]:
df_info(data)
display(df_dupl_null(data))
display(df_outliers(data))
df_summary(data)
5 head rows of the data:

gender race/ethnicity parental level of education lunch test preparation course math score reading score writing score
0 female group B bachelor's degree standard none 72 72 74
1 female group C some college standard completed 69 90 88
2 female group B master's degree standard none 90 95 93
3 male group A associate's degree free/reduced none 47 57 44
4 male group C some college standard none 76 78 75
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
None
Description of the data:

count mean std min 25% 50% 75% max
math score 1000.0 66.089 15.163080 0.0 57.00 66.0 77.0 100.0
reading score 1000.0 69.169 14.600192 17.0 59.00 70.0 79.0 100.0
writing score 1000.0 68.054 15.195657 10.0 57.75 69.0 79.0 100.0
There are no duplicates in the DataFrame.
 
List of null values for each column (% of rows): 

 Outliers detection:  
Cleaning of the DataFrame completed!
1000 rows and 8 columns remaining.
- gender: 2 unique values
- race/ethnicity: 5 unique values
- parental level of education: 6 unique values
- lunch: 2 unique values
- test preparation course: 2 unique values
- math score: 81 unique values
- reading score: 72 unique values
- writing score: 77 unique values

EDA

In [5]:
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [6]:
# Setting numerical labels for features
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# Gender: female -> 0, male -> 1
le.fit(['female', 'male'])
data['gender'] = le.transform(data['gender'])

# Race/ethnicity: group A -> 0, group B -> 1, ..., group E -> 4
le.fit(['group A', 'group B', 'group C', 'group D', 'group E'])
data['race/ethnicity'] = le.transform(data['race/ethnicity'])

# Parental level of education: 
# Custom mapping
education_mapping = {
    'some high school': 0,
    'high school': 0,
    "associate's degree": 1,
    'some college': 2,
    "bachelor's degree": 3,
    "master's degree": 4
}
data['parental level of education'] = data['parental level of education'].map(education_mapping)

# Lunch: free/reduced -> 0, standard -> 1
le.fit(['free/reduced', 'standard'])
data['lunch'] = le.transform(data['lunch'])

# Test preparation course: completed -> 0, none -> 1 (LabelEncoder orders labels alphabetically)
le.fit(['none', 'completed'])
data['test preparation course'] = le.transform(data['test preparation course'])
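
Since LabelEncoder assigns integer codes by sorting the class labels alphabetically, the short sketch below (not part of the original notebook) re-fits the encoder per column and prints each learned mapping so the encodings above can be double-checked.

# Hedged sanity check: print the mapping LabelEncoder actually learns for each column.
for col, labels in [('gender', ['female', 'male']),
                    ('lunch', ['free/reduced', 'standard']),
                    ('test preparation course', ['none', 'completed'])]:
    le.fit(labels)
    mapping = dict(zip(le.classes_, map(int, le.transform(le.classes_))))
    print(f"{col}: {mapping}")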
In [7]:
# Plotting different features impact on the scores
scores = ['math score', 'reading score', 'writing score']
features = list(data.columns)
for s in scores:
    features.remove(s)
    
fig, axes = plt.subplots(len(features), len(scores), figsize=(15, 15))

for i in range(len(features)):
    for j in range(len(scores)):
        sns.barplot(x=data[features[i]], y=data[scores[j]], ax=axes[i, j])
    
plt.tight_layout()

Data Preprocessing

In [8]:
# Leave the education as ordered categorical variable
# Race -> dummy variables columns

race_dummies = pd.get_dummies(data['race/ethnicity'], prefix='race', drop_first=True)
data = pd.concat([data, race_dummies], axis=1)
data.drop('race/ethnicity', axis=1, inplace=True)
In [9]:
# Adding the average score as the target variable
data['avg_score'] = ((data['math score'] + data['writing score'] + data['reading score']) / 3).round(0)
data.head()
Out[9]:
gender parental level of education lunch test preparation course math score reading score writing score race_1 race_2 race_3 race_4 avg_score
0 0 3 1 1 72 72 74 1 0 0 0 73.0
1 0 2 1 0 69 90 88 0 1 0 0 82.0
2 0 4 1 1 90 95 93 1 0 0 0 93.0
3 1 1 0 1 47 57 44 0 0 0 0 49.0
4 1 2 1 1 76 78 75 0 1 0 0 76.0
In [10]:
# Defining the target and predictors
y = data['avg_score']
X = data.drop(scores+['avg_score'], axis=1)
In [11]:
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)

print(f"\nPCA transformed data shape: {X_pca.shape}")
print("\nExplained variance ratio:")
for i, var_ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {var_ratio:.4f}")
print(f"\nCumulative explained variance ratio:")
cumulative_var = np.cumsum(pca.explained_variance_ratio_)
for i, cum_var in enumerate(cumulative_var):
    print(f"PC1 to PC{i+1}: {cum_var:.4f}")
PCA transformed data shape: (1000, 6)

Explained variance ratio:
PC1: 0.5251
PC2: 0.1017
PC3: 0.0844
PC4: 0.0801
PC5: 0.0779
PC6: 0.0721

Cumulative explained variance ratio:
PC1 to PC1: 0.5251
PC1 to PC2: 0.6269
PC1 to PC3: 0.7113
PC1 to PC4: 0.7914
PC1 to PC5: 0.8693
PC1 to PC6: 0.9414
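
One caveat worth noting: the PCA above runs on the raw encoded features, and PCA is driven by variance, so columns on larger scales dominate the components. The snippet below is a minimal sketch (a hypothetical variant, not part of the original analysis) that standardizes with the already-imported StandardScaler before projecting, to compare how many components the 90% variance target keeps.

# Hypothetical variant: scale first, then apply PCA with the same 90% variance target.
scaled_pca = Pipeline([('scaler', StandardScaler()), ('pca', PCA(n_components=0.9))])
X_pca_scaled = scaled_pca.fit_transform(X)
print(f"Components kept after scaling: {X_pca_scaled.shape[1]}")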

Models Cross-Validation

In [12]:
random_state=0

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(random_state=random_state),
    'Lasso': Lasso(random_state=random_state),
    'Random Forest': RandomForestRegressor(random_state=random_state),
    'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
    'SVR': SVR(),
    'XGBoost': XGBRegressor(random_state=random_state)
}

results = {}

for model_name, model in models.items():
        
    # Full pipeline: prep + model
    original_full = Pipeline([('model', model)])
    pca_full = Pipeline([('pca', PCA(n_components=0.9)), ('model', model)])

    # Cross-validation of both pipelines
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_original = cross_val_score(original_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    cv_pca = cross_val_score(pca_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')

    # Saving the results
    results[model_name] = {'original': {'mean': abs(cv_original.mean()), 'std': cv_original.std()},
        'pca': {'mean': abs(cv_pca.mean()), 'std': cv_pca.std()}
    }        
In [13]:
print("CROSS-VALIDATION R**2 RESULTS")
print("="*50)
    
for model_name, score in results.items():
    print(f"\n{model_name}:")
    print(f"  Original Features: {score['original']['mean']:.4f} (+/- {score['original']['std']*2:.4f})")
    print(f"  PCA Features:      {score['pca']['mean']:.4f} (+/- {score['pca']['std']*2:.4f})")
CROSS-VALIDATION RMSE RESULTS
==================================================

Linear Regression:
  Original Features: 12.5904 (+/- 0.9023)
  PCA Features:      12.6062 (+/- 0.9551)

Ridge:
  Original Features: 12.5894 (+/- 0.9002)
  PCA Features:      12.6060 (+/- 0.9543)

Lasso:
  Original Features: 13.1953 (+/- 0.9041)
  PCA Features:      13.0963 (+/- 0.7956)

Random Forest:
  Original Features: 14.0103 (+/- 0.9976)
  PCA Features:      14.0136 (+/- 1.0095)

Gradient Boosting:
  Original Features: 12.8588 (+/- 0.9938)
  PCA Features:      13.3864 (+/- 1.0887)

SVR:
  Original Features: 12.8800 (+/- 0.9832)
  PCA Features:      12.8800 (+/- 1.0417)

XGBoost:
  Original Features: 14.3389 (+/- 1.1102)
  PCA Features:      14.3834 (+/- 1.2352)
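
For easier side-by-side reading, the per-model scores can be collected into one table; a small sketch (not from the original notebook) follows.

# Sketch: flatten the nested results dict into a single DataFrame of mean RMSE values.
summary = pd.DataFrame({
    name: {'original_rmse': res['original']['mean'], 'pca_rmse': res['pca']['mean']}
    for name, res in results.items()
}).T.round(4)
print(summary.sort_values('original_rmse'))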

Tuning

In [14]:
# RandomForest only, no pca
pipeline = Pipeline([('model', RandomForestRegressor(random_state=0))])

# Define parameter grid
param_grid = {
    'model__n_estimators': [210, 220, 230],
    'model__max_depth': [3, 4],
    'model__min_samples_split': [2, 3],
    'model__min_samples_leaf': [5, 7, 9]
}

cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'

search = GridSearchCV(
        pipeline, param_grid, cv=cv, scoring=scoring,
        n_jobs=-1, verbose=1, return_train_score=True
    )

# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)

# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_

scoring_metrics = ['r2', 'neg_root_mean_squared_error']

cv_results = cross_validate(best_pipeline, X, y, cv=cv, 
                           scoring=scoring_metrics, return_train_score=True)


# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}

"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("RANDOM FOREST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)

print(f"Best CV Score: {results['best_score']:.4f}")
print()

print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()

print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
============================================================
RANDOM FOREST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.1909

BEST PARAMETERS:
  model__max_depth: 3
  model__min_samples_leaf: 5
  model__min_samples_split: 2
  model__n_estimators: 210

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.1909 (±0.0345)
  Neg_root_mean_squared_error: 12.7862 (±0.5451)

In [15]:
# XGBoost only, no pca
xgb_pipeline = Pipeline([('model', XGBRegressor(random_state=0, eval_metric='rmse'))])

# Define parameter grid
xgb_param_grid = {
    'model__n_estimators': [90, 100, 110],
    'model__max_depth': [1, 2], 
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__subsample': [0.7, 0.6],
    'model__reg_alpha': [0.1], 
    'model__reg_lambda': [0.3, 0.2],
}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'

search = GridSearchCV(
        xgb_pipeline, xgb_param_grid, cv=cv, scoring=scoring,
        n_jobs=-1, verbose=1, return_train_score=True
    )

# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)

# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_

scoring_metrics = ['r2', 'neg_root_mean_squared_error']

cv_results = cross_validate(best_pipeline, X, y, cv=cv, 
                           scoring=scoring_metrics, return_train_score=True)


# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}

"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("XGBOOST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)

print(f"Best CV Score: {results['best_score']:.4f}")
print()

print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()

print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 72 candidates, totalling 360 fits
============================================================
XGBOOST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.2191

BEST PARAMETERS:
  model__learning_rate: 0.1
  model__max_depth: 1
  model__n_estimators: 100
  model__reg_alpha: 0.1
  model__reg_lambda: 0.2
  model__subsample: 0.7

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.2191 (±0.0325)
  Neg_root_mean_squared_error: 12.5590 (±0.4586)
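
As a possible follow-up (a sketch, not part of the original notebook), the tuned XGBoost pipeline can be refit on the full data to inspect which predictors it relies on most.

# Sketch: refit the best pipeline and print XGBoost's feature importances per predictor.
best_pipeline.fit(X, y)
importances = pd.Series(
    best_pipeline.named_steps['model'].feature_importances_,
    index=X.columns
).sort_values(ascending=False)
print(importances)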

In [ ]: