import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
import scipy.stats as stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, cross_validate, KFold
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
import warnings
warnings.filterwarnings('ignore')
# Load CSV file
data = pd.read_csv("StudentsPerformance.csv")
def df_info(df):
    """Display basic info about the dataset."""
    pd.set_option('display.max_columns', None)  # Show all columns of the dataset
    print("\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    print("\033[1mInfo about the data:\n\033[0m")
    df.info()  # .info() prints directly and returns None, so don't wrap it in display()
    print("\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)
def df_dupl_null(df, drop=True):
    """
    Handle duplicate rows (optionally dropping them) and
    report duplicates and null values.
    """
    # Check for duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates == 0:
        print("\033[1mThere are no duplicates in the DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicates in the DataFrame.\033[0m")
        if drop:
            df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
            print("Duplicates dropped!")
        else:
            print('Duplicates kept.')
    # Null values per column, as a percentage of rows
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print("\033[1m \nList of null values for each column in percent:\033[0m \n")
    return (df.isnull().sum() / rows) * 100
def df_outliers(df):
    """
    Find information about outliers in the df (IQR rule)
    and return it as a DataFrame.
    """
    print("\033[1m Outliers detection: \033[0m ")
    # Numeric columns, excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns
                    if 'id' not in col.lower()]
    outliers_list = []
    # Outlier check for each numeric column
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
    rows = df.shape[0]
    # DataFrame for outliers (convert to percent first, then round)
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 3)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )
    return outliers_df
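The function above only reports IQR-rule outliers. If one wanted to treat them as well, a common follow-up is to clip values to the same Tukey fences; a minimal sketch (cap_outliers is a hypothetical helper, not part of the original cleaning, and whether capping exam scores is appropriate is a judgment call):

def cap_outliers(df, col):
    """Clip one numeric column to its Tukey fences (q1 - 1.5*IQR, q3 + 1.5*IQR)."""
    q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
    iqr = q3 - q1
    return df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)

# Example usage: data['math score'] = cap_outliers(data, 'math score')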
def df_summary(df):
    """
    Print a summary of the cleaned dataset and
    drop columns that hold only a single unique value.
    """
    print("\033[1mCleaning of the DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    # Iterate over a copy of the column list, since columns may be dropped mid-loop
    for column in list(df.columns):
        if df[column].nunique() == 1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
df_info(data)
display(df_dupl_null(data))  # the returned null-percentage Series is only shown if displayed
display(df_outliers(data))   # likewise for the returned outlier summary DataFrame
df_summary(data)
5 head rows of the data:

|   | gender | race/ethnicity | parental level of education | lunch | test preparation course | math score | reading score | writing score |
|---|--------|----------------|-----------------------------|-------|-------------------------|------------|---------------|---------------|
| 0 | female | group B | bachelor's degree | standard | none | 72 | 72 | 74 |
| 1 | female | group C | some college | standard | completed | 69 | 90 | 88 |
| 2 | female | group B | master's degree | standard | none | 90 | 95 | 93 |
| 3 | male | group A | associate's degree | free/reduced | none | 47 | 57 | 44 |
| 4 | male | group C | some college | standard | none | 76 | 78 | 75 |
Info about the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 gender 1000 non-null object
1 race/ethnicity 1000 non-null object
2 parental level of education 1000 non-null object
3 lunch 1000 non-null object
4 test preparation course 1000 non-null object
5 math score 1000 non-null int64
6 reading score 1000 non-null int64
7 writing score 1000 non-null int64
dtypes: int64(3), object(5)
memory usage: 62.6+ KB
Description of the data:
|               | count  | mean   | std       | min  | 25%   | 50%  | 75%  | max   |
|---------------|--------|--------|-----------|------|-------|------|------|-------|
| math score    | 1000.0 | 66.089 | 15.163080 | 0.0  | 57.00 | 66.0 | 77.0 | 100.0 |
| reading score | 1000.0 | 69.169 | 14.600192 | 17.0 | 59.00 | 70.0 | 79.0 | 100.0 |
| writing score | 1000.0 | 68.054 | 15.195657 | 10.0 | 57.75 | 69.0 | 79.0 | 100.0 |
There are no duplicates in the DataFrame.

List of null values for each column in percent:

Outliers detection:

Cleaning of the DataFrame completed!
1000 rows and 8 columns remaining.
- gender: 2 unique values
- race/ethnicity: 5 unique values
- parental level of education: 6 unique values
- lunch: 2 unique values
- test preparation course: 2 unique values
- math score: 81 unique values
- reading score: 72 unique values
- writing score: 77 unique values
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# Setting numerical labels for features
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Gender: female -> 0, male -> 1
le.fit(['female', 'male'])
data['gender'] = le.transform(data['gender'])
# Race/ethnicity: LabelEncoder assigns codes in sorted order, so group A -> 0, group B -> 1, ..., group E -> 4
le.fit(['group A', 'group B', 'group C', 'group D', 'group E'])
data['race/ethnicity'] = le.transform(data['race/ethnicity'])
# Parental level of education:
# Custom mapping
education_mapping = {
'some high school': 0,
'high school': 0,
"associate's degree": 1,
'some college': 2,
"bachelor's degree": 3,
"master's degree": 4
}
data['parental level of education'] = data['parental level of education'].map(education_mapping)
# Lunch: free/reduced -> 0, standard -> 1
le.fit(['free/reduced', 'standard'])
data['lunch'] = le.transform(data['lunch'])
# Test preparation course: LabelEncoder sorts labels alphabetically, so completed -> 0, none -> 1
le.fit(['none', 'completed'])
data['test preparation course'] = le.transform(data['test preparation course'])
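Note that LabelEncoder always assigns codes in sorted label order, ignoring the order passed to fit() — which is why 'completed' ends up as 0 above. A sketch of an alternative that makes every binary mapping explicit (this would replace the LabelEncoder calls above rather than follow them; binary_mappings is an illustrative name):

binary_mappings = {
    'gender': {'female': 0, 'male': 1},
    'lunch': {'free/reduced': 0, 'standard': 1},
    'test preparation course': {'none': 0, 'completed': 1},  # the intended order, unlike LabelEncoder's
}
for col, mapping in binary_mappings.items():
    data[col] = data[col].map(mapping)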
# Plotting different features impact on the scores
scores = ['math score', 'reading score', 'writing score']
features = list(data.columns)
for s in scores:
    features.remove(s)
fig, axes = plt.subplots(len(features), len(scores), figsize=(15, 15))
for i in range(len(features)):
    for j in range(len(scores)):
        sns.barplot(x=data[features[i]], y=data[scores[j]], ax=axes[i, j])
plt.tight_layout()
plt.show()
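Since every column is numeric at this point, a quick complement to the bar plots is a correlation heatmap; a minimal sketch using the seaborn import above (not part of the original analysis):

plt.figure(figsize=(10, 8))
sns.heatmap(data.corr(numeric_only=True), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Feature correlations after encoding')
plt.tight_layout()
plt.show()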
# Leave the education as ordered categorical variable
# Race -> dummy variables columns
race_dummies = pd.get_dummies(data['race/ethnicity'], prefix='race', drop_first=True)
data = pd.concat([data, race_dummies], axis=1)
data.drop('race/ethnicity', axis=1, inplace=True)
# Adding the average score as the target variable
data['avg_score'] = ((data['math score'] + data['writing score'] + data['reading score']) / 3).round(0)
data.head()
|   | gender | parental level of education | lunch | test preparation course | math score | reading score | writing score | race_1 | race_2 | race_3 | race_4 | avg_score |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 3 | 1 | 1 | 72 | 72 | 74 | 1 | 0 | 0 | 0 | 73.0 |
| 1 | 0 | 2 | 1 | 0 | 69 | 90 | 88 | 0 | 1 | 0 | 0 | 82.0 |
| 2 | 0 | 4 | 1 | 1 | 90 | 95 | 93 | 1 | 0 | 0 | 0 | 93.0 |
| 3 | 1 | 1 | 0 | 1 | 47 | 57 | 44 | 0 | 0 | 0 | 0 | 49.0 |
| 4 | 1 | 2 | 1 | 1 | 76 | 78 | 75 | 0 | 1 | 0 | 0 | 76.0 |
# Defining the target and predictors
y = data['avg_score']
X = data.drop(scores+['avg_score'], axis=1)
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(X)
print(f"\nPCA transformed data shape: {X_pca.shape}")
print("\nExplained variance ratio:")
for i, var_ratio in enumerate(pca.explained_variance_ratio_):
    print(f"PC{i+1}: {var_ratio:.4f}")
print("\nCumulative explained variance ratio:")
cumulative_var = np.cumsum(pca.explained_variance_ratio_)
for i, cum_var in enumerate(cumulative_var):
    print(f"PC1 to PC{i+1}: {cum_var:.4f}")
PCA transformed data shape: (1000, 6)

Explained variance ratio:
PC1: 0.5251
PC2: 0.1017
PC3: 0.0844
PC4: 0.0801
PC5: 0.0779
PC6: 0.0721

Cumulative explained variance ratio:
PC1 to PC1: 0.5251
PC1 to PC2: 0.6269
PC1 to PC3: 0.7113
PC1 to PC4: 0.7914
PC1 to PC5: 0.8693
PC1 to PC6: 0.9414
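One caveat: PCA is variance-driven and no scaling is applied here, so the components are dominated by whichever encoded columns have the widest numeric range (the 0-4 education scale versus 0/1 dummies). A sketch of a standardized variant using the StandardScaler already imported (untested here; the component count would differ from the output above):

scaled_pca = Pipeline([
    ('scaler', StandardScaler()),    # put every feature on unit variance first
    ('pca', PCA(n_components=0.9))   # then keep 90% of the standardized variance
])
X_scaled_pca = scaled_pca.fit_transform(X)
print(f"Scaled PCA shape: {X_scaled_pca.shape}")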
random_state=0
# Define models
models = {
'Linear Regression': LinearRegression(),
'Ridge': Ridge(random_state=random_state),
'Lasso': Lasso(random_state=random_state),
'Random Forest': RandomForestRegressor(random_state=random_state),
'Gradient Boosting': GradientBoostingRegressor(random_state=random_state),
'SVR': SVR(),
'XGBoost': XGBRegressor(random_state=random_state)
}
results = {}
for model_name, model in models.items():
    # Two pipelines per model: original features vs. PCA-reduced features
    original_full = Pipeline([('model', model)])
    pca_full = Pipeline([('pca', PCA(n_components=0.9)), ('model', model)])
    # Cross-validate both pipelines
    cv = KFold(n_splits=5, shuffle=True, random_state=random_state)
    cv_original = cross_val_score(original_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    cv_pca = cross_val_score(pca_full, X, y, cv=cv, scoring='neg_root_mean_squared_error')
    # Save the results (scores are negative RMSE, so take the absolute value)
    results[model_name] = {
        'original': {'mean': abs(cv_original.mean()), 'std': cv_original.std()},
        'pca': {'mean': abs(cv_pca.mean()), 'std': cv_pca.std()}
    }
print("CROSS-VALIDATION R**2 RESULTS")
print("="*50)
for model_name, score in results.items():
print(f"\n{model_name}:")
print(f" Original Features: {score['original']['mean']:.4f} (+/- {score['original']['std']*2:.4f})")
print(f" PCA Features: {score['pca']['mean']:.4f} (+/- {score['pca']['std']*2:.4f})")
CROSS-VALIDATION RMSE RESULTS
==================================================

Linear Regression:
  Original Features: 12.5904 (+/- 0.9023)
  PCA Features:      12.6062 (+/- 0.9551)

Ridge:
  Original Features: 12.5894 (+/- 0.9002)
  PCA Features:      12.6060 (+/- 0.9543)

Lasso:
  Original Features: 13.1953 (+/- 0.9041)
  PCA Features:      13.0963 (+/- 0.7956)

Random Forest:
  Original Features: 14.0103 (+/- 0.9976)
  PCA Features:      14.0136 (+/- 1.0095)

Gradient Boosting:
  Original Features: 12.8588 (+/- 0.9938)
  PCA Features:      13.3864 (+/- 1.0887)

SVR:
  Original Features: 12.8800 (+/- 0.9832)
  PCA Features:      12.8800 (+/- 1.0417)

XGBoost:
  Original Features: 14.3389 (+/- 1.1102)
  PCA Features:      14.3834 (+/- 1.2352)
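To compare the models at a glance, the results dict above can be flattened into a sorted DataFrame; a small sketch (the comparison variable and its column names are illustrative only):

comparison = pd.DataFrame([
    {'model': name,
     'rmse_original': res['original']['mean'],
     'rmse_pca': res['pca']['mean']}
    for name, res in results.items()
]).sort_values('rmse_original')  # lower RMSE first
print(comparison.to_string(index=False))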
# Random Forest only, no PCA
pipeline = Pipeline([('model', RandomForestRegressor(random_state=0))])
# Define parameter grid
param_grid = {
'model__n_estimators': [210, 220, 230],
'model__max_depth': [3, 4],
'model__min_samples_split': [2, 3],
'model__min_samples_leaf': [5, 7, 9]
}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'
search = GridSearchCV(
pipeline, param_grid, cv=cv, scoring=scoring,
n_jobs=-1, verbose=1, return_train_score=True
)
# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)
# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_
scoring_metrics = ['r2', 'neg_root_mean_squared_error']
cv_results = cross_validate(best_pipeline, X, y, cv=cv,
scoring=scoring_metrics, return_train_score=True)
# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}
"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("RANDOM FOREST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)
print(f"Best CV Score: {results['best_score']:.4f}")
print()
print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()
print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 36 candidates, totalling 180 fits

============================================================
RANDOM FOREST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.1909

BEST PARAMETERS:
  model__max_depth: 3
  model__min_samples_leaf: 5
  model__min_samples_split: 2
  model__n_estimators: 210

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.1909 (±0.0345)
  Neg_root_mean_squared_error: 12.7862 (±0.5451)
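Since GridSearchCV refits the winning pipeline on the full data (refit=True by default), the forest's feature importances are available directly; a short sketch reading them from best_pipeline above ('model' is the step name used in the pipeline):

rf = best_pipeline.named_steps['model']
importances = pd.Series(rf.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).round(3))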
# XGBoost only, no PCA
xgb_pipeline = Pipeline([('model', XGBRegressor(random_state=0, eval_metric='rmse'))])
# Define parameter grid
xgb_param_grid = {
'model__n_estimators': [90, 100, 110],
'model__max_depth': [1, 2],
'model__learning_rate': [0.01, 0.05, 0.1],
'model__subsample': [0.7, 0.6],
'model__reg_alpha': [0.1],
'model__reg_lambda': [0.3, 0.2],
}
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scoring = 'r2'
search = GridSearchCV(
xgb_pipeline, xgb_param_grid, cv=cv, scoring=scoring,
n_jobs=-1, verbose=1, return_train_score=True
)
# Fit the search
print(f"Starting grid search with {cv}-fold CV...")
search.fit(X, y)
# Get comprehensive evaluation of best model
best_pipeline = search.best_estimator_
scoring_metrics = ['r2', 'neg_root_mean_squared_error']
cv_results = cross_validate(best_pipeline, X, y, cv=cv,
scoring=scoring_metrics, return_train_score=True)
# Compile results
results = {
    'best_params': search.best_params_,
    'best_score': search.best_score_,
    'best_model': best_pipeline,
    'cv_results': {
        metric: {
            'mean': cv_results[f'test_{metric}'].mean(),
            'std': cv_results[f'test_{metric}'].std()
        }
        for metric in scoring_metrics
    },
    'search_results': search.cv_results_
}
"""Print formatted hyperparameter tuning results"""
print("=" * 60)
print("XGBOOST HYPERPARAMETER TUNING RESULTS")
print("=" * 60)
print(f"Best CV Score: {results['best_score']:.4f}")
print()
print("BEST PARAMETERS:")
for param, value in results['best_params'].items():
    print(f"  {param}: {value}")
print()
print("DETAILED EVALUATION OF BEST MODEL:")
for metric_name, values in results['cv_results'].items():
    print(f"  {metric_name.capitalize()}: {abs(values['mean']):.4f} (±{values['std']:.4f})")
print()
Starting grid search with 5-fold CV...
Fitting 5 folds for each of 72 candidates, totalling 360 fits

============================================================
XGBOOST HYPERPARAMETER TUNING RESULTS
============================================================
Best CV Score: 0.2191

BEST PARAMETERS:
  model__learning_rate: 0.1
  model__max_depth: 1
  model__n_estimators: 100
  model__reg_alpha: 0.1
  model__reg_lambda: 0.2
  model__subsample: 0.7

DETAILED EVALUATION OF BEST MODEL:
  R2: 0.2191 (±0.0325)
  Neg_root_mean_squared_error: 12.5590 (±0.4586)
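An R² around 0.22 even after tuning suggests these demographic features explain only a modest share of average-score variance. As a final sanity check, one could evaluate on a held-out split with the train_test_split imported at the top; a sketch (the 80/20 split and the refit of the tuned pipeline are assumptions, not part of the original run):

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
final_model = search.best_estimator_  # tuned XGBoost pipeline from the grid search above
final_model.fit(X_train, y_train)     # refit on the training split only
preds = final_model.predict(X_test)
print(f"Held-out RMSE: {mean_squared_error(y_test, preds) ** 0.5:.4f}")
print(f"Held-out R2:   {r2_score(y_test, preds):.4f}")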