import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors
import warnings
# Do not display warnings
warnings.filterwarnings('ignore')
from dateutil import parser
import re
# Read the cleaned arabica ratings, using the 'ID' column as the index
df = pd.read_csv("df_arabica_clean.csv", index_col='ID')
# Peek at the first five records
df.head(5)
Unnamed: 0 | Country of Origin | Farm Name | Lot Number | Mill | ICO Number | Company | Altitude | Region | Producer | ... | Total Cup Points | Moisture Percentage | Category One Defects | Quakers | Color | Category Two Defects | Expiration | Certification Body | Certification Address | Certification Contact | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
ID | |||||||||||||||||||||
0 | 0 | Colombia | Finca El Paraiso | CQU2022015 | Finca El Paraiso | NaN | Coffee Quality Union | 1700-1930 | Piendamo,Cauca | Diego Samuel Bermudez | ... | 89.33 | 11.8 | 0 | 0 | green | 3 | September 21st, 2023 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
1 | 1 | Taiwan | Royal Bean Geisha Estate | The 2022 Pacific Rim Coffee Summit,T037 | Royal Bean Geisha Estate | NaN | Taiwan Coffee Laboratory | 1200 | Chiayi | 曾福森 | ... | 87.58 | 10.5 | 0 | 0 | blue-green | 0 | November 15th, 2023 | Taiwan Coffee Laboratory 台灣咖啡研究室 | QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... | Lin, Jen-An Neil 林仁安 - 886-289116612 |
2 | 2 | Laos | OKLAO coffee farms | The 2022 Pacific Rim Coffee Summit,LA01 | oklao coffee processing plant | NaN | Taiwan Coffee Laboratory | 1300 | Laos Borofen Plateau | WU TAO CHI | ... | 87.42 | 10.4 | 0 | 0 | yellowish | 2 | November 15th, 2023 | Taiwan Coffee Laboratory 台灣咖啡研究室 | QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... | Lin, Jen-An Neil 林仁安 - 886-289116612 |
3 | 3 | Costa Rica | La Cumbre | CQU2022017 | La Montana Tarrazu MIll | NaN | Coffee Quality Union | 1900 | Los Santos,Tarrazu | Santa Maria de Dota | ... | 87.17 | 11.8 | 0 | 0 | green | 0 | September 21st, 2023 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
4 | 4 | Colombia | Finca Santuario | CQU2023002 | Finca Santuario | NaN | Coffee Quality Union | 1850-2100 | Popayan,Cauca | Camilo Merizalde | ... | 87.08 | 11.6 | 0 | 2 | yellow-green | 2 | March 5th, 2024 | Japan Coffee Exchange | 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... | 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901 |
5 rows × 40 columns
# Column dtypes and non-null counts: a first look at where data is missing
df.info()
<class 'pandas.core.frame.DataFrame'> Int64Index: 207 entries, 0 to 206 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Unnamed: 0 207 non-null int64 1 Country of Origin 207 non-null object 2 Farm Name 205 non-null object 3 Lot Number 206 non-null object 4 Mill 204 non-null object 5 ICO Number 75 non-null object 6 Company 207 non-null object 7 Altitude 206 non-null object 8 Region 205 non-null object 9 Producer 206 non-null object 10 Number of Bags 207 non-null int64 11 Bag Weight 207 non-null object 12 In-Country Partner 207 non-null object 13 Harvest Year 207 non-null object 14 Grading Date 207 non-null object 15 Owner 207 non-null object 16 Variety 201 non-null object 17 Status 207 non-null object 18 Processing Method 202 non-null object 19 Aroma 207 non-null float64 20 Flavor 207 non-null float64 21 Aftertaste 207 non-null float64 22 Acidity 207 non-null float64 23 Body 207 non-null float64 24 Balance 207 non-null float64 25 Uniformity 207 non-null float64 26 Clean Cup 207 non-null float64 27 Sweetness 207 non-null float64 28 Overall 207 non-null float64 29 Defects 207 non-null float64 30 Total Cup Points 207 non-null float64 31 Moisture Percentage 207 non-null float64 32 Category One Defects 207 non-null int64 33 Quakers 207 non-null int64 34 Color 207 non-null object 35 Category Two Defects 207 non-null int64 36 Expiration 207 non-null object 37 Certification Body 207 non-null object 38 Certification Address 207 non-null object 39 Certification Contact 207 non-null object dtypes: float64(13), int64(5), object(22) memory usage: 66.3+ KB
# Summary statistics of the numeric columns (transposed for readability)
df.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Unnamed: 0 | 207.0 | 103.000000 | 59.899917 | 0.00 | 51.50 | 103.00 | 154.500 | 206.00 |
Number of Bags | 207.0 | 155.449275 | 244.484868 | 1.00 | 1.00 | 14.00 | 275.000 | 2240.00 |
Aroma | 207.0 | 7.721063 | 0.287626 | 6.50 | 7.58 | 7.67 | 7.920 | 8.58 |
Flavor | 207.0 | 7.744734 | 0.279613 | 6.75 | 7.58 | 7.75 | 7.920 | 8.50 |
Aftertaste | 207.0 | 7.599758 | 0.275911 | 6.67 | 7.42 | 7.58 | 7.750 | 8.42 |
Acidity | 207.0 | 7.690290 | 0.259510 | 6.83 | 7.50 | 7.67 | 7.875 | 8.58 |
Body | 207.0 | 7.640918 | 0.233499 | 6.83 | 7.50 | 7.67 | 7.750 | 8.25 |
Balance | 207.0 | 7.644058 | 0.256299 | 6.67 | 7.50 | 7.67 | 7.790 | 8.42 |
Uniformity | 207.0 | 9.990338 | 0.103306 | 8.67 | 10.00 | 10.00 | 10.000 | 10.00 |
Clean Cup | 207.0 | 10.000000 | 0.000000 | 10.00 | 10.00 | 10.00 | 10.000 | 10.00 |
Sweetness | 207.0 | 10.000000 | 0.000000 | 10.00 | 10.00 | 10.00 | 10.000 | 10.00 |
Overall | 207.0 | 7.676812 | 0.306359 | 6.67 | 7.50 | 7.67 | 7.920 | 8.58 |
Defects | 207.0 | 0.000000 | 0.000000 | 0.00 | 0.00 | 0.00 | 0.000 | 0.00 |
Total Cup Points | 207.0 | 83.706570 | 1.730417 | 78.00 | 82.58 | 83.75 | 84.830 | 89.33 |
Moisture Percentage | 207.0 | 10.735266 | 1.247468 | 0.00 | 10.10 | 10.80 | 11.500 | 13.50 |
Category One Defects | 207.0 | 0.135266 | 0.592070 | 0.00 | 0.00 | 0.00 | 0.000 | 5.00 |
Quakers | 207.0 | 0.690821 | 1.686918 | 0.00 | 0.00 | 0.00 | 1.000 | 12.00 |
Category Two Defects | 207.0 | 2.251208 | 2.950183 | 0.00 | 0.00 | 1.00 | 3.000 | 16.00 |
# Report the dataset dimensions and the full list of column names
print('Shape of the dataset: \n')
print(df.shape)
print('Columns names: \n')
print(df.columns)
Shape of the dataset: (207, 40) Columns names: Index(['Unnamed: 0', 'Country of Origin', 'Farm Name', 'Lot Number', 'Mill', 'ICO Number', 'Company', 'Altitude', 'Region', 'Producer', 'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year', 'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects', 'Total Cup Points', 'Moisture Percentage', 'Category One Defects', 'Quakers', 'Color', 'Category Two Defects', 'Expiration', 'Certification Body', 'Certification Address', 'Certification Contact'], dtype='object')
# Columns that are redundant ('Unnamed: 0' duplicates the index) or
# irrelevant for the analysis
cols_to_drop = ['Unnamed: 0', 'Lot Number', 'ICO Number',
                'Certification Address', 'Certification Contact', 'Mill']
df.drop(columns=cols_to_drop, inplace=True)
# Verify there are no duplicated rows
df[df.duplicated()].shape
(0, 34)
# Count of null values per column
df.isnull().sum()
Country of Origin 0 Farm Name 2 Company 0 Altitude 1 Region 2 Producer 1 Number of Bags 0 Bag Weight 0 In-Country Partner 0 Harvest Year 0 Grading Date 0 Owner 0 Variety 6 Status 0 Processing Method 5 Aroma 0 Flavor 0 Aftertaste 0 Acidity 0 Body 0 Balance 0 Uniformity 0 Clean Cup 0 Sweetness 0 Overall 0 Defects 0 Total Cup Points 0 Moisture Percentage 0 Category One Defects 0 Quakers 0 Color 0 Category Two Defects 0 Expiration 0 Certification Body 0 dtype: int64
# Collect every row that contains at least one missing value
null_mask = df.isna().any(axis=1)
rows_with_nulls = df.loc[null_mask]
print('Number of rows with missing values: ')
print(rows_with_nulls.shape[0])
Number of rows with missing values: 11
# Only a handful of rows contain missing values, so drop them outright
df.dropna(inplace=True)
# Confirm no rows with nulls remain
df[df.isna().any(axis=1)].shape
(0, 34)
# Shape after dropping the rows with missing values
print('Final shape of the data: ',df.shape)
Final shape of the data: (196, 34)
# Shorter column names for convenient access
rename_map = {
    "Country of Origin": "Country",
    "Farm Name": "FarmName",
    "Number of Bags": "NumBags",
    "Total Cup Points": "TotCupPoints",
    "Moisture Percentage": "MoisturePerc",
    "Category Two Defects": "Cat2Defects",
    "Certification Body": "CertBody",
}
df.rename(columns=rename_map, inplace=True)
# Unify spelling variants of the bean colors.
# NOTE: patterns are applied in order — the '...-green' variants are merged
# first, then the plain 'yellowish'/'green' patterns run on what remains.
color_mapping = {
    r'yellow green|yellow-green|yellow- green|yello-green': 'yellow-green',
    r'yellowish|pale yellow': 'yellow',
    r'blue-green|bluish-green': 'blue-green',
    r'browish-green': 'brownish',
    r'^green$|^greenish$': 'green',
}
for pattern, replacement in color_mapping.items():
    df['Color'] = df['Color'].str.replace(pat=pattern, repl=replacement, case=False, regex=True)
df['Color'].value_counts()
green 131 blue-green 30 yellow-green 16 yellow 10 brownish 9 Name: Color, dtype: int64
# Shorten 'Tanzania, United Republic of' to just 'Tanzania'
is_tanzania = df['Country'].str.contains('Tanzania', case=False, na=False)
df.loc[is_tanzania, 'Country'] = 'Tanzania'
# Define function that will process altitude values
def process_altitude(alt):
    """Normalize a raw altitude entry to a single integer number of meters.

    Range values such as '1700-1930' (separated by '-', 'A' or '~') are
    collapsed to the average of the first two numbers found. Plain numeric
    strings are converted directly; float-style strings such as '1200.0'
    are now accepted as well (previously they silently became None).

    Args:
        alt: Raw altitude value (string, number, or NaN).

    Returns:
        int altitude in meters, the original value if it is NaN,
        or None when the value cannot be parsed.
    """
    if pd.isna(alt):  # Keep NaN values untouched so dropna/isna still see them
        return alt
    alt_str = str(alt)
    # Range notation: average the first two numbers found
    if any(sep in alt_str for sep in ('-', 'A', '~')):
        numbers = re.findall(r'\d+', alt_str)
        if len(numbers) >= 2:
            # Return the average of the 2 endpoint values
            return int((int(numbers[0]) + int(numbers[1])) / 2)
    # Single values: accept integer strings, then fall back to float strings
    try:
        return int(alt_str)
    except ValueError:
        try:
            return int(float(alt_str))  # handles e.g. '1200.0'
        except ValueError:
            return None  # Unparseable altitude
df['Altitude'] = df['Altitude'].apply(process_altitude)
# Collapse the many raw processing-method labels into three canonical groups
processing_mapping = {
    "Washed / Wet": "Washed / Wet",
    "Double Anaerobic Washed": "Washed / Wet",
    "Semi Washed": "Washed / Wet",
    "Wet Hulling": "Washed / Wet",
    "Anaerobico 1000h": "Washed / Wet",
    "Pulped natural / honey": "Pulped natural / honey",
    "Honey,Mossto": "Pulped natural / honey",
    "Natural / Dry": "Natural / Dry",
    "Double Carbonic Maceration / Natural": "Natural / Dry",
    "SEMI-LAVADO": "Natural / Dry",
}
# Replace the raw labels with the canonical ones
df['Processing Method'] = df['Processing Method'].map(processing_mapping)
# OUTLIERS DETECTION
# Restrict the check to numeric columns only
numeric_cols = df.select_dtypes(include='number').columns
# Count IQR-based outliers (outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]) per column
for col in numeric_cols:
    first_quartile = df[col].quantile(0.25)
    third_quartile = df[col].quantile(0.75)
    iqr_range = third_quartile - first_quartile
    low = first_quartile - 1.5 * iqr_range
    high = third_quartile + 1.5 * iqr_range
    outlier_count = ((df[col] < low) | (df[col] > high)).sum()
    print(f"Outliers in {col}: {outlier_count}")
# Plot boxplots for each numeric column
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# Altitude, NumBags and TotCupPoints operate on much larger scales than the
# remaining numeric columns, so each gets its own axis
fig, axes = plt.subplots(1, 3, figsize=(12, 5))  # 1 row, 3 columns
flier_style = {"marker": "o", "markerfacecolor": "grey", "markersize": 7}
panels = [('Altitude', 'Altitude'),
          ('NumBags', 'Number of Bags'),
          ('TotCupPoints', 'Total Cup Points')]
for axis, (column, label) in zip(axes, panels):
    sns.boxplot(y=df[column], ax=axis, palette='YlOrBr',
                flierprops=flier_style, width=.5)
    axis.set_title(f'Boxplot of {label}')
    axis.set_ylabel(label)
    axis.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
# Exclude the three large-scale columns plotted above
rest_numeric_cols = [c for c in numeric_cols if c not in ('Altitude', 'NumBags', 'TotCupPoints')]
# Reshape to long format so every feature becomes a single box
df_melted = df[rest_numeric_cols].melt(var_name='Feature', value_name='Value')
plt.figure(figsize=(12, 8))
sns.boxplot(data=df_melted, x='Feature', y='Value', palette='YlOrBr',
            width=.5, linewidth=.6,
            flierprops={"marker": "o", "markerfacecolor": "grey", "markersize": 5})
# Customize the plot
plt.title("Boxplots of Numeric Features", fontsize=18)
plt.xticks(rotation=45)
plt.ylabel("Value")
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Outliers in Altitude: 4 Outliers in NumBags: 2 Outliers in Aroma: 3 Outliers in Flavor: 3 Outliers in Aftertaste: 7 Outliers in Acidity: 2 Outliers in Body: 9 Outliers in Balance: 9 Outliers in Uniformity: 2 Outliers in Clean Cup: 0 Outliers in Sweetness: 0 Outliers in Overall: 4 Outliers in Defects: 0 Outliers in TotCupPoints: 3 Outliers in MoisturePerc: 1 Outliers in Category One Defects: 12 Outliers in Quakers: 15 Outliers in Cat2Defects: 14
# Print a summary of the cleaned dataset (the \033 escapes render bold text
# in ANSI-capable terminals/notebooks)
print("\033[1mCleaning completed!\033[0m")
print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining")
# Number of unique values per column
for column in df.columns:
    print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
# print(f"- Countries: {df['Country'].nunique()} unique values")
# print(f"- Colors: {df['Color'].nunique()} unique values")
Cleaning completed! 196 rows and 34 columns remaining - Country: 22 unique values - FarmName: 165 unique values - Company: 69 unique values - Altitude: 75 unique values - Region: 115 unique values - Producer: 163 unique values - NumBags: 53 unique values - Bag Weight: 39 unique values - In-Country Partner: 20 unique values - Harvest Year: 7 unique values - Grading Date: 71 unique values - Owner: 75 unique values - Variety: 47 unique values - Status: 1 unique values - Processing Method: 3 unique values - Aroma: 19 unique values - Flavor: 19 unique values - Aftertaste: 20 unique values - Acidity: 19 unique values - Body: 17 unique values - Balance: 18 unique values - Uniformity: 3 unique values - Clean Cup: 1 unique values - Sweetness: 1 unique values - Overall: 21 unique values - Defects: 1 unique values - TotCupPoints: 79 unique values - MoisturePerc: 44 unique values - Category One Defects: 6 unique values - Quakers: 11 unique values - Color: 5 unique values - Cat2Defects: 14 unique values - Expiration: 71 unique values - CertBody: 20 unique values
# Get unique farms per country
unique_farms = df.groupby('Country')['FarmName'].nunique().sort_values(ascending=False)
# Calculate mean cup points for each country
mean_cup_points = df.groupby('Country')['TotCupPoints'].mean()
# Combine both metrics into one frame (aligned on country)
country_analysis = pd.DataFrame({
    'Number of Farms': unique_farms,
    'Mean Cup Points': mean_cup_points
}).reset_index()
# Several countries share the same farm count, so use a dense rank
country_analysis['Rank'] = country_analysis['Number of Farms'].rank(method='dense',
                                                                    ascending=False)
top10_countries = country_analysis[country_analysis['Rank'] <= 10].sort_values(['Rank', 'Country'],
                                                                               ignore_index=True)
# Group by rank and aggregate the countries
rank_groups = top10_countries.groupby('Rank').agg({
    'Country': lambda x: ', '.join(sorted(x)),
    'Number of Farms': 'first'  # All countries in the same rank share the count
}).reset_index()
# FIX: points descending, but ties broken alphabetically (ascending) to stay
# consistent with the rank table above — a single ascending=False previously
# sorted tied countries in reverse alphabetical order
top10_cuppoints = country_analysis.sort_values(['Mean Cup Points', 'Country'],
                                               ascending=[False, True],
                                               ignore_index=True).head(10)
# Print top countries by number of farms
print(f"\033[1mTop 10 countries by number of farms:\033[0m")
print(top10_countries.to_string(index=False))
# Print top countries by cup points
print(f"\033[1m\nTop 10 countries by mean cup points:\033[0m")
print(top10_cuppoints.to_string(index=False))
Top 10 countries by number of farms: Country Number of Farms Mean Cup Points Rank Taiwan 56 84.363559 1.0 Colombia 16 83.781250 2.0 Guatemala 15 84.416500 3.0 Ethiopia 10 84.960909 4.0 Brazil 8 81.541250 5.0 Honduras 7 83.282308 6.0 Thailand 6 82.827500 7.0 Costa Rica 5 83.740000 8.0 El Salvador 5 81.532857 8.0 United States (Hawaii) 5 83.650000 8.0 Mexico 4 82.710000 9.0 Nicaragua 4 82.125000 9.0 Tanzania 4 84.735000 9.0 Vietnam 4 82.892500 9.0 Laos 3 83.390000 10.0 Peru 3 82.332500 10.0 Uganda 3 83.916667 10.0 Top 10 countries by mean cup points: Country Number of Farms Mean Cup Points Rank Ethiopia 10 84.960909 4.0 Tanzania 4 84.735000 9.0 Guatemala 15 84.416500 3.0 Taiwan 56 84.363559 1.0 Madagascar 1 84.250000 12.0 Indonesia 1 84.250000 12.0 Uganda 3 83.916667 10.0 Colombia 16 83.781250 2.0 Costa Rica 5 83.740000 8.0 Kenya 2 83.710000 11.0
def create_hbarplot(data, x_col, y_col, title, xlim=None, figsize=(12, 8), bar_labels=False):
    """
    Create a horizontal bar plot with customized styling.
    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis values
        y_col: Column name for y-axis categories
        title: Plot title
        xlim: Optional tuple for x-axis limits (min, max)
        figsize: Optional tuple for figure size, defaults to (12, 8)
        bar_labels: Optional, add custom labels to the bars
    Returns:
        (fig, ax): the created matplotlib Figure and Axes
    """
    # BUG FIX: honor the figsize argument — it was previously ignored in
    # favor of a hard-coded (12, 8), so callers passing figsize had no effect
    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    # Create the bar plot
    bars = sns.barplot(
        data=data,
        x=x_col,
        y=y_col,
        palette=sns.color_palette('YlOrBr', n_colors=len(data) + 3),
        edgecolor='lightgray',
        ax=ax
    )
    # Write value/category labels on either side of every bar tip
    if bar_labels:
        for index, row in data.iterrows():
            # Numeric value, right-aligned just inside the bar tip
            ax.text(row[x_col] - 0.2,
                    index,
                    str(round(row[x_col], 2)) + ':',
                    va='center',
                    color='black',
                    ha='right'
                    )
            # Category name, left-aligned just outside the bar tip
            ax.text(row[x_col] + 0.2,
                    index,
                    str(row[y_col]),
                    va='center',
                    color='black',
                    ha='left'
                    )
    # Customize the plot
    ax.set_xlabel(x_col)
    ax.set_ylabel(None)
    ax.set_yticks([])  # Category names are drawn next to the bars instead
    ax.set_title(title, fontsize=18)
    ax.grid(axis='both', linestyle='--', alpha=0.7)
    if xlim:
        ax.set_xlim(xlim)
    else:
        ax.set_xlim(0, max(data[x_col]) * 1.1)  # 10% headroom by default
    plt.tight_layout()
    return fig, ax
# Countries grouped by their (tied) number of farms
fig, ax = create_hbarplot(
    data=rank_groups,
    x_col='Number of Farms',
    y_col='Country',
    title='Countries Grouped by Number of Farms',
    xlim=(0, max(rank_groups['Number of Farms']) * 1.07),
    bar_labels=True
)
# Call out the leading country
ax.annotate(
    f"Taiwan has the highest \nnumber of farms despite \nits small size.",
    xy=(54, 0), xytext=(38, 2),
    arrowprops=dict(arrowstyle="->", color="gray"),
    fontsize=16,
    color='chocolate',
    fontweight='bold'
)
plt.show()
# Top 10 countries ranked by mean total cup points
fig, ax_cup = create_hbarplot(
    data=top10_cuppoints,
    x_col='Mean Cup Points',
    y_col='Country',
    title='Top 10 Countries by Mean Total Cup Points',
    xlim=(80, max(top10_cuppoints['Mean Cup Points']) * 1.015),
    figsize=(8, 8),
    bar_labels=True,
)
# Call out the top scorer
ax_cup.annotate(
    "Ethiopia \nis known for \nspecialty coffee \nvarieties",
    xy=(85.4, 0.2), xytext=(85.15, 3.6),
    arrowprops=dict(arrowstyle="->", color="gray"),
    fontsize=16,
    color='chocolate',
    fontweight='bold'
)
plt.show()
# Scatterplot: number of farms vs mean cup quality per country
plt.figure(figsize=(10, 8))
sns.scatterplot(x=country_analysis['Number of Farms'],
                y=country_analysis['Mean Cup Points'],
                hue=np.log(country_analysis['Number of Farms']),
                s=200,
                alpha=0.6,
                palette='YlOrBr', edgecolor='#3D1C02', linewidth=0.5)
plt.xlabel('Number of Farms')
plt.ylabel('Mean Cup Points')
plt.title('Relationship Between Number of Farms and Cup Quality', fontsize=16)
plt.grid(True, alpha=0.3)
plt.legend().remove()  # The log-scaled hue legend is not informative
# Log x-axis with explicit, human-readable tick labels
plt.xscale('log')
tick_positions = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60]
plt.xticks(tick_positions, labels=[str(t) for t in tick_positions])
plt.tight_layout()
plt.show()
# Frequency of each bean color
coffee_colors = df['Color'].value_counts()
# Turn the counts into a two-column DataFrame
coffee_colors_df = coffee_colors.rename_axis('Color').reset_index(name='Count')
print(coffee_colors_df)
Color Count 0 green 131 1 blue-green 30 2 yellow-green 16 3 yellow 10 4 brownish 9
# Map each coffee color name to an actual display color
color_mapping = {
    'green': '#006400',         # Dark green
    'blue-green': '#029386',    # Teal
    'yellow-green': '#9ACD32',  # Yellow-green
    'brownish': '#A0522D',      # Sienna/brown
    'yellow': '#FBDD7E',        # Wheat
}
plt.figure(figsize=(6, 6))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
labels = coffee_colors_df['Color']  # Coffee color names
sizes = coffee_colors_df['Count']   # Frequency of each color
# Display colors in the same order as the labels
colors = [color_mapping[color] for color in labels]
wedges, texts, autotexts = plt.pie(sizes, labels=labels, colors=colors,
                                   autopct='%1.1f%%', pctdistance=0.8, textprops={})
# White, centered percentage labels
for autotext in autotexts:
    autotext.set_horizontalalignment('center')
    autotext.set_color('white')
plt.axis('equal')  # Equal aspect ratio ensures the pie is drawn as a circle
plt.title('Distribution of Coffee Colors', fontsize=18)
plt.tight_layout()
plt.show()
plt.figure(figsize=(10, 5))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# Log scale keeps low- and high-altitude farms readable on one axis
plt.yscale('log')
max_alt = df['Altitude'].max()
# Explicit tick positions so the log axis stays human-readable
altitude_ticks = [150, 300, 600, 1200, 2400, 4800]
plt.yticks(ticks=altitude_ticks, labels=[str(t) for t in altitude_ticks])
plt.minorticks_off()  # Show only the explicit ticks above
print(f"\033[1m There is no strong correlation between the altitude and the color of the coffee:\033[0m")
sns.swarmplot(x=df['Color'], y=df['Altitude'], palette=color_mapping, s=6,
              edgecolor='black', linewidth=0.5, alpha=0.8, dodge=True)
plt.title('Altitude Distribution by Coffee Bean Color', fontsize=18)
plt.ylabel('Altitude (meters, log scale)')
plt.xlabel('Bean Color')
plt.grid(axis='both', linestyle='--', alpha=0.7)
# Annotate the mean altitude of each color at a fixed height (3000 m)
for position, color in enumerate(df['Color'].unique()):
    mean_alt = df[df['Color'] == color]['Altitude'].mean()
    plt.text(
        position, 3000,
        f"Mean: {round(mean_alt,1)}m",
        ha='center',
        fontsize=9
    )
plt.tight_layout()
plt.show()
There is no strong correlation between the altitude and the color of the coffee:
# Mean of each taste feature per coffee color, shown as a heatmap
color_labels = df['Color'].unique()
# list of features
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
# Means per (feature, color) pair
feature_means = {}
for feature in features:
    feature_means[feature] = [df[df['Color'] == color][feature].mean()
                              for color in color_labels]
# Rows = features, columns = colors
heatmap_data = pd.DataFrame(feature_means, index=color_labels).T
# Order the color columns by mean total cup points (best first)
color_by_points = df.groupby('Color')['TotCupPoints'].mean().sort_values(ascending=False).index
heatmap_data = heatmap_data[color_by_points]
plt.figure(figsize=(10, 9))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# Center the colormap on the overall mean score
mean_value = heatmap_data.values.mean()
hm = sns.heatmap(
    heatmap_data,
    annot=True,
    fmt=".2f",
    cmap='YlOrBr',
    center=mean_value,
    cbar_kws={'label': 'Mean Score'},
    linewidths=0.5
)
# Outline the best-scoring color in every feature row
for row_idx, feature in enumerate(features):
    best_color = heatmap_data.loc[feature].idxmax()
    col_idx = list(heatmap_data.columns).index(best_color)
    hm.add_patch(plt.Rectangle((col_idx, row_idx), 1, 1, fill=False, edgecolor='black', lw=2))
plt.yticks(rotation=0, ha='right')
plt.xticks(rotation=20, ha='right')
plt.title('Mean Feature Values by Coffee Color', fontsize=18)
plt.xlabel('Coffee Color (ordered by total cup points)')
plt.ylabel('Taste Feature')
plt.show()
# Create a radar chart comparing flavor profiles by color
color_labels = df['Color'].unique()
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
# Compute mean values for each color and feature
means = {}
for color in color_labels:
    means[color] = [df[df['Color'] == color][feature].mean() for feature in features]
# Create the radar chart
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# One angle per feature, evenly spaced around the circle
angles = np.linspace(0, 2*np.pi, len(features), endpoint=False).tolist()
angles += angles[:1]  # Close the loop
# Plot each color
for color in color_labels:
    # BUG FIX: build the closed polygon as a new list. The previous
    # "values += values[:1]" mutated the list stored in means[color] in place,
    # corrupting `means` for any later reuse or cell re-run.
    values = means[color] + means[color][:1]  # Close the loop
    ax.plot(angles, values, color=color_mapping[color], linewidth=2, label=color)
    ax.fill(angles, values, color=color_mapping[color], alpha=0.25)
# Set feature labels
ax.set_xticks(angles[:-1], features)
# Add padding for better label visibility
for tick in ax.xaxis.get_major_ticks():
    tick.set_pad(15)
# Narrow y-range so the small differences between colors stay visible
ax.set_ylim(7.5, 7.95)
ax.set_yticks([7.5, 7.6, 7.7, 7.8, 7.9])
ax.set_title('Flavor Profile Comparison by Coffee Color', fontsize=18, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
<matplotlib.legend.Legend at 0x16549e746a0>
# Mean total cup points per color, sorted best-first, as a DataFrame
df_cuppoints = (
    df.groupby('Color')['TotCupPoints']
      .mean()
      .sort_values(ascending=False)
      .reset_index(name='TotCupPoints')
)
# Display the DataFrame
print(df_cuppoints)
Color TotCupPoints 0 brownish 84.776667 1 yellow-green 84.233125 2 yellow 83.792000 3 green 83.649160 4 blue-green 83.455333
# Horizontal bar plot of mean total cup points per color
plt.figure(figsize=(6, 5))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
bars = sns.barplot(data=df_cuppoints, x='TotCupPoints', y='Color',
                   palette=color_mapping,
                   edgecolor='lightgray')
# Write "<color>: <points>" inside the tip of every bar
for index, row in df_cuppoints.iterrows():
    bar_label = str(row['Color']) + ': ' + str(round(float(row['TotCupPoints']), 2))
    plt.text(row['TotCupPoints'] - 0.1,
             index,
             bar_label,
             va='center',
             color='white',
             ha='right',
             weight='bold')
# Customize the plot
plt.xlabel('Mean Total Cup Points')
plt.ylabel(None)
plt.yticks([])  # Color names are written on the bars themselves
plt.title('Mean Total Cup Points by Color', fontsize=18)
plt.grid(axis='x', linestyle='--', alpha=0.8)
# Start at 80 so the small differences between bars remain visible
plt.xlim(80, max(df_cuppoints['TotCupPoints']) * 1.001)
plt.tight_layout()
plt.show()
# Perform statistical tests to verify significance of color differences
from scipy import stats
color_labels = df['Color'].unique()
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'TotCupPoints']
# Collect one result row per feature, then build the DataFrame once.
# FIX: DataFrame.append was removed in pandas 2.0; building from a list of
# dicts is both compatible and faster than appending row by row.
result_rows = []
for feature in features:
    # Perform one-way ANOVA across the color groups
    groups = [df[df['Color'] == color][feature] for color in color_labels]
    f_stat, p_val = stats.f_oneway(*groups)
    result_rows.append({
        'Feature': feature,
        'F_statistic': round(f_stat, 2),
        'p_value': round(p_val, 4),
        'Significant': 'Yes' if p_val < 0.05 else 'No'
    })
results = pd.DataFrame(result_rows, columns=['Feature', 'F_statistic', 'p_value', 'Significant'])
print("Statistical Significance of Color Differences:")
print(results)
# Create a bar chart of p-values (-log10 so taller bars = stronger evidence)
fig, ax = plt.subplots(figsize=(10, 6))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
bars = ax.bar(results['Feature'], -np.log10(results['p_value']),
              edgecolor=['green' if sig == 'Yes' else 'gray' for sig in results['Significant']],
              linewidth=2.5,
              fill=False, hatch='//')
# Add significance threshold line
ax.axhline(y=-np.log10(0.05), color='purple', linestyle='--',
           label='Significance threshold (p=0.05)')
ax.set_ylabel('-log10(p_value)')
ax.set_title('Statistical Significance of Coffee Color Impact', fontsize=18)
# FIX: rotate the existing tick labels in place. Calling set_xticklabels
# without set_xticks is deprecated and can silently mislabel the bars.
plt.setp(ax.get_xticklabels(), rotation=45)
# Add annotation summarizing how many features are significant
significant_count = results['Significant'].value_counts().get('Yes', 0)
ax.annotate(
    f"{significant_count} out of {len(features)} features show significant differences",
    xy=(0.5, 0.9),
    xycoords='axes fraction',
    ha='center',
    fontsize=13
)
plt.legend(loc=(0.58, 0.7))
plt.tight_layout()
Statistical Significance of Color Differences: Feature F_statistic p_value Significant 0 Aroma 1.27 0.2851 No 1 Flavor 2.19 0.0712 No 2 Aftertaste 1.82 0.1258 No 3 Acidity 0.76 0.5542 No 4 Body 0.95 0.4374 No 5 Balance 1.28 0.2782 No 6 TotCupPoints 1.44 0.2221 No
def format_company_report(df, top_n=3):
    """Generate a formatted HTML report for the top_n companies (ranked by
    number of distinct farms): top countries, quality metrics, other stats."""
    from IPython.display import display, HTML
    # Companies with the biggest number of farms
    companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n)
    df_companies = companies.reset_index(name='Number of Farms')
    # Features we want to analyze
    features = ['Aroma', 'Flavor', 'Overall', 'TotCupPoints']
    # Inline CSS for the report tables. NOTE: the backslashes continue the
    # string literals across source lines, so the line break is part of the
    # string — do not re-indent these continuation lines.
    html_output = "<style>.company-table {border-collapse: collapse; width: 50%; margin-bottom: 20px;} "
    html_output += ".company-table th {background-color: #D4A76A; color: white; text-align: left; \
padding: 8px; font-size: 14px} "
    html_output += ".company-table td {padding: 8px; border-bottom: 1px solid #ddd; font-size: 14px} "
    html_output += ".company-header {background-color: #8B4513; width: 55%; color: white;\
font-weight: bold; padding: 10px; font-size: 18px;}</style>"
    # Loop through top companies, emitting one header + table per company
    for i, (company, farm_count) in enumerate(zip(df_companies['Company'], df_companies['Number of Farms'])):
        company_data = df[df['Company'] == company]
        # Company header
        html_output += f"<div class='company-header'>{i+1}. {company} ({farm_count} farms)</div>"
        # Create a table for this company's stats
        html_output += "<table class='company-table'>"
        # Top countries section
        top3_countries = company_data.groupby('Country')['FarmName'].nunique().sort_values(ascending=False).head(3)
        html_output += "<tr><th colspan='2'>Top 3 Countries by Farm Count</th></tr>"
        for country, count in top3_countries.items():
            html_output += f"<tr><td>{country}</td><td>{count} farms</td></tr>"
        # Feature averages section
        html_output += "<tr><th colspan='2'>Quality Metrics</th></tr>"
        for feature in features:
            feature_mean = company_data[feature].mean()
            html_output += f"<tr><td>{feature}</td><td>{feature_mean:.2f}</td></tr>"
        # Additional metrics
        html_output += "<tr><th colspan='2'>Other Metrics</th></tr>"
        html_output += f"<tr><td>Mean Altitude</td><td>{company_data['Altitude'].mean():.0f} m</td></tr>"
        html_output += f"<tr><td>Most Common Processing Method</td><td>{company_data['Processing Method'].value_counts().index[0]}</td></tr>"
        html_output += "</table>"
    return HTML(html_output)
# Render the report for the default top 3 companies
format_company_report(df)
Top 3 Countries by Farm Count | |
---|---|
Taiwan | 27 farms |
United States (Hawaii) | 5 farms |
Guatemala | 4 farms |
Quality Metrics | |
Aroma | 7.94 |
Flavor | 7.92 |
Overall | 7.86 |
TotCupPoints | 84.82 |
Other Metrics | |
Mean Altitude | 1161 m |
Most Common Processing Method | Washed / Wet |
Top 3 Countries by Farm Count | |
---|---|
Taiwan | 25 farms |
Quality Metrics | |
Aroma | 7.63 |
Flavor | 7.69 |
Overall | 7.61 |
TotCupPoints | 83.16 |
Other Metrics | |
Mean Altitude | 372 m |
Most Common Processing Method | Pulped natural / honey |
Top 3 Countries by Farm Count | |
---|---|
Colombia | 6 farms |
Thailand | 2 farms |
Brazil | 1 farms |
Quality Metrics | |
Aroma | 7.78 |
Flavor | 7.76 |
Overall | 7.78 |
TotCupPoints | 84.11 |
Other Metrics | |
Mean Altitude | 1593 m |
Most Common Processing Method | Washed / Wet |
def compare_top_companies(df, top_n=3):
    """Create visual comparisons between top coffee companies"""
    # Companies ranked by how many distinct farms they source from
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    # Keep only rows belonging to those companies
    top_df = df[df['Company'].isin(top_companies)]
    # 2x2 grid: one boxplot per taste feature
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    features = ['Aroma', 'Flavor', 'Aftertaste', 'Balance']
    for axis, feature in zip(axes.flatten(), features):
        sns.boxplot(x='Company', y=feature, data=top_df, ax=axis, palette='YlOrBr')
        axis.set_title(f'Distribution of {feature} Scores')
        axis.set_xlabel('')
        # Break long company names onto multiple lines
        wrapped = [label.get_text().replace(" ", "\n") for label in axis.get_xticklabels()]
        axis.set_xticklabels(wrapped)
        axis.set_ylabel(feature)
        axis.grid(axis='y', linestyle='--', alpha=0.7)
    # Add suptitle to the figure
    fig.suptitle('Feature Comparison Between Top Coffee Companies', fontsize=18)
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)  # Leave room for the suptitle
    return fig
fig = compare_top_companies(df)
plt.show()
def plot_cup_points_distribution(df, top_n=3):
    """Compare Total Cup Points distributions among top companies.

    Companies are ranked by number of distinct farms; the top `top_n`
    are shown as violin plots (with inner boxes), overlaid raw points,
    per-company mean markers, and a one-way ANOVA annotation.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Company', 'FarmName' and 'TotCupPoints' columns.
    top_n : int, default 3
        Number of companies to compare.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # Get top companies
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    # Bug fix: pin the categorical order explicitly.  Seaborn otherwise
    # orders categories by first appearance in the data, which need not
    # match `top_companies`; the mean-marker loop below places markers
    # by position in this sequence, so without a fixed order the means
    # could be drawn over the wrong violin.
    order = list(top_companies)
    # Create filtered dataframe
    top_df = df[df['Company'].isin(top_companies)]
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    # Create violin plot with inner box plot
    sns.violinplot(
        x='Company',
        y='TotCupPoints',
        data=top_df,
        order=order,
        palette='YlOrBr',
        inner='box',
        ax=ax
    )
    # Add individual points with jitter
    sns.stripplot(
        x='Company',
        y='TotCupPoints',
        data=top_df,
        order=order,
        color='black',
        size=5,
        alpha=0.3,
        jitter=True,
        ax=ax
    )
    # Add mean markers (positions now guaranteed to match the violins)
    for i, company in enumerate(order):
        mean_val = top_df[top_df['Company'] == company]['TotCupPoints'].mean()
        ax.plot([i-0.2, i+0.2], [mean_val, mean_val], color='gray', linewidth=12)
        ax.text(i, mean_val-0.2, f"Mean: {mean_val:.2f}", ha='center', fontsize=10, fontweight='bold', color='white')
    # Customize the plot
    ax.set_title('Distribution of Total Cup Points by Company', fontsize=18)
    ax.set_ylabel('Total Cup Points')
    ax.set_xlabel('')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    # Add statistical significance test results (one-way ANOVA across
    # the per-company score samples)
    from scipy import stats
    f_stat, p_val = stats.f_oneway(*[top_df[top_df['Company'] == company]['TotCupPoints'] for company in order])
    significance = "significant" if p_val < 0.05 else "not significant"
    ax.annotate(
        f"ANOVA p-value: {p_val:.4f} (differences are {significance})",
        xy=(0.43, 0.79),
        xycoords='figure fraction',
        ha='center',
        fontsize=12
    )
    return fig
# Render the cup-points distribution figure for the default top-3 companies
fig = plot_cup_points_distribution(df)
plt.show()
def create_company_correlation_matrices(df, top_n=3):
    """Create correlation matrices for top companies to compare their
    quality patterns.

    One heatmap per company (stacked vertically), all sharing a single
    color scale so correlation strengths are visually comparable.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain 'Company', 'FarmName' and the quality columns.
    top_n : int, default 3
        Number of companies (by unique farm count) to show.

    Returns
    -------
    matplotlib.figure.Figure
    """
    # Get top companies
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    # Features to analyze
    features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'TotCupPoints']
    # Create subplots
    fig, axes = plt.subplots(top_n, 1, figsize=(8, 15))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    # Bug fix: precompute every company's correlation matrix and the
    # global minimum BEFORE plotting.  The original updated vmin inside
    # the plotting loop, so heatmaps drawn earlier could use a larger
    # vmin than later ones, defeating the shared color scale.
    corrs = {}
    vmin = 1
    for company in top_companies:
        corr = df[df['Company'] == company][features].corr()
        corrs[company] = corr
        if corr.min().min() < vmin:
            vmin = corr.min().min()
    # Plot correlation matrix for each company with a common scale
    for i, company in enumerate(top_companies):
        sns.heatmap(
            corrs[company],
            annot=True,
            fmt=".2f",
            cmap='YlOrBr',
            vmin=vmin,
            vmax=1,
            ax=axes[i],
            linewidths=0.5
        )
        axes[i].set_title(f"{company}", fontsize=14)
        axes[i].set_xticklabels(features, rotation=45)
    plt.tight_layout()
    plt.suptitle('Correlation Between Quality Metrics for Top Companies', fontsize=18, y=1.05)
    return fig
# Build and display the per-company correlation heatmaps
fig = create_company_correlation_matrices(df)
plt.show()
# Inspect the (renamed) column set produced by earlier cleaning steps
print(df.columns)
Index(['Country', 'FarmName', 'Company', 'Altitude', 'Region', 'Producer', 'NumBags', 'Bag Weight', 'In-Country Partner', 'Harvest Year', 'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects', 'TotCupPoints', 'MoisturePerc', 'Category One Defects', 'Quakers', 'Color', 'Cat2Defects', 'Expiration', 'CertBody'], dtype='object')
# Get top processing methods
process_methods = df['Processing Method'].value_counts()
top_methods = process_methods.head(3)
# Create a DataFrame for the top methods
# NOTE(review): process_methods_df is built here but never used below --
# confirm whether it can be dropped or was meant to feed a plot.
process_methods_df = pd.DataFrame({
    'ProcessMethod': top_methods.index,
    'Count': top_methods.values
})
# TOP COFFEE PROCESSING METHODS VISUALIZATION
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
fig, axes = plt.subplots(2, 1, figsize=(8, 15))
#plt.figure(figsize=(6,6))
# ------------------Plot 1: Pie chart----------------------------------
# Add percentage and absolute count to labels
# NOTE(review): percentages are relative to the top-3 total, not to the
# whole dataset -- confirm that is the intended reading.
total = sum(top_methods.values)
labels = [f"{method}\n({count:,} beans, {count/total:.1%})"
          for method, count in zip(top_methods.index, top_methods.values)]
pie_palette = sns.color_palette('YlOrBr',3)
wedges, texts = axes[0].pie(
    top_methods.values,
    colors=pie_palette,
    startangle=90,
    wedgeprops={'edgecolor': 'w', 'linewidth': 1.5}
)
# Add a white circle at the center to create a donut chart
centre_circle = plt.Circle((0, 0), 0.5, fc='white')
axes[0].add_artist(centre_circle)
# Wedge labels go in a side legend instead of on the chart itself
axes[0].legend(wedges, labels, loc="center left", bbox_to_anchor=(1, 0.5))
axes[0].set_title('Top 3 Coffee Processing Methods', fontsize=18)
axes[0].text(0, 0, f'Total: {total:,}', ha='center', va='center', fontsize=14)
# -------------------Plot 2: Horizontal bar chart with quality scores--------------------
# Create a new DataFrame with average quality scores for each processing method
quality_metrics = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
                   'Overall', 'TotCupPoints']
# Calculate mean scores for each processing method
method_quality = df[df['Processing Method'].isin(top_methods.index)].groupby('Processing Method')[quality_metrics].mean()
# Sort by total cup points
method_quality = method_quality.sort_values('TotCupPoints', ascending=False)
# Plot horizontal bar chart for TotCupPoints
# NOTE(review): bars are ordered by score while the pie colors follow
# count order, so the same method may get different colors in the two
# panels -- verify this is acceptable.
sns.barplot(
    x='TotCupPoints',
    y=method_quality.index,
    data=method_quality.reset_index(),
    palette=pie_palette,
    ax=axes[1]
)
# Add data labels at the end of each bar
for i, v in enumerate(method_quality['TotCupPoints']):
    axes[1].text(v + 0.01, i, f"{v:.2f}", va='center')
axes[1].set_title('Average Quality Score by Processing Method', fontsize=18)
axes[1].set_xlabel('Total Cup Points')
axes[1].set_ylabel('Processing Method')
# Set x-axis to start at a reasonable value to better show differences
min_score = method_quality['TotCupPoints'].min() - 0.5
max_score = method_quality['TotCupPoints'].max() + 0.1
axes[1].set_xlim(min_score, max_score)
# Add annotations explaining processing methods
method_descriptions = {
    "Washed / Wet": "Beans have the fruit removed before drying, resulting in clean, bright flavors",
    "Natural / Dry": "Beans are dried with the fruit intact, creating fruity, complex flavors",
    "Pulped natural / honey": "Partial removal of fruit before drying, balancing sweetness and acidity",
    "Semi-washed / Semi-pulped": "Hybrid method with some mucilage left on during drying"
}
# Add annotations only for the methods that made the top 3
plt.figtext(0.5, 0.02, "\n".join([f"{k}: {v}" for k, v in method_descriptions.items()
                                  if k in top_methods.index]),
            ha='center', fontsize=14, bbox=dict(facecolor='white', alpha=0.5))
#plt.tight_layout()
plt.subplots_adjust(bottom=0.15) # Make room for annotations
# Feature engineering: "Coffee Age" = days between harvest and expiration.
# "Harvest Year" can be a range like "2021/2022"; keep the earlier year.
df['Harvest Year'] = df['Harvest Year'].str.split('/').str[0].str.strip()
# Parse both date columns: the bare year via pandas, the free-form
# expiration strings (e.g. "September 21st, 2023") via dateutil's parser.
df['Harvest Year'] = pd.to_datetime(df['Harvest Year'], format='%Y')
df['Expiration'] = df['Expiration'].apply(parser.parse)
# Coffee age in days: from the start of the harvest year to expiration
df['Coffee Age'] = (df['Expiration'] - df['Harvest Year']).dt.days
# Flag extreme ages with the standard 1.5 * IQR fence
first_q, third_q = df['Coffee Age'].quantile([0.25, 0.75])
fence = 1.5 * (third_q - first_q)
lower_bound = first_q - fence
upper_bound = third_q + fence
# Keep a copy of the data with the extreme outliers removed
df_filtered = df[df['Coffee Age'].between(lower_bound, upper_bound)].copy()
outliers_count = len(df) - len(df_filtered)
if outliers_count > 0:
    print(f"Found {outliers_count} outliers in Coffee Age ({outliers_count/len(df):.1%})")
# Columns examined by the exploratory plots below
coffee_features = ['Altitude', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
                   'Balance', 'Coffee Age', 'TotCupPoints']
Found 2 outliers in Coffee Age (1.0%)
# Coffee Age distribution: histogram + KDE (left) and boxplot (right)
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
fig_age, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 6))
ages = df['Coffee Age'].dropna()
# Left panel: histogram with a KDE overlay
sns.histplot(ages, kde=True, color='chocolate',
             edgecolor='#3D1C02', alpha=0.7, ax=ax1)
ax1.set_title('Coffee Age Distribution (All Data)', fontsize=18)
ax1.set_xlabel('Age (days)')
ax1.set_ylabel('Frequency')
# Mark the central tendency on the histogram
mean_age = df['Coffee Age'].mean()
median_age = df['Coffee Age'].median()
ax1.axvline(mean_age, color='#3D1C02', linestyle='--',
            label=f'Mean: {mean_age:.0f} days')
ax1.axvline(median_age, color='#8B4513', linestyle='-',
            label=f'Median: {median_age:.0f} days')
ax1.legend()
# Right panel: boxplot view of the same variable
sns.boxplot(y=ages, color='chocolate', ax=ax2)
ax2.set_title('Coffee Age Boxplot', fontsize=18)
ax2.set_ylabel('Age (days)')
# Note how many extreme values the IQR fence flagged earlier
if outliers_count > 0:
    ax2.text(0.8, 0.95, f"{outliers_count} outliers detected",
             transform=ax2.transAxes, ha='center',
             bbox=dict(facecolor='white', alpha=0.8))
plt.tight_layout()
# Coffee Age vs. quality score, using the outlier-filtered data
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
plt.figure(figsize=(10, 6))
# Scatter points plus an OLS trend line
sns.regplot(x='Coffee Age', y='TotCupPoints', data=df_filtered,
            scatter_kws={'alpha': 0.6, 'color': 'chocolate', 'edgecolor': '#3D1C02'},
            line_kws={'color': '#3D1C02'})
plt.title('Coffee Age vs. Overall Quality Score', fontsize=18)
plt.xlabel('Coffee Age (days)')
plt.ylabel('Total Cup Points')
plt.grid(True, linestyle='--', alpha=0.7)
# Pearson correlation between age and total score
corr = df_filtered['Coffee Age'].corr(df_filtered['TotCupPoints'])
plt.annotate(f"Correlation: {corr:.2f}",
             xy=(0.05, 0.95), xycoords='axes fraction',
             bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
# Translate the correlation into a one-line interpretation
if abs(corr) < 0.1:
    explanation = "Coffee age appears to have minimal impact on quality"
elif corr > 0:
    explanation = "Longer aging tends to slightly improve quality"
else:
    explanation = "Quality tends to slightly decrease with age"
plt.annotate(explanation, xy=(0.05, 0.89), xycoords='axes fraction',
             bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
plt.tight_layout()
# Pairplot of the coffee features - Filtered Data (without outliers)
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# Bug fix: removed the stray plt.figure(figsize=(15, 15)) that preceded
# the pairplot.  sns.pairplot always creates its own Figure, so that
# call only produced an empty figure (the notebook output showed
# "<Figure size 1500x1500 with 0 Axes>").
# Create pairplot: regression fits off-diagonal, KDEs on the diagonal
g = sns.pairplot(
    df_filtered[coffee_features],
    kind='reg',
    diag_kind='kde',
    plot_kws={
        'line_kws': {
            'color': '#3D1C02',
            'lw': 2
        },
        'scatter_kws': {
            's': 15,
            'alpha': 0.6,
            'color': 'chocolate',
            'edgecolor': '#3D1C02',
            'linewidth': 0.3
        }
    },
    diag_kws={
        'color': 'chocolate',
        'edgecolor': '#3D1C02',
        'alpha': 0.7,
        'linewidth': 1
    }
)
g.fig.suptitle('Relationships Between Coffee Features', y=1.02, fontsize=20)
# Add gridlines to each subplot
for ax in g.axes.flat:
    ax.grid(True, linestyle='--', alpha=0.6, color='lightgray')
    # Improve tick label readability
    ax.tick_params(axis='both', which='major', labelsize=8)
    # Slightly shrink axis labels where present
    if ax.get_xlabel():
        ax.set_xlabel(ax.get_xlabel(), fontsize=9)
    if ax.get_ylabel():
        ax.set_ylabel(ax.get_ylabel(), fontsize=9)
g.fig.tight_layout()
<Figure size 1500x1500 with 0 Axes>
# Lower-triangle correlation heatmap of the coffee features
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
plt.figure(figsize=(12, 10))
# Pairwise correlations on the outlier-filtered data
corr_matrix = df_filtered[coffee_features].corr()
# Mask the redundant upper triangle (including the all-1.0 diagonal)
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
# Draw the heatmap
heat_ax = sns.heatmap(
    corr_matrix.round(2),
    annot=True,
    fmt=".2f",
    cmap='YlOrBr',
    mask=mask,
    cbar_kws={'label': 'Correlation Coefficient'},
    linewidths=1,
    linecolor='white',
    square=True
)
plt.yticks(rotation=0, ha='right')
plt.xticks(rotation=45, ha='right')
plt.title('Correlation Between Coffee Features', fontsize=18, pad=20)
plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
# Coffee Age by Processing Method
plt.figure(figsize=(12, 6))
# Restrict to rows whose processing method is recorded
# (value_counts() drops NaN, so the isin() filter excludes them)
p_methods = df['Processing Method'].value_counts().index
method_data = df[df['Processing Method'].isin(p_methods)]
# Boxplot of age per method
sns.boxplot(x='Processing Method', y='Coffee Age', data=method_data,
            palette=sns.color_palette("YlOrBr", n_colors=len(p_methods)))
plt.title('Coffee Age by Processing Method', fontsize=18)
plt.xlabel('Processing Method')
plt.ylabel('Coffee Age (days)')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
# Summarize the mean age per method in a text box
avg_age_by_method = method_data.groupby('Processing Method')['Coffee Age'].mean().sort_values()
bullet_lines = [f"• {method}: {avg:.0f}" for method, avg in avg_age_by_method.items()]
annotation = "\nAverage age (days):\n\n" + "\n".join(bullet_lines) + "\n"
plt.annotate(annotation, xy=(0.75, 0.97), xycoords='axes fraction', va='top',
             bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
             fontsize=12)
plt.tight_layout()