In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.colors as mcolors

import warnings
# Do not display warnings
warnings.filterwarnings('ignore')

from dateutil import parser
import re

Info about data

In [2]:
# Load the data
df=pd.read_csv("df_arabica_clean.csv", index_col='ID')
df.head(5)
Out[2]:
Unnamed: 0 Country of Origin Farm Name Lot Number Mill ICO Number Company Altitude Region Producer ... Total Cup Points Moisture Percentage Category One Defects Quakers Color Category Two Defects Expiration Certification Body Certification Address Certification Contact
ID
0 0 Colombia Finca El Paraiso CQU2022015 Finca El Paraiso NaN Coffee Quality Union 1700-1930 Piendamo,Cauca Diego Samuel Bermudez ... 89.33 11.8 0 0 green 3 September 21st, 2023 Japan Coffee Exchange 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901
1 1 Taiwan Royal Bean Geisha Estate The 2022 Pacific Rim Coffee Summit,T037 Royal Bean Geisha Estate NaN Taiwan Coffee Laboratory 1200 Chiayi 曾福森 ... 87.58 10.5 0 0 blue-green 0 November 15th, 2023 Taiwan Coffee Laboratory 台灣咖啡研究室 QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... Lin, Jen-An Neil 林仁安 - 886-289116612
2 2 Laos OKLAO coffee farms The 2022 Pacific Rim Coffee Summit,LA01 oklao coffee processing plant NaN Taiwan Coffee Laboratory 1300 Laos Borofen Plateau WU TAO CHI ... 87.42 10.4 0 0 yellowish 2 November 15th, 2023 Taiwan Coffee Laboratory 台灣咖啡研究室 QAHWAH CO., LTD 4F, No. 225, Sec. 3, Beixin Rd... Lin, Jen-An Neil 林仁安 - 886-289116612
3 3 Costa Rica La Cumbre CQU2022017 La Montana Tarrazu MIll NaN Coffee Quality Union 1900 Los Santos,Tarrazu Santa Maria de Dota ... 87.17 11.8 0 0 green 0 September 21st, 2023 Japan Coffee Exchange 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901
4 4 Colombia Finca Santuario CQU2023002 Finca Santuario NaN Coffee Quality Union 1850-2100 Popayan,Cauca Camilo Merizalde ... 87.08 11.6 0 2 yellow-green 2 March 5th, 2024 Japan Coffee Exchange 〒413-0002 静岡県熱海市伊豆山1173−58 1173-58 Izusan, Ata... 松澤 宏樹 Koju Matsuzawa - +81(0)9085642901

5 rows × 40 columns

In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 207 entries, 0 to 206
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             207 non-null    int64  
 1   Country of Origin      207 non-null    object 
 2   Farm Name              205 non-null    object 
 3   Lot Number             206 non-null    object 
 4   Mill                   204 non-null    object 
 5   ICO Number             75 non-null     object 
 6   Company                207 non-null    object 
 7   Altitude               206 non-null    object 
 8   Region                 205 non-null    object 
 9   Producer               206 non-null    object 
 10  Number of Bags         207 non-null    int64  
 11  Bag Weight             207 non-null    object 
 12  In-Country Partner     207 non-null    object 
 13  Harvest Year           207 non-null    object 
 14  Grading Date           207 non-null    object 
 15  Owner                  207 non-null    object 
 16  Variety                201 non-null    object 
 17  Status                 207 non-null    object 
 18  Processing Method      202 non-null    object 
 19  Aroma                  207 non-null    float64
 20  Flavor                 207 non-null    float64
 21  Aftertaste             207 non-null    float64
 22  Acidity                207 non-null    float64
 23  Body                   207 non-null    float64
 24  Balance                207 non-null    float64
 25  Uniformity             207 non-null    float64
 26  Clean Cup              207 non-null    float64
 27  Sweetness              207 non-null    float64
 28  Overall                207 non-null    float64
 29  Defects                207 non-null    float64
 30  Total Cup Points       207 non-null    float64
 31  Moisture Percentage    207 non-null    float64
 32  Category One Defects   207 non-null    int64  
 33  Quakers                207 non-null    int64  
 34  Color                  207 non-null    object 
 35  Category Two Defects   207 non-null    int64  
 36  Expiration             207 non-null    object 
 37  Certification Body     207 non-null    object 
 38  Certification Address  207 non-null    object 
 39  Certification Contact  207 non-null    object 
dtypes: float64(13), int64(5), object(22)
memory usage: 66.3+ KB
In [4]:
df.describe().T
Out[4]:
count mean std min 25% 50% 75% max
Unnamed: 0 207.0 103.000000 59.899917 0.00 51.50 103.00 154.500 206.00
Number of Bags 207.0 155.449275 244.484868 1.00 1.00 14.00 275.000 2240.00
Aroma 207.0 7.721063 0.287626 6.50 7.58 7.67 7.920 8.58
Flavor 207.0 7.744734 0.279613 6.75 7.58 7.75 7.920 8.50
Aftertaste 207.0 7.599758 0.275911 6.67 7.42 7.58 7.750 8.42
Acidity 207.0 7.690290 0.259510 6.83 7.50 7.67 7.875 8.58
Body 207.0 7.640918 0.233499 6.83 7.50 7.67 7.750 8.25
Balance 207.0 7.644058 0.256299 6.67 7.50 7.67 7.790 8.42
Uniformity 207.0 9.990338 0.103306 8.67 10.00 10.00 10.000 10.00
Clean Cup 207.0 10.000000 0.000000 10.00 10.00 10.00 10.000 10.00
Sweetness 207.0 10.000000 0.000000 10.00 10.00 10.00 10.000 10.00
Overall 207.0 7.676812 0.306359 6.67 7.50 7.67 7.920 8.58
Defects 207.0 0.000000 0.000000 0.00 0.00 0.00 0.000 0.00
Total Cup Points 207.0 83.706570 1.730417 78.00 82.58 83.75 84.830 89.33
Moisture Percentage 207.0 10.735266 1.247468 0.00 10.10 10.80 11.500 13.50
Category One Defects 207.0 0.135266 0.592070 0.00 0.00 0.00 0.000 5.00
Quakers 207.0 0.690821 1.686918 0.00 0.00 0.00 1.000 12.00
Category Two Defects 207.0 2.251208 2.950183 0.00 0.00 1.00 3.000 16.00
In [5]:
print('Shape of the dataset: \n')
print(df.shape)
print('Column names: \n')
print(df.columns)
Shape of the dataset: 

(207, 40)
Column names: 

Index(['Unnamed: 0', 'Country of Origin', 'Farm Name', 'Lot Number', 'Mill',
       'ICO Number', 'Company', 'Altitude', 'Region', 'Producer',
       'Number of Bags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
       'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method',
       'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects',
       'Total Cup Points', 'Moisture Percentage', 'Category One Defects',
       'Quakers', 'Color', 'Category Two Defects', 'Expiration',
       'Certification Body', 'Certification Address', 'Certification Contact'],
      dtype='object')

Data Cleansing and Preparation

In [6]:
# Drop the redundant 'Unnamed: 0' column and other columns irrelevant to the analysis
df.drop(['Unnamed: 0', 'Lot Number', 'ICO Number', 'Certification Address', 
         'Certification Contact', 'Mill'], inplace=True, axis=1)
In [7]:
# Check for duplicate rows (there are none)
df[df.duplicated()].shape
Out[7]:
(0, 34)
In [8]:
# Null values for each column
df.isnull().sum()
Out[8]:
Country of Origin       0
Farm Name               2
Company                 0
Altitude                1
Region                  2
Producer                1
Number of Bags          0
Bag Weight              0
In-Country Partner      0
Harvest Year            0
Grading Date            0
Owner                   0
Variety                 6
Status                  0
Processing Method       5
Aroma                   0
Flavor                  0
Aftertaste              0
Acidity                 0
Body                    0
Balance                 0
Uniformity              0
Clean Cup               0
Sweetness               0
Overall                 0
Defects                 0
Total Cup Points        0
Moisture Percentage     0
Category One Defects    0
Quakers                 0
Color                   0
Category Two Defects    0
Expiration              0
Certification Body      0
dtype: int64
In [9]:
# Get all rows with at least one missing value
rows_with_nulls = df[df.isna().any(axis=1)]
print('Number of rows with missing values: ')
print(rows_with_nulls.shape[0])
Number of rows with missing values: 
11
In [10]:
# There are only a few rows with missing values, so I drop them
# Drop rows with any null values
df.dropna(axis=0, inplace=True)

df[df.isna().any(axis=1)].shape
Out[10]:
(0, 34)
In [11]:
print('Final shape of the data: ',df.shape)
Final shape of the data:  (196, 34)
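Dropping 11 of 207 rows (about 5%) is a defensible choice here. Had keeping every row been preferred, the categorical gaps could have been imputed instead; a minimal sketch, not applied in this notebook (the mode-fill strategy and the column list are my own assumptions):

# Hypothetical alternative to dropna (not applied above): fill categorical gaps with the column mode
for col in ['Farm Name', 'Region', 'Producer', 'Variety', 'Processing Method']:
    df[col] = df[col].fillna(df[col].mode().iloc[0])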
In [12]:
# Rename the columns for easier retrieval
df.rename(columns={"Country of Origin": "Country", "Farm Name": "FarmName", "Number of Bags": "NumBags",
                  "Total Cup Points": "TotCupPoints", "Moisture Percentage": "MoisturePerc",
                  "Category Two Defects": "Cat2Defects", "Certification Body": "CertBody"}, inplace=True)
In [13]:
# Normalize color names that refer to the same color
# Use a dictionary mapping and regex for color replacements
color_mapping = {
    r'yellow green|yellow-green|yellow- green|yello-green': 'yellow-green',
    r'yellowish|pale yellow': 'yellow',
    r'blue-green|bluish-green': 'blue-green',
    r'browish-green': 'brownish',
    r'^green$|^greenish$': 'green'
}

for pattern, replacement in color_mapping.items():
    df['Color'] = df['Color'].str.replace(pat=pattern, repl=replacement, case=False, regex=True)

df['Color'].value_counts()
Out[13]:
green           131
blue-green       30
yellow-green     16
yellow           10
brownish          9
Name: Color, dtype: int64
In [14]:
# Renaming 'Tanzania, United Republic of' to 'Tanzania'
mask = df['Country'].str.contains('Tanzania', case=False, na=False)
df.loc[mask, 'Country'] = 'Tanzania'
In [15]:
# Define function that will process altitude values
def process_altitude(alt):
    if pd.isna(alt):  # Handle NaN values
        return alt
    
    alt_str = str(alt)
    # Handle different separators
    if any(sep in alt_str for sep in ['-', 'A', '~']):
        # Extract numbers
        numbers = re.findall(r'\d+', alt_str)
        if len(numbers) >= 2:
            # Return the average of the two values
            return int((int(numbers[0]) + int(numbers[1])) / 2)
    
    # For single values
    try:
        return int(alt_str)
    except ValueError:
        return None  # Return None if conversion fails
    
df['Altitude'] = df['Altitude'].apply(process_altitude)
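A quick spot check of process_altitude on a few representative raw strings (the inputs below are illustrative, not taken from the dataset) helps confirm the parsing logic:

# Illustrative spot check of the altitude parser (example inputs are made up)
for raw in ['1700-1930', '1200', '1100~1300', 'not a number']:
    print(raw, '->', process_altitude(raw))
# Expected: 1815, 1200, 1200, None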
In [16]:
# Map the processing-method variants onto three standard categories
processing_mapping = {
    "Double Anaerobic Washed": "Washed / Wet",
    "Semi Washed": "Washed / Wet",
    "Honey,Mossto": "Pulped natural / honey",
    "Double Carbonic Maceration / Natural": "Natural / Dry",
    "Wet Hulling": "Washed / Wet",
    "Anaerobico 1000h": "Washed / Wet",
    "SEMI-LAVADO": "Natural / Dry",
    "Washed / Wet" : "Washed / Wet",
    "Pulped natural / honey": "Pulped natural / honey",
    "Natural / Dry": "Natural / Dry"
}
# Changing the values in the column
df['Processing Method'] = df['Processing Method'].map(processing_mapping)
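Because Series.map returns NaN for any label missing from processing_mapping, it is worth verifying that nothing was dropped silently; a minimal sketch:

# .map() leaves NaN for labels not covered by processing_mapping, so verify the mapping was exhaustive
unmapped = df['Processing Method'].isna().sum()
print(f'Processing methods left unmapped: {unmapped}')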
In [17]:
# OUTLIERS DETECTION

# Get only numeric columns
numeric_cols = df.select_dtypes(include='number').columns

# Outlier check for numeric columns
for col in numeric_cols:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
    print(f"Outliers in {col}: {len(outliers)}")


# Plot boxplots for each numeric column

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13

# Altitude, NumBags and TotCupPoints have larger values compared to other numeric columns,
# so plot them separately
fig, axes = plt.subplots(1, 3, figsize=(12, 5))  # 1 row, 3 columns

# Boxplot for Altitude
sns.boxplot(y=df['Altitude'], ax=axes[0], palette='YlOrBr', 
            flierprops={"marker": "o", "markerfacecolor": "grey", "markersize": 7}, width=.5)
axes[0].set_title('Boxplot of Altitude')
axes[0].set_ylabel('Altitude')
axes[0].grid(True, linestyle='--', alpha=0.7)

# Boxplot for NumBags
sns.boxplot(y=df['NumBags'], ax=axes[1], palette='YlOrBr',
            flierprops={"marker": "o", "markerfacecolor": "grey", "markersize": 7}, width=.5)
axes[1].set_title('Boxplot of Number of Bags')
axes[1].set_ylabel('Number of Bags')
axes[1].grid(True, linestyle='--', alpha=0.7)

# Boxplot for TotCupPoints
sns.boxplot(y=df['TotCupPoints'], ax=axes[2], palette='YlOrBr',
            flierprops={"marker": "o", "markerfacecolor": "grey", "markersize": 7}, width=.5)
axes[2].set_title('Boxplot of Total Cup Points')
axes[2].set_ylabel('Total Cup Points')
axes[2].grid(True, linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()

# Remove Altitude, NumBags and TotCupPoints from the rest of columns
rest_numeric_cols = [col for col in numeric_cols if col not in ['Altitude', 'NumBags', 'TotCupPoints']]
# Melt the DataFrame to long format
df_melted = df[rest_numeric_cols].melt(var_name='Feature', value_name='Value')

plt.figure(figsize=(12, 8))
sns.boxplot(x='Feature', y='Value', data=df_melted, palette='YlOrBr', 
            flierprops={"marker": "o", "markerfacecolor": "grey", "markersize": 5}, width=.5, linewidth=.6)
# Customize the plot
plt.title("Boxplots of Numeric Features", fontsize=18)
plt.xticks(rotation=45)
plt.ylabel("Value")
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.show()
Outliers in Altitude: 4
Outliers in NumBags: 2
Outliers in Aroma: 3
Outliers in Flavor: 3
Outliers in Aftertaste: 7
Outliers in Acidity: 2
Outliers in Body: 9
Outliers in Balance: 9
Outliers in Uniformity: 2
Outliers in Clean Cup: 0
Outliers in Sweetness: 0
Outliers in Overall: 4
Outliers in Defects: 0
Outliers in TotCupPoints: 3
Outliers in MoisturePerc: 1
Outliers in Category One Defects: 12
Outliers in Quakers: 15
Outliers in Cat2Defects: 14
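The outliers above are reported but left in the data. If capping them were preferred, one option is winsorizing at the same IQR fences; a minimal sketch, not applied here (the helper name is my own):

# Hypothetical helper (not used above): cap values at the 1.5*IQR fences instead of removing rows
def winsorize_iqr(series, k=1.5):
    q1, q3 = series.quantile(0.25), series.quantile(0.75)
    iqr = q3 - q1
    return series.clip(lower=q1 - k * iqr, upper=q3 + k * iqr)

# Example usage on a copy, leaving df itself unchanged
capped_numbags = winsorize_iqr(df['NumBags'])
print(capped_numbags.max())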
In [18]:
# Print a summary of the cleaned dataset
print("\033[1mCleaning completed!\033[0m")
print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining")
for column in df.columns:
    print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
# print(f"- Countries: {df['Country'].nunique()} unique values")
# print(f"- Colors: {df['Color'].nunique()} unique values")
Cleaning completed!
196 rows and 34 columns remaining
- Country: 22 unique values
- FarmName: 165 unique values
- Company: 69 unique values
- Altitude: 75 unique values
- Region: 115 unique values
- Producer: 163 unique values
- NumBags: 53 unique values
- Bag Weight: 39 unique values
- In-Country Partner: 20 unique values
- Harvest Year: 7 unique values
- Grading Date: 71 unique values
- Owner: 75 unique values
- Variety: 47 unique values
- Status: 1 unique values
- Processing Method: 3 unique values
- Aroma: 19 unique values
- Flavor: 19 unique values
- Aftertaste: 20 unique values
- Acidity: 19 unique values
- Body: 17 unique values
- Balance: 18 unique values
- Uniformity: 3 unique values
- Clean Cup: 1 unique values
- Sweetness: 1 unique values
- Overall: 21 unique values
- Defects: 1 unique values
- TotCupPoints: 79 unique values
- MoisturePerc: 44 unique values
- Category One Defects: 6 unique values
- Quakers: 11 unique values
- Color: 5 unique values
- Cat2Defects: 14 unique values
- Expiration: 71 unique values
- CertBody: 20 unique values

Data Analysis:

Top values

Top 10 countries producing coffee

In [19]:
# Get unique farms per country
unique_farms = df.groupby('Country')['FarmName'].nunique().sort_values(ascending=False)
# Calculate mean cup points for each country
mean_cup_points = df.groupby('Country')['TotCupPoints'].mean()

# Create a DataFrame with both metrics
country_analysis = pd.DataFrame({
    'Number of Farms': unique_farms,
    'Mean Cup Points': mean_cup_points
}).reset_index()

# Some countries have the same number of farms,
# so add a dense rank by farm count
country_analysis['Rank'] = country_analysis['Number of Farms'].rank(method='dense', 
                                            ascending=False)

top10_countries = country_analysis[country_analysis['Rank'] <= 10].sort_values(['Rank', 'Country'],
                                                                   ignore_index=True)
# Group by rank and aggregate the countries
rank_groups = top10_countries.groupby('Rank').agg({
    'Country': lambda x: ', '.join(sorted(x)),
    'Number of Farms': 'first'  # All countries in the same rank have the same number
}).reset_index()

top10_cuppoints = country_analysis.sort_values(['Mean Cup Points', 'Country'], ascending=False, ignore_index=True).head(10)

# Print top countries by number of farms
print(f"\033[1mTop 10 countries by number of farms:\033[0m")
print(top10_countries.to_string(index=False))

# Print top countries by cup points
print(f"\033[1m\nTop 10 countries by mean cup points:\033[0m")
print(top10_cuppoints.to_string(index=False))
Top 10 countries by number of farms:
               Country  Number of Farms  Mean Cup Points  Rank
                Taiwan               56        84.363559   1.0
              Colombia               16        83.781250   2.0
             Guatemala               15        84.416500   3.0
              Ethiopia               10        84.960909   4.0
                Brazil                8        81.541250   5.0
              Honduras                7        83.282308   6.0
              Thailand                6        82.827500   7.0
            Costa Rica                5        83.740000   8.0
           El Salvador                5        81.532857   8.0
United States (Hawaii)                5        83.650000   8.0
                Mexico                4        82.710000   9.0
             Nicaragua                4        82.125000   9.0
              Tanzania                4        84.735000   9.0
               Vietnam                4        82.892500   9.0
                  Laos                3        83.390000  10.0
                  Peru                3        82.332500  10.0
                Uganda                3        83.916667  10.0

Top 10 countries by mean cup points:
   Country  Number of Farms  Mean Cup Points  Rank
  Ethiopia               10        84.960909   4.0
  Tanzania                4        84.735000   9.0
 Guatemala               15        84.416500   3.0
    Taiwan               56        84.363559   1.0
Madagascar                1        84.250000  12.0
 Indonesia                1        84.250000  12.0
    Uganda                3        83.916667  10.0
  Colombia               16        83.781250   2.0
Costa Rica                5        83.740000   8.0
     Kenya                2        83.710000  11.0
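The dense rank used above means tied countries share a rank and no rank numbers are skipped; a toy illustration of rank(method='dense') on made-up counts:

# Toy illustration of rank(method='dense'): ties share a rank, no ranks are skipped
print(pd.Series([56, 16, 5, 5, 5, 4]).rank(method='dense', ascending=False))
# -> 1.0, 2.0, 3.0, 3.0, 3.0, 4.0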
In [20]:
def create_hbarplot(data, x_col, y_col, title, xlim=None, figsize=(12,8), bar_labels=False):
    """
    Create a horizontal bar plot with customized styling.
    
    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis values
        y_col: Column name for y-axis categories
        title: Plot title
        xlim: Optional tuple for x-axis limits (min, max)
        figsize: Optional tuple for figure size
        bar_labels: Optional, add custom labels to the bars
    """
    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    # Create the bar plot
    bars = sns.barplot(
        data=data, 
        x=x_col, 
        y=y_col,
        palette=sns.color_palette('YlOrBr', n_colors=len(data) + 3),
        edgecolor='lightgray',
        ax=ax
    )
    
    # Add labels to the bars
    if bar_labels:
        for index, row in data.iterrows():
            ax.text(row[x_col]-0.2,
                index,
                str(round(row[x_col],2))+':',
                va='center',
                color='black',
                ha='right'
                )
            ax.text(row[x_col]+0.2,
                index,
                str(row[y_col]),
                va='center',
                color='black',
                ha='left'
                )
    
    # Customize the plot
    ax.set_xlabel(x_col)
    ax.set_ylabel(None)
    ax.set_yticks([])
    ax.set_title(title, fontsize=18)
    ax.grid(axis='both', linestyle='--', alpha=0.7)
    
    if xlim:
        ax.set_xlim(xlim)
    else:
        ax.set_xlim(0, max(data[x_col]) * 1.1)
        
    plt.tight_layout()
    
    return fig, ax
In [21]:
fig, ax = create_hbarplot(data=rank_groups, x_col='Number of Farms', y_col='Country',
                title='Countries Grouped by Number of Farms',
                xlim=(0, max(rank_groups['Number of Farms']) * 1.07),
                bar_labels=True
               )

ax.annotate(
    f"Taiwan has the highest \nnumber of farms despite \nits small size.",
    xy=(54, 0), xytext=(38, 2),
    arrowprops=dict(arrowstyle="->", color="gray"),
    fontsize=16,
    color='chocolate',
    fontweight='bold'
)

plt.show()
In [22]:
fig, ax_cup = create_hbarplot(data=top10_cuppoints, x_col='Mean Cup Points', y_col='Country',
               title='Top 10 Countries by Mean Total Cup Points',
               xlim = (80, max(top10_cuppoints['Mean Cup Points'])*1.015),
                figsize=(8,8),
                bar_labels=True,
               )

ax_cup.annotate(
    "Ethiopia \nis known for \nspecialty coffee \nvarieties",
    xy=(85.4, 0.2), xytext=(85.15, 3.6),
    arrowprops=dict(arrowstyle="->", color="gray"),
    fontsize=16,
    color='chocolate',
    fontweight='bold'
)

plt.show()
In [23]:
# Create a scatterplot to examine relationship between farms and quality
plt.figure(figsize=(10, 8))
sns.scatterplot(x=country_analysis['Number of Farms'], y=country_analysis['Mean Cup Points'], 
            hue=np.log(country_analysis['Number of Farms']),
            s=200, 
            alpha=0.6, 
            palette='YlOrBr', edgecolor='#3D1C02', linewidth=0.5
               )

plt.xlabel('Number of Farms')
plt.ylabel('Mean Cup Points')
plt.title('Relationship Between Number of Farms and Cup Quality', fontsize=16)
plt.grid(True, alpha=0.3)
plt.legend().remove()
plt.xscale('log')

xticks = [1,2,3,4,5,6,7,8,9,10,20,30,40,50,60]
xlabels = [str(x) for x in xticks]
plt.xticks(xticks, labels=xlabels)

plt.tight_layout()
plt.show()

Top coffee colors

In [25]:
coffee_colors = df['Color'].value_counts()

# Creating a DataFrame for coffee colors
coffee_colors_df = pd.DataFrame({'Color': coffee_colors.index, 'Count' : coffee_colors.values})
print(coffee_colors_df)
          Color  Count
0         green    131
1    blue-green     30
2  yellow-green     16
3        yellow     10
4      brownish      9
In [26]:
# Creating a dictionary mapping coffee color names to actual colors
color_mapping = {
    'green': '#006400',        # Dark green
    'blue-green': '#029386',   # Teal
    'yellow-green': '#9ACD32', # Yellow-green
    'brownish': '#A0522D',     # Sienna/brown
    'yellow': '#FBDD7E',     # Wheat
}

plt.figure(figsize=(6,6))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13

# Extract data
labels = coffee_colors_df['Color']  # Coffee color names column
sizes = coffee_colors_df['Count']   # Count column

# Create a list of colors in the same order as labels
colors = [color_mapping[color] for color in labels]

pie = plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', pctdistance=0.8, textprops={})

# Customize percent labels
for autotext in pie[2]:
    autotext.set_horizontalalignment('center')
    autotext.set_color('white')

plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
plt.title('Distribution of Coffee Colors', fontsize=18)
plt.tight_layout()
plt.show()
In [27]:
plt.figure(figsize=(10,5))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13

# Set y-axis to logarithmic scale
plt.yscale('log')

# List of altitude values for y-axis ticks
max_alt = df['Altitude'].max()
yticks = [150,300,600,1200,2400,4800]
ylabels = [str(y) for y in yticks]
# Set y-axis ticks
plt.yticks(ticks=yticks, labels=ylabels)
# Disable minor ticks
plt.minorticks_off()

print(f"\033[1m There is no strong correlation between the altitude and the color of the coffee:\033[0m")
sns.swarmplot(x=df['Color'], y=df['Altitude'], palette=color_mapping, s=6, edgecolor='black',
              linewidth=0.5, alpha=0.8, dodge=True)

plt.title('Altitude Distribution by Coffee Bean Color', fontsize=18)
plt.ylabel('Altitude (meters, log scale)')
plt.xlabel('Bean Color')
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Add altitude range annotations for each color
for i, color in enumerate(df['Color'].unique()):
    color_data = df[df['Color'] == color]['Altitude']
    mean_alt = color_data.mean()
    
    # Add range text below the plot
    plt.text(
        i, 3000, 
        f"Mean: {round(mean_alt,1)}m", 
        ha='center', 
        fontsize=9
    )

plt.tight_layout()
plt.show()
 There is no strong correlation between the altitude and the color of the coffee:
In [28]:
color_labels = df['Color'].unique()
# list of features 
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']  
# Dictionary: for each feature, calculate means for all colors
feature_means = {
    feature: [df[df['Color'] == color][feature].mean() for color in color_labels]
    for feature in features
}
# Merge the data for creating a heatmap
heatmap_data = pd.DataFrame(feature_means, index=color_labels).T

# Order columns by total cup points
color_by_points = df.groupby('Color')['TotCupPoints'].mean().sort_values(ascending=False).index
heatmap_data = heatmap_data[color_by_points]

plt.figure(figsize=(10, 9))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13

# Create a diverging colormap centered around the mean
mean_value = heatmap_data.values.mean()
# Create heatmap with improved color scheme
hm = sns.heatmap(
        heatmap_data, 
        annot=True, 
        fmt=".2f", 
        cmap='YlOrBr',
        center=mean_value,
        cbar_kws={'label': 'Mean Score'},
        linewidths=0.5
    )

# Highlight the highest value in each row
for i, feature in enumerate(features):
    max_idx = heatmap_data.loc[feature].idxmax()
    j = list(heatmap_data.columns).index(max_idx)
    hm.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='black', lw=2))

plt.yticks(rotation=0, ha='right')
plt.xticks(rotation=20, ha='right')

plt.title('Mean Feature Values by Coffee Color', fontsize=18)
plt.xlabel('Coffee Color (ordered by total cup points)')
plt.ylabel('Taste Feature')
plt.show()
In [29]:
# Create a radar chart comparing flavor profiles by color
color_labels = df['Color'].unique()
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance']
    
# Compute mean values for each color and feature
means = {}
for color in color_labels:
    means[color] = [df[df['Color'] == color][feature].mean() for feature in features]
    
# Create the radar chart
fig, ax = plt.subplots(figsize=(10, 8), subplot_kw=dict(polar=True))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
    
angles = np.linspace(0, 2*np.pi, len(features), endpoint=False).tolist()
angles += angles[:1]  # Close the loop
    
# Plot each color
for color in color_labels:
    values = means[color]
    values += values[:1]  # Close the loop
    ax.plot(angles, values, color=color_mapping[color], linewidth=2, label=color)
    ax.fill(angles, values, color=color_mapping[color], alpha=0.25)
    
# Set feature labels
ax.set_xticks(angles[:-1], features)
# Add padding for better label visibility
for tick in ax.xaxis.get_major_ticks():
    tick.set_pad(15)
    
# Set y limits for consistency
ax.set_ylim(7.5, 7.95)
ax.set_yticks([7.5, 7.6, 7.7, 7.8, 7.9])
    
ax.set_title('Flavor Profile Comparison by Coffee Color', fontsize=18, y=1.1)
plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
Out[29]:
<matplotlib.legend.Legend at 0x16549e746a0>
In [30]:
# Retrieving mean value of total cup points for each color
cuppoints_means = df.groupby('Color')['TotCupPoints'].mean()

# Sort the list by numerical values in descending order
sorted_cuppoints = cuppoints_means.sort_values(ascending=False)

# Create a DataFrame for sorted points
df_cuppoints = sorted_cuppoints.reset_index(name='TotCupPoints')

# Display the DataFrame
print(df_cuppoints)
          Color  TotCupPoints
0      brownish     84.776667
1  yellow-green     84.233125
2        yellow     83.792000
3         green     83.649160
4    blue-green     83.455333
In [31]:
# Create the bar plot
plt.figure(figsize=(6, 5))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13

bars = sns.barplot(data=df_cuppoints, x='TotCupPoints', y='Color',
    palette=color_mapping,
    edgecolor='lightgray',
)

# Add labels to the bars
for index, row in df_cuppoints.iterrows():
    plt.text(row['TotCupPoints']-0.1,
            index,
            str(row['Color'])+': '+str(round(float(row['TotCupPoints']),2)),
            va='center',
            color='white',
            ha='right',
            weight='bold'
            )

# Customize the plot
plt.xlabel('Mean Total Cup Points')
plt.ylabel(None)
plt.yticks([])
plt.title('Mean Total Cup Points by Color', fontsize=18)
plt.grid(axis='x', linestyle='--', alpha=0.8)
plt.xlim(80, max(df_cuppoints['TotCupPoints'])*1.001)

plt.tight_layout()
plt.show()
In [32]:
# Perform statistical tests to verify the significance of color differences
from scipy import stats
    
color_labels = df['Color'].unique()
features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'TotCupPoints']
    
# Collect the results in a list, then build a DataFrame
# (DataFrame.append is deprecated/removed in recent pandas versions)
result_rows = []

for feature in features:
    # Perform one-way ANOVA
    groups = [df[df['Color'] == color][feature] for color in color_labels]
    f_stat, p_val = stats.f_oneway(*groups)

    result_rows.append({
        'Feature': feature,
        'F_statistic': round(f_stat, 2),
        'p_value': round(p_val, 4),
        'Significant': 'Yes' if p_val < 0.05 else 'No'
    })

results = pd.DataFrame(result_rows)
    
print("Statistical Significance of Color Differences:")
print(results)
    
# Create a bar chart of p-values
fig, ax = plt.subplots(figsize=(10, 6))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
    
bars = ax.bar(results['Feature'], -np.log10(results['p_value']), 
                 edgecolor=['green' if sig == 'Yes' else 'gray' for sig in results['Significant']],
                 linewidth=2.5,
                 fill=False, hatch='//')
    
# Add significance threshold line
ax.axhline(y=-np.log10(0.05), color='purple', linestyle='--', 
               label='Significance threshold (p=0.05)')
    
ax.set_ylabel('-log10(p_value)')
ax.set_title('Statistical Significance of Coffee Color Impact', fontsize=18)
ax.set_xticklabels(results['Feature'], rotation=45)
    
# Add annotation
significant_count = results['Significant'].value_counts().get('Yes', 0)
ax.annotate(
    f"{significant_count} out of {len(features)} features show significant differences",
    xy=(0.5, 0.9), 
    xycoords='axes fraction',
    ha='center',
    fontsize=13
    )
    
plt.legend(loc=(0.58,0.7))
plt.tight_layout()
Statistical Significance of Color Differences:
        Feature  F_statistic  p_value Significant
0         Aroma         1.27   0.2851          No
1        Flavor         2.19   0.0712          No
2    Aftertaste         1.82   0.1258          No
3       Acidity         0.76   0.5542          No
4          Body         0.95   0.4374          No
5       Balance         1.28   0.2782          No
6  TotCupPoints         1.44   0.2221          No
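One-way ANOVA assumes roughly normal groups with similar variance. A non-parametric cross-check is straightforward; this is only a sketch reusing the color_labels and features names defined in the cell above, not part of the original analysis:

# Non-parametric cross-check of the ANOVA results (Kruskal-Wallis makes no normality assumption)
for feature in features:
    groups = [df[df['Color'] == color][feature] for color in color_labels]
    h_stat, p_val = stats.kruskal(*groups)
    print(f"{feature}: H = {h_stat:.2f}, p = {p_val:.4f}")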

Top coffee companies

In [33]:
def format_company_report(df, top_n=3):
    """Generate a formatted report for top companies with improved styling"""
    from IPython.display import display, HTML
    
    # Companies with the biggest number of farms
    companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n)
    df_companies = companies.reset_index(name='Number of Farms')
    
    # Features we want to analyze
    features = ['Aroma', 'Flavor', 'Overall', 'TotCupPoints']
    
    # Create HTML tables for better visualization
    html_output = "<style>.company-table {border-collapse: collapse; width: 50%; margin-bottom: 20px;} "
    html_output += ".company-table th {background-color: #D4A76A; color: white; text-align: left; \
        padding: 8px; font-size: 14px} "
    html_output += ".company-table td {padding: 8px; border-bottom: 1px solid #ddd; font-size: 14px} "
    html_output += ".company-header {background-color: #8B4513; width: 55%; color: white;\
        font-weight: bold; padding: 10px; font-size: 18px;}</style>"
    
    # Loop through top companies
    for i, (company, farm_count) in enumerate(zip(df_companies['Company'], df_companies['Number of Farms'])):
        company_data = df[df['Company'] == company]
        
        # Company header
        html_output += f"<div class='company-header'>{i+1}. {company} ({farm_count} farms)</div>"
        
        # Create a table for this company's stats
        html_output += "<table class='company-table'>"
        
        # Top countries section
        top3_countries = company_data.groupby('Country')['FarmName'].nunique().sort_values(ascending=False).head(3)
        html_output += "<tr><th colspan='2'>Top 3 Countries by Farm Count</th></tr>"
        for country, count in top3_countries.items():
            html_output += f"<tr><td>{country}</td><td>{count} farms</td></tr>"
        
        # Feature averages section
        html_output += "<tr><th colspan='2'>Quality Metrics</th></tr>"
        for feature in features:
            feature_mean = company_data[feature].mean()
            html_output += f"<tr><td>{feature}</td><td>{feature_mean:.2f}</td></tr>"
        
        # Additional metrics
        html_output += "<tr><th colspan='2'>Other Metrics</th></tr>"
        html_output += f"<tr><td>Mean Altitude</td><td>{company_data['Altitude'].mean():.0f} m</td></tr>"
        html_output += f"<tr><td>Most Common Processing Method</td><td>{company_data['Processing Method'].value_counts().index[0]}</td></tr>"
        
        html_output += "</table>"
    
    return HTML(html_output)

format_company_report(df)
Out[33]:
1. Taiwan Coffee Laboratory (49 farms)
   Top 3 Countries by Farm Count: Taiwan (27 farms), United States (Hawaii) (5 farms), Guatemala (4 farms)
   Quality Metrics: Aroma 7.94, Flavor 7.92, Overall 7.86, TotCupPoints 84.82
   Other Metrics: Mean Altitude 1161 m; Most Common Processing Method: Washed / Wet

2. Taiwu Coffee Cooperative (25 farms)
   Top 3 Countries by Farm Count: Taiwan (25 farms)
   Quality Metrics: Aroma 7.63, Flavor 7.69, Overall 7.61, TotCupPoints 83.16
   Other Metrics: Mean Altitude 372 m; Most Common Processing Method: Pulped natural / honey

3. Coffee Quality Union (13 farms)
   Top 3 Countries by Farm Count: Colombia (6 farms), Thailand (2 farms), Brazil (1 farm)
   Quality Metrics: Aroma 7.78, Flavor 7.76, Overall 7.78, TotCupPoints 84.11
   Other Metrics: Mean Altitude 1593 m; Most Common Processing Method: Washed / Wet
In [34]:
def compare_top_companies(df, top_n=3):
    """Create visual comparisons between top coffee companies"""
    # Get top companies
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    
    # Create a filtered dataframe with only top companies
    top_df = df[df['Company'].isin(top_companies)]
    
    # Plot: Feature comparison between companies
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    features = ['Aroma', 'Flavor', 'Aftertaste', 'Balance']
    axes = axes.flatten()
    
    for i, feature in enumerate(features):
        sns.boxplot(x='Company', y=feature, data=top_df, ax=axes[i], palette='YlOrBr')
        axes[i].set_title(f'Distribution of {feature} Scores')
        axes[i].set_xlabel('')
        
        labels = axes[i].get_xticklabels()
        wrapped = [label.get_text().replace(" ", "\n") for label in labels]
        axes[i].set_xticklabels(wrapped)
        
        axes[i].set_ylabel(feature)
        axes[i].grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add suptitle to the figure
    fig.suptitle('Feature Comparison Between Top Coffee Companies', fontsize=18)
    fig.tight_layout()
    fig.subplots_adjust(top=0.9)
    
    return fig

fig = compare_top_companies(df)
plt.show()
In [35]:
def plot_cup_points_distribution(df, top_n=3):
    """Compare total cup points distributions among top companies"""
    # Get top companies
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    
    # Create filtered dataframe
    top_df = df[df['Company'].isin(top_companies)]
    
    # Create the plot
    fig, ax = plt.subplots(figsize=(12, 6))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    # Create violin plot with inner box plot
    sns.violinplot(
        x='Company', 
        y='TotCupPoints', 
        data=top_df,
        palette='YlOrBr',
        inner='box',
        ax=ax
    )
    
    # Add individual points with jitter
    sns.stripplot(
        x='Company', 
        y='TotCupPoints', 
        data=top_df,
        color='black',
        size=5,
        alpha=0.3,
        jitter=True,
        ax=ax
    )
    
    # Add mean markers
    for i, company in enumerate(top_companies):
        mean_val = top_df[top_df['Company'] == company]['TotCupPoints'].mean()
        ax.plot([i-0.2, i+0.2], [mean_val, mean_val], color='gray', linewidth=12)
        ax.text(i, mean_val-0.2, f"Mean: {mean_val:.2f}", ha='center', fontsize=10, fontweight='bold', color='white')
    
    # Customize the plot
    ax.set_title('Distribution of Total Cup Points by Company', fontsize=18)
    ax.set_ylabel('Total Cup Points')
    ax.set_xlabel('')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    
    # Add statistical significance test results
    from scipy import stats
    f_stat, p_val = stats.f_oneway(*[top_df[top_df['Company'] == company]['TotCupPoints'] for company in top_companies])
    significance = "significant" if p_val < 0.05 else "not significant"
    
    ax.annotate(
        f"ANOVA p-value: {p_val:.4f} (differences are {significance})",
        xy=(0.43, 0.79),
        xycoords='figure fraction',
        ha='center',
        fontsize=12
    )
    
    return fig

fig = plot_cup_points_distribution(df)
plt.show()
In [36]:
def create_company_correlation_matrices(df, top_n=3):
    """Create correlation matrices for top companies to compare their quality patterns"""
    # Get top companies
    top_companies = df.groupby('Company')['FarmName'].nunique().sort_values(ascending=False).head(top_n).index
    
    # Features to analyze
    features = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 'TotCupPoints']
    
    # Create subplots
    fig, axes = plt.subplots(top_n, 1, figsize=(8, 15))
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    # Use a shared color scale across subplots: compute each company's correlation
    # matrix first and take the global minimum as vmin
    company_corrs = {company: df[df['Company'] == company][features].corr()
                     for company in top_companies}
    vmin = min(corr.values.min() for corr in company_corrs.values())

    # Plot correlation matrix for each company
    for i, company in enumerate(top_companies):
        corr = company_corrs[company]

        # Create heatmap
        sns.heatmap(
            corr, 
            annot=True, 
            fmt=".2f", 
            cmap='YlOrBr', 
            vmin=vmin, 
            vmax=1,
            ax=axes[i],
            linewidths=0.5
        )
        
        axes[i].set_title(f"{company}", fontsize=14)
        axes[i].set_xticklabels(features, rotation=45)
    
    plt.tight_layout()
    plt.suptitle('Correlation Between Quality Metrics for Top Companies', fontsize=18, y=1.05)
    
    return fig

fig = create_company_correlation_matrices(df)
plt.show()

Top coffee processing methods

In [37]:
print(df.columns)
Index(['Country', 'FarmName', 'Company', 'Altitude', 'Region', 'Producer',
       'NumBags', 'Bag Weight', 'In-Country Partner', 'Harvest Year',
       'Grading Date', 'Owner', 'Variety', 'Status', 'Processing Method',
       'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance',
       'Uniformity', 'Clean Cup', 'Sweetness', 'Overall', 'Defects',
       'TotCupPoints', 'MoisturePerc', 'Category One Defects', 'Quakers',
       'Color', 'Cat2Defects', 'Expiration', 'CertBody'],
      dtype='object')
In [38]:
# Get top processing methods
process_methods = df['Processing Method'].value_counts()
top_methods = process_methods.head(3)
# Create a DataFrame for the top methods
process_methods_df = pd.DataFrame({
        'ProcessMethod': top_methods.index, 
        'Count': top_methods.values
    })

# TOP COFFEE PROCESSING METHODS VISUALIZATION

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
fig, axes = plt.subplots(2, 1, figsize=(8, 15))
#plt.figure(figsize=(6,6))

# ------------------Plot 1: Pie chart----------------------------------

# Add percentage and absolute count to labels
total = sum(top_methods.values)
labels = [f"{method}\n({count:,} beans, {count/total:.1%})" 
            for method, count in zip(top_methods.index, top_methods.values)]
    
pie_palette = sns.color_palette('YlOrBr',3)

wedges, texts = axes[0].pie(
    top_methods.values,
    colors=pie_palette,
    startangle=90,
    wedgeprops={'edgecolor': 'w', 'linewidth': 1.5}
    )
    
# Add a white circle at the center to create a donut chart
centre_circle = plt.Circle((0, 0), 0.5, fc='white')
axes[0].add_artist(centre_circle)
axes[0].legend(wedges, labels, loc="center left", bbox_to_anchor=(1, 0.5))
axes[0].set_title('Top 3 Coffee Processing Methods', fontsize=18)
    
axes[0].text(0, 0, f'Total: {total:,}', ha='center', va='center', fontsize=14)

    
# -------------------Plot 2: Horizontal bar chart with quality scores--------------------
    
# Create a new DataFrame with average quality scores for each processing method
quality_metrics = ['Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body', 'Balance', 
    'Overall', 'TotCupPoints']
    
# Calculate mean scores for each processing method
method_quality = df[df['Processing Method'].isin(top_methods.index)].groupby('Processing Method')[quality_metrics].mean()
    
# Sort by total cup points
method_quality = method_quality.sort_values('TotCupPoints', ascending=False)
    
# Plot horizontal bar chart for TotCupPoints
sns.barplot(
        x='TotCupPoints', 
        y=method_quality.index,
        data=method_quality.reset_index(),
        palette=pie_palette,
        ax=axes[1]
    )
    
# Add data labels
for i, v in enumerate(method_quality['TotCupPoints']):
    axes[1].text(v + 0.01, i, f"{v:.2f}", va='center')
    
axes[1].set_title('Average Quality Score by Processing Method', fontsize=18)
axes[1].set_xlabel('Total Cup Points')
axes[1].set_ylabel('Processing Method')
    
# Set x-axis to start at a reasonable value to better show differences
min_score = method_quality['TotCupPoints'].min() - 0.5
max_score = method_quality['TotCupPoints'].max() + 0.1
axes[1].set_xlim(min_score, max_score)
    
# Add annotations explaining processing methods
method_descriptions = {
        "Washed / Wet": "Beans have the fruit removed before drying, resulting in clean, bright flavors",
        "Natural / Dry": "Beans are dried with the fruit intact, creating fruity, complex flavors",
        "Pulped natural / honey": "Partial removal of fruit before drying, balancing sweetness and acidity",
        "Semi-washed / Semi-pulped": "Hybrid method with some mucilage left on during drying"
    }
    
# Add annotations for available methods
plt.figtext(0.5, 0.02, "\n".join([f"{k}: {v}" for k, v in method_descriptions.items() 
                                     if k in top_methods.index]), 
               ha='center', fontsize=14, bbox=dict(facecolor='white', alpha=0.5))
    
#plt.tight_layout()
plt.subplots_adjust(bottom=0.15)  # Make room for annotations

Analyzing trends and coffee age

In [39]:
# Adding a new useful feature: Coffee Age in days

# Keep only the first year when "Harvest Year" is a range (e.g. "2021 / 2022")
df['Harvest Year'] = df['Harvest Year'].str.split('/').str[0].str.strip()

# Convert "Harvest Year" and "Expiration" columns to datetime objects using dateutil parser
df['Harvest Year'] = pd.to_datetime(df['Harvest Year'], format='%Y')
df['Expiration'] = df['Expiration'].apply(parser.parse)

# Calculate the coffee age, i.e., the difference in days between "Expiration" and "Harvest Year" columns
df['Coffee Age'] = (df['Expiration'] - df['Harvest Year']).dt.days

# Check for and handle outliers in Coffee Age
q1 = df['Coffee Age'].quantile(0.25)
q3 = df['Coffee Age'].quantile(0.75)
iqr = q3 - q1
upper_bound = q3 + 1.5 * iqr
lower_bound = q1 - 1.5 * iqr
    
# Create a filtered version without extreme outliers
df_filtered = df[(df['Coffee Age'] >= lower_bound) & 
                             (df['Coffee Age'] <= upper_bound)].copy()
    
outliers_count = len(df) - len(df_filtered)
if outliers_count > 0:
    print(f"Found {outliers_count} outliers in Coffee Age ({outliers_count/len(df):.1%})")
    
# Select features for analysis
coffee_features = ['Altitude', 'Aroma', 'Flavor', 'Aftertaste', 'Acidity', 'Body',
                   'Balance', 'Coffee Age', 'TotCupPoints']
Found 2 outliers in Coffee Age (1.0%)
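Since only a harvest year is available, "Coffee Age" is anchored to January 1st of that year and is therefore an approximation rather than an exact age. A minimal illustration on made-up values:

# Illustration of the age calculation on made-up values (the harvest year resolves to January 1st)
harvest = pd.to_datetime('2022', format='%Y')              # Timestamp('2022-01-01')
expiry = pd.Timestamp(parser.parse('September 21st, 2023'))
print((expiry - harvest).days)                             # 628 days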
In [40]:
# Coffee Age Distribution

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
plt.figure(figsize=(12, 6))
    
# Create two subplots side by side
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)
    
# Plot histogram with KDE on first subplot
sns.histplot(df['Coffee Age'].dropna(), kde=True, color='chocolate', 
                 edgecolor='#3D1C02', alpha=0.7, ax=ax1)
ax1.set_title('Coffee Age Distribution (All Data)', fontsize=18)
ax1.set_xlabel('Age (days)')
ax1.set_ylabel('Frequency')
    
# Add mean and median lines
mean_age = df['Coffee Age'].mean()
median_age = df['Coffee Age'].median()
ax1.axvline(mean_age, color='#3D1C02', linestyle='--', 
                label=f'Mean: {mean_age:.0f} days')
ax1.axvline(median_age, color='#8B4513', linestyle='-', 
                label=f'Median: {median_age:.0f} days')
ax1.legend()
    
# Plot boxplot on second subplot
sns.boxplot(y=df['Coffee Age'].dropna(), color='chocolate', ax=ax2)
ax2.set_title('Coffee Age Boxplot', fontsize=18)
ax2.set_ylabel('Age (days)')
    
# Add annotation about outliers
if outliers_count > 0:
    ax2.text(0.8, 0.95, f"{outliers_count} outliers detected", 
                 transform=ax2.transAxes, ha='center', 
                 bbox=dict(facecolor='white', alpha=0.8))
    
plt.tight_layout()
In [41]:
# Coffee Age vs. Quality Score - Filtered Data (without outliers)
    
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
plt.figure(figsize=(10, 6))
    
# Scatter plot with regression line
sns.regplot(x='Coffee Age', y='TotCupPoints', data=df_filtered, 
            scatter_kws={'alpha': 0.6, 'color': 'chocolate', 'edgecolor': '#3D1C02'}, 
            line_kws={'color': '#3D1C02'})
    
plt.title('Coffee Age vs. Overall Quality Score', fontsize=18)
plt.xlabel('Coffee Age (days)')
plt.ylabel('Total Cup Points')
plt.grid(True, linestyle='--', alpha=0.7)
    
# Calculate and display correlation
corr = df_filtered[['Coffee Age', 'TotCupPoints']].corr().iloc[0, 1]
plt.annotate(f"Correlation: {corr:.2f}", 
            xy=(0.05, 0.95), xycoords='axes fraction',
            bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    
# Add explanation based on correlation value
if abs(corr) < 0.1:
    explanation = "Coffee age appears to have minimal impact on quality"
elif corr > 0:
    explanation = "Longer aging tends to slightly improve quality"
else:
    explanation = "Quality tends to slightly decrease with age"
        
plt.annotate(explanation, xy=(0.05, 0.89), xycoords='axes fraction',
                 bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8))
    
plt.tight_layout()
In [42]:
# Pairplot - Filtered Data (without outliers)
    
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# (sns.pairplot creates its own figure, so no separate plt.figure() call is needed)
    
# Create pairplot
g = sns.pairplot(
        df_filtered[coffee_features],
        kind='reg',
        diag_kind='kde',
        plot_kws={
            'line_kws': {
                'color': '#3D1C02',
                'lw': 2
            },
            'scatter_kws': {
                's': 15, 
                'alpha': 0.6, 
                'color': 'chocolate', 
                'edgecolor': '#3D1C02', 
                'linewidth': 0.3
            }
        },
        diag_kws={
            'color': 'chocolate',
            'edgecolor': '#3D1C02',
            'alpha': 0.7,
            'linewidth': 1
        }
    )
    
g.fig.suptitle('Relationships Between Coffee Features', y=1.02, fontsize=20)
    
# Add gridlines to each subplot
for ax in g.axes.flat:
    ax.grid(True, linestyle='--', alpha=0.6, color='lightgray')
        
    # Improve tick label readability
    ax.tick_params(axis='both', which='major', labelsize=8)
        
    # Rotate x-axis labels if they exist
    if ax.get_xlabel():
        ax.set_xlabel(ax.get_xlabel(), fontsize=9)
    if ax.get_ylabel():
        ax.set_ylabel(ax.get_ylabel(), fontsize=9)
    
g.fig.tight_layout()
In [43]:
# Correlation heatmap

plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
plt.figure(figsize=(12, 10))
    
# Calculate correlation matrix
corr_matrix = df_filtered[coffee_features].corr()
    
# Create a mask for the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
    
# Create the heatmap
hmt = sns.heatmap(
        round(corr_matrix, 2),
        annot=True,
        fmt=".2f",
        cmap='YlOrBr',
        mask=mask,
        cbar_kws={'label': 'Correlation Coefficient'},
        linewidths=1,
        linecolor='white',
        square=True
    )
    
plt.yticks(rotation=0, ha='right')
plt.xticks(rotation=45, ha='right')
plt.title('Correlation Between Coffee Features', fontsize=18, pad=20)
    
plt.tight_layout()
plt.subplots_adjust(bottom=0.25)
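A compact way to read the heatmap is to rank the features by the strength of their correlation with the overall score; a small sketch reusing corr_matrix from the cell above:

# Rank features by the absolute strength of their correlation with TotCupPoints
corr_with_score = corr_matrix['TotCupPoints'].drop('TotCupPoints').abs().sort_values(ascending=False)
print(corr_with_score)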
In [44]:
# Coffee Age by Processing Method

plt.figure(figsize=(12, 6))
        
# Get processing methods
p_methods = df['Processing Method'].value_counts().index
method_data = df[df['Processing Method'].isin(p_methods)]
        
# Create boxplot
sns.boxplot(x='Processing Method', y='Coffee Age', data=method_data, 
                   palette=sns.color_palette("YlOrBr", n_colors=len(p_methods)))
        
plt.title('Coffee Age by Processing Method', fontsize=18)
plt.xlabel('Processing Method')
plt.ylabel('Coffee Age (days)')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', linestyle='--', alpha=0.7)
        
# Calculate and display average age by method
avg_age_by_method = method_data.groupby('Processing Method')['Coffee Age'].mean().sort_values()
annotation = "\nAverage age (days):\n\n"
for method, avg in avg_age_by_method.items():
    annotation += f"• {method}: {avg:.0f}\n"
        
plt.annotate(annotation, xy=(0.75, 0.97), xycoords='axes fraction', va='top',
                    bbox=dict(boxstyle="round,pad=0.5", fc="white", ec="gray", alpha=0.8),
                    fontsize=12)
        
plt.tight_layout()
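The boxplot and the average ages suggest the methods differ, but this could be checked formally; a sketch, not part of the original analysis, reusing method_data and the scipy.stats import from earlier cells:

# Optional follow-up: test whether Coffee Age differs significantly across processing methods
age_groups = [grp['Coffee Age'].dropna() for _, grp in method_data.groupby('Processing Method')]
f_stat, p_val = stats.f_oneway(*age_groups)
print(f"ANOVA on Coffee Age by processing method: F = {f_stat:.2f}, p = {p_val:.4f}")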