In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
In [2]:
# Load CSV file
data = pd.read_csv("vgsales.csv")

Info about the data and cleansing

In [3]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df):
    """
    Handle duplicates (dropping any found) and report the percentage of null values in each column.
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates==0:
        print(f"\033[1mThere are no duplicates in the DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicate rows in the DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nPercentage of null values in each column:\033[0m \n")
    return round((df.isnull().sum()/rows)*100,2)

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outlier detection: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 1)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Print a summary of the cleaned dataset (shape and unique value counts),
    dropping any column that holds only a single unique value.
    """
    print(f"\033[1mCleaning of the DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        if df[column].nunique()==1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [4]:
df_info(data)
First 5 rows of the data:

Rank Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
0 1 Wii Sports Wii 2006.0 Sports Nintendo 41.49 29.02 3.77 8.46 82.74
1 2 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24
2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.85 12.88 3.79 3.31 35.82
3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.75 11.01 3.28 2.96 33.00
4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB
None
Description of the data:

count mean std min 25% 50% 75% max
Rank 16598.0 8300.605254 4791.853933 1.00 4151.25 8300.50 12449.75 16600.00
Year 16327.0 2006.406443 5.828981 1980.00 2003.00 2007.00 2010.00 2020.00
NA_Sales 16598.0 0.264667 0.816683 0.00 0.00 0.08 0.24 41.49
EU_Sales 16598.0 0.146652 0.505351 0.00 0.00 0.02 0.11 29.02
JP_Sales 16598.0 0.077782 0.309291 0.00 0.00 0.00 0.04 10.22
Other_Sales 16598.0 0.048063 0.188588 0.00 0.00 0.01 0.04 10.57
Global_Sales 16598.0 0.537441 1.555028 0.01 0.06 0.17 0.47 82.74
In [5]:
df_dupl_null(data)
There are no duplicates in the DataFrame.
 
Percentage of null values in each column: 

Out[5]:
Rank            0.00
Name            0.00
Platform        0.00
Year            1.63
Genre           0.00
Publisher       0.35
NA_Sales        0.00
EU_Sales        0.00
JP_Sales        0.00
Other_Sales     0.00
Global_Sales    0.00
dtype: float64
In [6]:
# Change the Year column type to integer and fill the nulls with zeros
data['Year'] = data['Year'].fillna(0).astype(int)

# Fill the nulls in Publisher column with 'Unknown'
data['Publisher'] = data['Publisher'].fillna('Unknown')
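
As a quick sanity check, the minimal sketch below (assuming Year and Publisher were the only columns with missing values, as the percentages above indicate) confirms that no nulls remain:

In [ ]:
# Verify that no missing values remain after the fills above
assert data.isnull().sum().sum() == 0, "There are still missing values in the DataFrame"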
In [7]:
df_outliers(data)
 Outlier detection:  
Out[7]:
column_name num_of_outliers pctg_of_outliers
0 Rank 0 0.0
1 Year 576 3.5
2 NA_Sales 1681 10.1
3 EU_Sales 2081 12.5
4 JP_Sales 2425 14.6
5 Other_Sales 1665 10.0
6 Global_Sales 1893 11.4
In [8]:
df_summary(data)
Cleaning of the DataFrame completed!
16598 rows and 11 columns remaining.
- Rank: 16598 unique values
- Name: 11493 unique values
- Platform: 31 unique values
- Year: 40 unique values
- Genre: 12 unique values
- Publisher: 578 unique values
- NA_Sales: 409 unique values
- EU_Sales: 305 unique values
- JP_Sales: 244 unique values
- Other_Sales: 157 unique values
- Global_Sales: 623 unique values

EDA

Descriptive analysis

In [9]:
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [10]:
# The most frequent game names, i.e., games released on many different platforms
top_names = pd.DataFrame(data['Name'].value_counts()).reset_index().rename(columns={'index': 'name', 'Name': 'count'})
# Adding a rank column to find top 3 names grouped by rank
top_names['rank'] = top_names['count'].rank(method='dense', 
                                            ascending=False)
# Selecting games with rank up to 3
top3_names = top_names[top_names['rank'] <= 3].sort_values(['rank', 'name'],
                                                                   ignore_index=True)
# Group by rank and aggregate the names
rank_groups = top3_names.groupby('rank').agg({
    'name': lambda x: ', '.join(sorted(x)),
    'count': 'first'  # All games at the same rank share the same count
}).reset_index()

print("🎮 GAME RELEASES FREQUENCY RANKINGS 🎮")
print("=" * 40)

for rank in sorted(top3_names['rank'].unique()):
    games_at_rank = top3_names[top3_names['rank'] == rank]
    count = games_at_rank['count'].iloc[0]
    game_names = games_at_rank['name'].tolist()
    
    print(f"\nRank {int(rank)} ({count} releases each):")
    for game in game_names:
        print(f"  • {game}")
🎮 GAME RELEASES FREQUENCY RANKINGS 🎮
========================================

Rank 1 (12 releases each):
  • Need for Speed: Most Wanted

Rank 2 (9 releases each):
  • FIFA 14
  • LEGO Marvel Super Heroes
  • Madden NFL 07
  • Ratatouille

Rank 3 (8 releases each):
  • Angry Birds Star Wars
  • Cars
  • FIFA 15
  • FIFA Soccer 13
  • LEGO Harry Potter: Years 5-7
  • LEGO Jurassic World
  • LEGO Star Wars II: The Original Trilogy
  • LEGO The Hobbit
  • Lego Batman 3: Beyond Gotham
  • Madden NFL 08
  • Monopoly
  • Terraria
  • The LEGO Movie Videogame
In [11]:
def create_hbarplot(data, x_col, y_col, title, x_label, y_label, xlim=None, 
                    figsize=(12,8), bar_labels=False, xscale='linear', lscale=1, order=None):
    """
    Create a horizontal bar plot with customized styling.
    
    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis values
        y_col: Column name for y-axis categories
        title: Plot title
        x_label: Label for the x-axis
        y_label: Label for the y-axis
        xlim: Optional tuple for x-axis limits (min, max)
        figsize: Optional tuple for figure size
        bar_labels: Optional, add value labels to the bars
        xscale: Optional, controls the bar-label placement depending on the x-axis scale
        lscale: Optional, a multiplier controlling the bar-label placement
        order: Optional. seaborn's barplot automatically sorts categorical data on the y-axis,
            so to keep the desired ordering, pass a list with the proper order
    """
    
    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    # Create the bar plot
    bars = sns.barplot(
        data=data, 
        x=x_col, 
        y=y_col,
        palette='plasma',
        edgecolor='gray',
        ax=ax,
        orient='h',
        order=order
    )
    
    # Customize the plot
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title, fontsize=18)
    ax.grid(axis='x', linestyle='--', alpha=0.9)
    plt.xscale(xscale)
    
    if xlim:
        ax.set_xlim(xlim)
    else:
        ax.set_xlim(0, max(data[x_col]) * 1.1)
        
    # Add labels to the bars
    if bar_labels:
        for index, row in data.iterrows():
            value = row[x_col]  # use the plotted column rather than hard-coding 'count'
            if xscale=='linear':
                if value<=1000:
                    posx = value+150*lscale
                else:
                    posx = value+200*lscale
            elif xscale=='log':
                if value<11:
                    posx = value*(1+0.2)
                elif 10<value<101:
                    posx = value*(1+0.35)
                elif 100<value<1001:
                    posx = value*(1+0.45)
                else:
                    posx = value*(1+0.55)
            ax.text(posx,
                    index,
                    str(round(value,2)),
                    va='center',
                    color='black',
                    ha='right')
        
    plt.tight_layout()
    
    return fig, ax
In [12]:
# Game platforms frequency, i.e., how many games were released on each platform
platforms = pd.DataFrame(data['Platform'].value_counts()).reset_index().rename(columns={'index': 'platform', 
                                                                                        'Platform': 'count'})

create_hbarplot(data=platforms, x_col='count', y_col='platform', 
            title='Distribution of the Gaming Platform Frequency',
            x_label='Frequency (Game Releases)', y_label='Gaming Platform',
            xlim=[0.6, max(platforms['count']) * 1.6],
            figsize=(15,10),
            bar_labels=True,
               xscale='log')
Out[12]:
(<Figure size 1500x1000 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Gaming Platform Frequency'}, xlabel='Frequency (Game Releases)', ylabel='Gaming Platform'>)
In [13]:
# Years frequency, i.e., how many games were released in each year
years = pd.DataFrame(data['Year'].value_counts()).reset_index().rename(columns={'index': 'year', 
                                                        'Year': 'count'}).sort_values('year').reset_index()
# Drop the row with unknown year (filled with 0 earlier)
years = years[years['year'] != 0].reset_index(drop=True)

create_hbarplot(data=years, x_col='count', y_col='year', 
            title='Distribution of the Games Releases over Years',
            x_label='Frequency (Game Releases)', y_label='Year',
            xlim=[0.6, max(years['count']) * 1.8],
            figsize=(10,12),
            bar_labels=True,
               xscale='log')
Out[13]:
(<Figure size 1000x1200 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases over Years'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
In [14]:
# Genre frequency, i.e., what game types are most popular
genres = pd.DataFrame(data['Genre'].value_counts()).reset_index().rename(columns={'index': 'genre', 
                                                        'Genre': 'count'})

create_hbarplot(data=genres, x_col='count', y_col='genre', 
            title='Distribution of the Games Releases by Genre',
            x_label='Frequency (Game Releases)', y_label='Genre',
            xlim=[150, max(genres['count']) * 1.1],
            figsize=(12,8),
            bar_labels=True,
               xscale='linear')
Out[14]:
(<Figure size 1200x800 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases by Genre'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
In [15]:
# Publisher frequency, i.e., which company released most games
publishers = pd.DataFrame(data['Publisher'].value_counts()).reset_index().rename(columns={'index': 'publisher', 
                                                        'Publisher': 'count'}).head(10)

create_hbarplot(data=publishers, x_col='count', y_col='publisher', 
            title='Distribution of the Games Releases by Top 10 Publishers',
            x_label='Frequency (Game Releases)', y_label='Publisher',
            xlim=[300, max(publishers['count']) * 1.1],
            figsize=(12,6),
            bar_labels=True,
            xscale='linear',
            lscale=0.4)
Out[15]:
(<Figure size 1200x600 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases by Top 10 Publishers'}, xlabel='Frequency (Game Releases)', ylabel='Publisher'>)
In [16]:
# Sales statistics in a given region
sales_regions = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

for region in sales_regions:
    sales = pd.DataFrame(data[region])
    
    display(sales.describe().T)
count mean std min 25% 50% 75% max
NA_Sales 16598.0 0.264667 0.816683 0.0 0.0 0.08 0.24 41.49
count mean std min 25% 50% 75% max
EU_Sales 16598.0 0.146652 0.505351 0.0 0.0 0.02 0.11 29.02
count mean std min 25% 50% 75% max
JP_Sales 16598.0 0.077782 0.309291 0.0 0.0 0.0 0.04 10.22
count mean std min 25% 50% 75% max
Other_Sales 16598.0 0.048063 0.188588 0.0 0.0 0.01 0.04 10.57
count mean std min 25% 50% 75% max
Global_Sales 16598.0 0.537441 1.555028 0.01 0.06 0.17 0.47 82.74
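
For reference, the same per-region summary can be produced in a single call by selecting all the sales columns at once; a minimal equivalent sketch:

In [ ]:
# One combined summary table instead of one display() call per region
data[sales_regions].describe().T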

Exploratory analysis

In [17]:
# The best selling games by world region
region_names = {'NA_Sales': 'North America', 'EU_Sales': 'Europe', 'JP_Sales': 'Japan',
                'Other_Sales': 'Other Regions', 'Global_Sales': 'Globally'}
for region in sales_regions:
    # Top 5 games
    region_data = data[['Name','Year',region]].sort_values(region, ascending=False).reset_index(drop=True).head(5)
    plot_title = 'The Best Selling Games: ' + region_names[region]
    
    fig, ax = create_hbarplot(data=region_data, x_col=region, y_col='Name', 
            title=plot_title,
            x_label='Sales (in millions)', y_label='Game (Year of Release)',
            xlim=[2, max(region_data[region]) * 1.05],
            figsize=(8,4))

    yticks = list()
    for index, row in region_data.iterrows():
        yticks += [str(row['Name']) + ' (' + str(row['Year']) + ')']
        ax.text(row[region],
                    index,
                    str(round(row[region],2)),
                    va='center',
                    color='white',
                    ha='right',
                    weight='bold')
    
    ax.set_yticks(ticks=list(region_data.index), labels=yticks)
In [18]:
# The best selling features like: platform, genre, year, publisher (globally)
features = ['Platform', 'Genre', 'Year', 'Publisher']
g = 'Global_Sales' # for easier retrieval

for f in features:
    f_data = data[[f, g]].groupby(f).sum().sort_values(g, ascending=False).reset_index().head(5)
    plot_title = 'Top sellers: ' + f
    
    fig, ax = create_hbarplot(data=f_data, x_col=g, y_col=f, 
            title=plot_title,
            x_label='Sales (in millions)', y_label=f,
            xlim=[min(f_data[g])*0.3, max(f_data[g]) * 1.05],
            figsize=(8,4),
            order=f_data[f].tolist())

    for index, row in f_data.iterrows():
        ax.text(row[g],
                index,
                str(round(row[g],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')

Analysis of the last 5 years in Europe: what type of game is worth creating?

In [19]:
# Games released within 5 years of the most recent year in the data
recent_data = data[data['Year'] >= (max(data['Year']) - 5)]
In [20]:
# The best selling "features" of the last 5 years
EUfeatures = ['Name', 'Platform', 'Genre', 'Publisher']
# Dictionary to store the relevant data for each top 5 features
EUdata = {}
for f in EUfeatures:
    EUdata[f] = recent_data[[f,
            'EU_Sales']].groupby(f).sum().sort_values('EU_Sales',ascending=False).reset_index().head(5)
    
    fig, ax = create_hbarplot(data=EUdata[f], x_col='EU_Sales', y_col=f, 
            title='The Best Selling ' + f +'s',
            x_label='Sales (in millions)', y_label=f,
            xlim=[0, max(EUdata[f]['EU_Sales']) * 1.05],
            figsize=(10,4))
    
    for index, row in EUdata[f].iterrows():
        ax.text(row['EU_Sales']-1,
                index,
                str(round(row['EU_Sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
In [21]:
# Time trends of the features: how long a game has been out vs. its sales
now = 2020
# Delete the rows with no year information
filtered_data = data[data['Year'] != 0].copy()
# New column - game age
filtered_data['game_age'] = now - filtered_data['Year']

# Distribution of the game ages
plt.figure(figsize=(12,4))
sns.histplot(filtered_data['game_age'], color='purple', binwidth=1, alpha=0.5)
plt.xlabel('Game Age')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.title('Distribution of the Game Ages')
Out[21]:
Text(0.5, 1.0, 'Distribution of the Game Ages')
In [22]:
# New column: average EU sales per year since release (EU_Sales divided by game_age)
filtered_data['EU_mean_sales'] = filtered_data['EU_Sales'] / filtered_data['game_age']

meandata = {}
for f in EUfeatures:
    meandata[f] = filtered_data[[f, 
        'game_age', 'EU_mean_sales']].groupby(f).sum().sort_values('EU_mean_sales',ascending=False).reset_index().head(5)
    
    fig, ax = create_hbarplot(data=meandata[f], x_col='EU_mean_sales', y_col=f, 
            title='The Best Selling ' + f +'s',
            x_label='Mean Sales (in millions)', y_label=f,
            xlim=[0, max(meandata[f]['EU_mean_sales']) * 1.05],
            figsize=(10,4))
    
    for index, row in meandata[f].iterrows():
        ax.text(row['EU_mean_sales'],
                index,
                str(round(row['EU_mean_sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
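
As a worked check of the EU_mean_sales definition, the sketch below looks up Grand Theft Auto V on PS3: its 9.27 million EU sales over an age of 7 years give roughly 1.32 million per year, matching the value shown for it in the recent-hits table below.

In [ ]:
# Worked check of EU_mean_sales for a single title:
# EU_Sales 9.27 / game_age 7 ≈ 1.32 million copies per year
gta_ps3 = filtered_data[(filtered_data['Name'] == 'Grand Theft Auto V') &
                        (filtered_data['Platform'] == 'PS3')]
gta_ps3[['Name', 'Platform', 'EU_Sales', 'game_age', 'EU_mean_sales']]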
In [23]:
meanyears = filtered_data[['EU_mean_sales', 'game_age']].groupby('game_age').mean().reset_index()

plt.figure(figsize=(10,4))
sns.lineplot(data=meanyears, x='game_age', y='EU_mean_sales', color='purple', marker='o', ci=None)
plt.xlabel("Game's Age")
plt.ylabel('Mean Sales in Europe (in millions)')
plt.title("Distribution of Mean Sales by the Game's Age", fontsize=18, weight='bold')
plt.grid(axis='both', linestyle='--', alpha=0.6)

print('There are two groups that sell well: recent hits (age < 10) and old classics (age > 25).')
There are two groups that sell well: recent hits (age < 10) and old classics (age > 25).
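
A minimal sketch that quantifies this observation, banding games by age and comparing mean per-year EU sales across the bands:

In [ ]:
# Mean per-year EU sales per age band (bands chosen to bracket the two groups above)
age_bands = pd.cut(filtered_data['game_age'], bins=[-1, 10, 24, 40],
                   labels=['recent hits (<=10)', 'in between (11-24)', 'old classics (>=25)'])
filtered_data.groupby(age_bands)['EU_mean_sales'].mean()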
In [24]:
# Identifying the recent hits
recent_hits = filtered_data[filtered_data['game_age'] <=10 ][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(recent_hits)
Name Platform Year Genre Publisher game_age EU_mean_sales
16 Grand Theft Auto V PS3 2013 Action Take-Two Interactive 7 1.324286
77 FIFA 16 PS4 2015 Sports Electronic Arts 5 1.212000
33 Call of Duty: Black Ops 3 PS4 2015 Shooter Activision 5 1.162000
44 Grand Theft Auto V PS4 2014 Action Take-Two Interactive 6 0.968333
221 FIFA 17 PS4 2016 Sports Electronic Arts 4 0.937500
23 Grand Theft Auto V X360 2013 Action Take-Two Interactive 7 0.758571
34 Call of Duty: Black Ops II PS3 2012 Shooter Activision 8 0.735000
124 FIFA 15 PS4 2014 Sports Electronic Arts 6 0.715000
92 Star Wars Battlefront (2015) PS4 2015 Shooter Electronic Arts 5 0.658000
37 Call of Duty: Modern Warfare 3 PS3 2011 Shooter Activision 9 0.646667
In [25]:
# Identifying the old classics
old_classics = filtered_data[filtered_data['game_age'] >= 25 ][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(old_classics)
Name Platform Year Genre Publisher game_age EU_mean_sales
18 Super Mario World SNES 1990 Platform Nintendo 30 0.125000
22 Super Mario Bros. 3 NES 1988 Platform Nintendo 32 0.107500
521 Myst PC 1994 Adventure Red Orb 26 0.107308
1 Super Mario Bros. NES 1985 Platform Nintendo 35 0.102286
267 Warcraft II: Tides of Darkness PC 1995 Strategy Activision 25 0.090800
21 Super Mario Land GB 1989 Platform Nintendo 31 0.087419
57 Super Mario All-Stars SNES 1993 Platform Nintendo 27 0.079630
5 Tetris GB 1989 Puzzle Nintendo 31 0.072903
50 Super Mario Land 2: 6 Golden Coins GB 1992 Adventure Nintendo 28 0.072857
71 Donkey Kong Country SNES 1994 Platform Nintendo 26 0.065769
In [26]:
# Middle age games
middle_age = filtered_data[filtered_data['game_age'].isin(list(range(16,25)))][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(middle_age)
Name Platform Year Genre Publisher game_age EU_mean_sales
137 World of Warcraft PC 2004 Role-Playing Activision 16 0.388125
4 Pokemon Red/Pokemon Blue GB 1996 Role-Playing Nintendo 24 0.370417
24 Grand Theft Auto: Vice City PS2 2002 Action Take-Two Interactive 18 0.305000
12 Pokemon Gold/Pokemon Silver GB 1999 Role-Playing Nintendo 21 0.294286
28 Gran Turismo 3: A-Spec PS2 2001 Racing Sony Computer Entertainment 19 0.267895
38 Grand Theft Auto III PS2 2001 Action Take-Two Interactive 19 0.237368
30 Pokémon Yellow: Special Pikachu Edition GB 1998 Role-Playing Nintendo 22 0.229091
25 Pokemon Ruby/Pokemon Sapphire GBA 2002 Role-Playing Nintendo 18 0.216667
59 Super Mario 64 DS 2004 Platform Nintendo 16 0.194375
113 Need for Speed Underground 2 PS2 2004 Racing Electronic Arts 16 0.188750
In [27]:
print("To decide what game one should produce, based on recent years, let's take one more look at the recent_hits DataFrame:")
display(recent_hits)
print('The favorable features are:\n')
print('Platform: PS4 or PS3\n')
print('Genre: Shooter, Action or Sports\n')
print('Publisher: Electronic Arts, Take-Two Interactive or Activision')
To decide what game one should produce, based on recent years, let's take one more look at the recent_hits DataFrame:
Name Platform Year Genre Publisher game_age EU_mean_sales
16 Grand Theft Auto V PS3 2013 Action Take-Two Interactive 7 1.324286
77 FIFA 16 PS4 2015 Sports Electronic Arts 5 1.212000
33 Call of Duty: Black Ops 3 PS4 2015 Shooter Activision 5 1.162000
44 Grand Theft Auto V PS4 2014 Action Take-Two Interactive 6 0.968333
221 FIFA 17 PS4 2016 Sports Electronic Arts 4 0.937500
23 Grand Theft Auto V X360 2013 Action Take-Two Interactive 7 0.758571
34 Call of Duty: Black Ops II PS3 2012 Shooter Activision 8 0.735000
124 FIFA 15 PS4 2014 Sports Electronic Arts 6 0.715000
92 Star Wars Battlefront (2015) PS4 2015 Shooter Electronic Arts 5 0.658000
37 Call of Duty: Modern Warfare 3 PS3 2011 Shooter Activision 9 0.646667
The favorable features are:

Platform: PS4 or PS3

Genre: Shooter, Action or Sports

Publisher: Electronic Arts, Take-Two Interactive or Activision
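
As a rough cross-check of this recommendation, a minimal sketch that summarizes per-year EU sales for recent games combining the suggested platforms, genres and publishers:

In [ ]:
# Per-year EU sales of recent games matching the suggested feature combination
fav = filtered_data[(filtered_data['game_age'] <= 10) &
                    (filtered_data['Platform'].isin(['PS4', 'PS3'])) &
                    (filtered_data['Genre'].isin(['Shooter', 'Action', 'Sports'])) &
                    (filtered_data['Publisher'].isin(['Electronic Arts',
                                                      'Take-Two Interactive', 'Activision']))]
fav['EU_mean_sales'].describe()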

Other Questions

In [98]:
# Which platform had the most titles released per year?

# Platforms with total game releases, active years (with start and end year) and total sales
platform_stats = data[data['Year'] != 0].groupby('Platform').agg({
    'Year': ['count', 'nunique', 'min', 'max'], 'Global_Sales': 'sum'
}).reset_index()
# Flatten column names
platform_stats.columns = ['Platform', 'total_games', 'active_years', 'first_year', 'last_year', 'total_sales']
# Calculate average games per year
platform_stats['avg_games_per_year'] = round(platform_stats['total_games'] / platform_stats['active_years'], 2)
platform_stats.sort_values('avg_games_per_year', ascending=False, inplace=True)
platform_stats.reset_index(drop=True, inplace=True)
platform_stats
Out[98]:
Platform total_games active_years first_year last_year total_sales avg_games_per_year
0 PS2 2127 12 2000 2011 1233.46 177.25
1 DS 2133 13 1985 2020 818.96 164.08
2 Wii 1290 10 2006 2015 909.81 129.00
3 PS 1189 10 1994 2003 727.39 118.90
4 PS3 1304 11 2006 2016 949.35 118.55
5 X360 1235 12 2005 2016 969.61 102.92
6 GBA 811 8 2000 2007 313.56 101.38
7 PSP 1197 12 2004 2015 291.71 99.75
8 XB 803 9 2000 2008 252.09 89.22
9 3DS 500 6 2011 2016 246.28 83.33
10 GC 542 7 2001 2007 197.14 77.43
11 PS4 336 5 2013 2017 278.10 67.20
12 PSV 412 7 2011 2017 61.63 58.86
13 XOne 213 4 2013 2016 141.06 53.25
14 N64 316 7 1996 2002 218.21 45.14
15 PC 943 26 1985 2016 255.05 36.27
16 SAT 173 6 1994 1999 33.59 28.83
17 WiiU 143 5 2012 2016 81.86 28.60
18 SNES 239 10 1990 1999 200.05 23.90
19 2600 116 10 1980 1989 86.57 11.60
20 NES 98 12 1983 1994 251.07 8.17
21 GB 97 13 1988 2001 254.42 7.46
22 DC 52 7 1998 2008 15.97 7.43
23 GEN 27 5 1990 1994 28.36 5.40
24 SCD 6 2 1993 1994 1.87 3.00
25 NG 12 4 1993 1996 1.44 3.00
26 WS 6 3 1999 2001 1.42 2.00
27 TG16 2 1 1995 1995 0.16 2.00
28 3DO 3 2 1994 1995 0.10 1.50
29 GG 1 1 1992 1992 0.04 1.00
30 PCFX 1 1 1996 1996 0.03 1.00
In [95]:
# Top 10 platforms by game releases per year
fig, ax = create_hbarplot(data=platform_stats.head(10), x_col='avg_games_per_year', y_col='Platform', 
            title='Average Game Releases of Platforms per Year (Top 10)',
            x_label='Game Releases per Year', y_label='Platform',
            xlim=[0, max(platform_stats.head(10)['avg_games_per_year']) * 1.05],
            figsize=(10,6))
    
for index, row in platform_stats.head(10).iterrows():
    ax.text(row['avg_games_per_year']-1,
                index,
                str(round(row['avg_games_per_year'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
In [133]:
# Do platforms with more game releases always have higher total sales?
plt.figure(figsize=(6,5))
sns.regplot(x=platform_stats['avg_games_per_year'], y=platform_stats['total_sales'], color='purple')
plt.title('Platform Sales vs Game Release Rate', fontsize=18, weight='bold')
plt.xlabel('Average Games Released per Year')
plt.ylabel('Total Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Calculate correlation coefficient and statistical significance
from scipy.stats import pearsonr
corr_coef, p_value = pearsonr(platform_stats['avg_games_per_year'], platform_stats['total_sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes, 
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('So, do platforms with more game releases always have higher total sales? \n')
print('Not strictly always, but there is a strong positive correlation between the two quantities.')
So, do platforms with more game releases always have higher total sales? 

Not strictly always, but there is a strong positive correlation between the two quantities.
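
Since the sales distribution is heavy-tailed, a rank-based correlation is a useful robustness check; a minimal sketch:

In [ ]:
# Rank-based (Spearman) check, less sensitive to the heavy tail of the sales distribution
from scipy.stats import spearmanr
rho, p = spearmanr(platform_stats['avg_games_per_year'], platform_stats['total_sales'])
print(f'Spearman correlation: {rho:.3f}, p-value: {p:.3f}')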
In [142]:
# Are there genre-platform combinations that are especially successful?

combined_stats = data.groupby(['Platform',
    'Genre']).agg({'Global_Sales': 'sum'}).sort_values('Global_Sales', ascending=False).reset_index().head(10)
combined_stats
Out[142]:
Platform Genre Global_Sales
0 PS3 Action 307.88
1 Wii Sports 292.06
2 X360 Shooter 278.55
3 PS2 Sports 273.41
4 PS2 Action 272.76
5 X360 Action 242.67
6 Wii Misc 221.06
7 PS3 Shooter 196.04
8 PS2 Racing 156.28
9 X360 Sports 139.12
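
The top-10 list above picks out the largest cells of the full genre-by-platform grid; a minimal sketch of that grid as a heatmap:

In [ ]:
# Full Genre x Platform grid of global sales; the table above lists its largest cells
sales_grid = data.pivot_table(index='Genre', columns='Platform',
                              values='Global_Sales', aggfunc='sum', fill_value=0)
plt.figure(figsize=(14, 6))
sns.heatmap(sales_grid, cmap='plasma', linewidths=0.3)
plt.title('Global Sales by Genre and Platform', fontsize=18)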
In [220]:
# Which publishers dominate certain genres (e.g., EA in Sports)?

# 1. Domination by number of games released
# Note: the table below breaks each genre down by platform; a publisher-level view follows after it
genre_stats = data.groupby(['Genre', 'Platform']).size().rename('count').reset_index()
genre_stats.sort_values(['Genre', 'count'], ascending=[True, False], inplace=True)

# List of genres
genres = data['Genre'].unique()
genres.sort()
# Create a detailed summary table
summary_data = []
for g in genres:
    gdata = genre_stats[genre_stats['Genre'] == g].head(3)
    for i, row in gdata.iterrows():
        summary_data.append({
            'Genre': g, 
            'Platform': row['Platform'], 
            'Games': row['count']
        })

detailed_df = pd.DataFrame(summary_data)
display(detailed_df)
Genre Platform Games
0 Action PS3 380
1 Action PS2 348
2 Action DS 343
3 Adventure DS 240
4 Adventure PSP 213
5 Adventure PS2 196
6 Fighting PS2 150
7 Fighting PS 108
8 Fighting PS3 76
9 Misc DS 393
10 Misc Wii 280
11 Misc PS2 222
12 Platform GBA 142
13 Platform PS2 103
14 Platform DS 92
15 Puzzle DS 238
16 Puzzle Wii 55
17 Puzzle PSP 44
18 Racing PS2 216
19 Racing PS 145
20 Racing XB 123
21 Role-Playing DS 200
22 Role-Playing PSP 192
23 Role-Playing PS2 187
24 Shooter X360 203
25 Shooter PS2 160
26 Shooter PS3 156
27 Simulation DS 285
28 Simulation PC 115
29 Simulation PS2 90
30 Sports PS2 400
31 Sports Wii 261
32 Sports PS 222
33 Strategy PC 188
34 Strategy DS 79
35 Strategy PS2 71
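
The table above breaks each genre down by platform; a minimal sketch of the publisher view the opening question asks about (e.g., EA in Sports), by number of games released:

In [ ]:
# Top publisher per genre by number of games released
pub_counts = data.groupby(['Genre', 'Publisher']).size().rename('games').reset_index()
top_pub_per_genre = pub_counts.loc[pub_counts.groupby('Genre')['games'].idxmax()].reset_index(drop=True)
top_pub_per_genre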
In [236]:
# Is there a correlation between NA_Sales and Global_Sales?
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('Global Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Calculate correlation coefficient and statistical significance
corr_coef, p_value = pearsonr(data['NA_Sales'], data['Global_Sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes, 
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('Is there a correlation between NA_Sales and Global_Sales? \n')
print('Yes, there is a strong positive correlation, which is expected since North American sales are a component of global sales.')
Is there a correlation between NA_Sales and Global_Sales? 

Yes, there is a strong positive correlation, which is expected since North American sales are a component of global sales.
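
For context, a minimal sketch comparing how strongly each regional sales column correlates with Global_Sales:

In [ ]:
# Correlation of each sales column with Global_Sales
data[sales_regions].corr()['Global_Sales'].sort_values(ascending=False)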
In [247]:
# Close-up for smaller values
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('Global Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)
plt.xlim([0,5])
plt.ylim([0,9])
Out[247]:
(0.0, 9.0)
In [ ]: