import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
# Load the video game sales dataset from the working directory.
data = pd.read_csv("vgsales.csv")
def df_info(df):
    """Display basic information about the dataset.

    Shows the first five rows, the dtype/non-null summary and the
    transposed description of *df* (uses the notebook's `display`).

    Args:
        df: DataFrame to inspect.
    """
    # Show every column instead of pandas' truncated view.
    pd.set_option('display.max_columns', None)
    print("\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    print("\033[1mInfo about the data:\n\033[0m")
    # BUG FIX: df.info() prints its report and returns None, so wrapping
    # it in display() also rendered a spurious "None" — call it directly.
    df.info()
    print("\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)
def df_dupl_null(df):
    """Drop duplicate rows in-place and report null-value percentages.

    Args:
        df: DataFrame to clean (mutated in-place when duplicates exist).

    Returns:
        Series mapping each column to its percentage of null values,
        rounded to two decimals.
    """
    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates == 0:
        print("\033[1mThere's no duplicates in the DataFrame.\033[0m")
    else:
        # BUG FIX: the original message referenced the non-public attribute
        # `df._name`, which plain DataFrames do not have, so this branch
        # raised AttributeError whenever duplicates were actually found.
        print(f"\033[1mThere's {duplicates} duplicates in the DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    # Show every row of the null-percentage listing.
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print("\033[1m \nList of null values for each column in percents:\033[0m \n")
    return round((df.isnull().sum() / rows) * 100, 2)
def df_outliers(df):
    """Detect IQR outliers in every numeric, non-id column.

    A value counts as an outlier when it lies outside
    [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Args:
        df: DataFrame to scan.

    Returns:
        DataFrame with columns 'column_name', 'num_of_outliers' and
        'pctg_of_outliers' (percentage of rows, rounded to 2 decimals).
    """
    print("\033[1m Outliers detection: \033[0m ")
    # Get numeric columns excluding any columns with 'id' in their name.
    numeric_cols = [col for col in df.select_dtypes(include='number').columns
                    if 'id' not in col.lower()]
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
    rows = df.shape[0]
    # BUG FIX: the original rounded the ratio *before* multiplying by 100
    # (round(x, 3) * 100), losing precision and producing float artifacts
    # such as 10.100000000000001. Round the percentage itself instead.
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 2)]
         for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )
    return outliers_df
def df_summary(df):
    """Print a post-cleaning summary and drop constant columns in-place.

    Columns holding a single unique value are removed from *df*; every
    remaining column is listed with its unique-value count.

    Args:
        df: DataFrame to summarise (mutated in-place).
    """
    print("\033[1mCleaning of the DataFrame completed!\033[0m")
    # NOTE: the shape is reported before constant columns are dropped below.
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    # BUG FIX: iterate over a snapshot of the labels so dropping columns
    # in-place cannot interfere with the iteration.
    for column in list(df.columns):
        if df[column].nunique() == 1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
# Inspect the raw dataset: head, dtype/non-null info and describe().
df_info(data)
5 head rows of the data:
Rank | Name | Platform | Year | Genre | Publisher | NA_Sales | EU_Sales | JP_Sales | Other_Sales | Global_Sales | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | Wii Sports | Wii | 2006.0 | Sports | Nintendo | 41.49 | 29.02 | 3.77 | 8.46 | 82.74 |
1 | 2 | Super Mario Bros. | NES | 1985.0 | Platform | Nintendo | 29.08 | 3.58 | 6.81 | 0.77 | 40.24 |
2 | 3 | Mario Kart Wii | Wii | 2008.0 | Racing | Nintendo | 15.85 | 12.88 | 3.79 | 3.31 | 35.82 |
3 | 4 | Wii Sports Resort | Wii | 2009.0 | Sports | Nintendo | 15.75 | 11.01 | 3.28 | 2.96 | 33.00 |
4 | 5 | Pokemon Red/Pokemon Blue | GB | 1996.0 | Role-Playing | Nintendo | 11.27 | 8.89 | 10.22 | 1.00 | 31.37 |
Info about the data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Rank 16598 non-null int64
1 Name 16598 non-null object
2 Platform 16598 non-null object
3 Year 16327 non-null float64
4 Genre 16598 non-null object
5 Publisher 16540 non-null object
6 NA_Sales 16598 non-null float64
7 EU_Sales 16598 non-null float64
8 JP_Sales 16598 non-null float64
9 Other_Sales 16598 non-null float64
10 Global_Sales 16598 non-null float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB
None
Description of the data:
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Rank | 16598.0 | 8300.605254 | 4791.853933 | 1.00 | 4151.25 | 8300.50 | 12449.75 | 16600.00 |
Year | 16327.0 | 2006.406443 | 5.828981 | 1980.00 | 2003.00 | 2007.00 | 2010.00 | 2020.00 |
NA_Sales | 16598.0 | 0.264667 | 0.816683 | 0.00 | 0.00 | 0.08 | 0.24 | 41.49 |
EU_Sales | 16598.0 | 0.146652 | 0.505351 | 0.00 | 0.00 | 0.02 | 0.11 | 29.02 |
JP_Sales | 16598.0 | 0.077782 | 0.309291 | 0.00 | 0.00 | 0.00 | 0.04 | 10.22 |
Other_Sales | 16598.0 | 0.048063 | 0.188588 | 0.00 | 0.00 | 0.01 | 0.04 | 10.57 |
Global_Sales | 16598.0 | 0.537441 | 1.555028 | 0.01 | 0.06 | 0.17 | 0.47 | 82.74 |
# Drop duplicates (if any) and list the null percentage per column.
df_dupl_null(data)
There's no duplicates in the DataFrame. List of null values for each column in percents:
Rank 0.00 Name 0.00 Platform 0.00 Year 1.63 Genre 0.00 Publisher 0.35 NA_Sales 0.00 EU_Sales 0.00 JP_Sales 0.00 Other_Sales 0.00 Global_Sales 0.00 dtype: float64
# Change the Year column type to integer; missing years become the 0
# sentinel (filtered out again before the age-based analyses further down).
data['Year'] = data['Year'].fillna(0).astype(int)
# Fill the nulls in the Publisher column with 'Unknown'
data['Publisher'] = data['Publisher'].fillna('Unknown')
# IQR outlier report for the numeric columns.
df_outliers(data)
Outliers detection:
column_name | num_of_outliers | pctg_of_outliers | |
---|---|---|---|
0 | Rank | 0 | 0.0 |
1 | Year | 576 | 3.5 |
2 | NA_Sales | 1681 | 10.1 |
3 | EU_Sales | 2081 | 12.5 |
4 | JP_Sales | 2425 | 14.6 |
5 | Other_Sales | 1665 | 10.0 |
6 | Global_Sales | 1893 | 11.4 |
# Summary of the cleaned data (also drops constant columns, if any).
df_summary(data)
Cleaning of the DataFrame completed! 16598 rows and 11 columns remaining. - Rank: 16598 unique values - Name: 11493 unique values - Platform: 31 unique values - Year: 40 unique values - Genre: 12 unique values - Publisher: 578 unique values - NA_Sales: 409 unique values - EU_Sales: 305 unique values - JP_Sales: 244 unique values - Other_Sales: 157 unique values - Global_Sales: 623 unique values
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
# The most frequent game names, i.e., titles released on many different platforms.
# NOTE(review): the rename below relies on the older pandas layout of
# value_counts().reset_index() (columns 'index'/'Name'); newer pandas already
# returns 'Name'/'count' — verify against the installed pandas version.
top_names = pd.DataFrame(data['Name'].value_counts()).reset_index().rename(columns={'index': 'name', 'Name': 'count'})
# Dense rank of the release counts, so tied titles share a rank.
top_names['rank'] = top_names['count'].rank(method='dense',
                                            ascending=False)
# Selecting games with rank up to 3
top3_names = top_names[top_names['rank'] <= 3].sort_values(['rank', 'name'],
                                                           ignore_index=True)
# Group by rank and aggregate the names.
# NOTE(review): rank_groups is never used afterwards — the report loop below
# works directly on top3_names. Kept in case later (unseen) cells need it.
rank_groups = top3_names.groupby('rank').agg({
    'name': lambda x: ', '.join(sorted(x)),
    'count': 'first'  # All games at the same rank share the same count
}).reset_index()
print("🎮 GAME RELEASES FREQUENCY RANKINGS 🎮")
print("=" * 40)
for rank in sorted(top3_names['rank'].unique()):
    games_at_rank = top3_names[top3_names['rank'] == rank]
    count = games_at_rank['count'].iloc[0]
    game_names = games_at_rank['name'].tolist()
    print(f"\nRank {int(rank)} ({count} releases each):")
    for game in game_names:
        print(f"  • {game}")
🎮 GAME RELEASES FREQUENCY RANKINGS 🎮 ======================================== Rank 1 (12 releases each): • Need for Speed: Most Wanted Rank 2 (9 releases each): • FIFA 14 • LEGO Marvel Super Heroes • Madden NFL 07 • Ratatouille Rank 3 (8 releases each): • Angry Birds Star Wars • Cars • FIFA 15 • FIFA Soccer 13 • LEGO Harry Potter: Years 5-7 • LEGO Jurassic World • LEGO Star Wars II: The Original Trilogy • LEGO The Hobbit • Lego Batman 3: Beyond Gotham • Madden NFL 08 • Monopoly • Terraria • The LEGO Movie Videogame
def create_hbarplot(data, x_col, y_col, title, x_label, y_label, xlim=None,
                    figsize=(12, 8), bar_labels=False, xscale='linear', lscale=1, order=None):
    """
    Create a horizontal bar plot with customized styling.

    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis values
        y_col: Column name for y-axis categories
        title: Plot title
        x_label: label for the x axis
        y_label: label for the y axis
        xlim: Optional tuple for x-axis limits (min, max)
        figsize: Optional tuple for figure size
        bar_labels: Optional, add customized labels to the bars
        xscale: Optional, sets the bar_labels behavior depending on the type of x axis scaling
        lscale: Optional, a multiplier controlling the bar labels placing
        order: Optional. Sns barplot automatically sorts categorical data on the y axis, so in
            order to keep the desired form of sorting, one has to pass a list with
            the proper ordering

    Returns:
        (fig, ax): the created Figure and Axes.
    """
    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    # Create the bar plot
    bars = sns.barplot(
        data=data,
        x=x_col,
        y=y_col,
        palette='plasma',
        edgecolor='gray',
        ax=ax,
        orient='h',
        order=order
    )
    # Customize the plot
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title, fontsize=18)
    ax.grid(axis='x', linestyle='--', alpha=0.9)
    # Target this Axes explicitly instead of relying on pyplot's
    # "current axes" state (was plt.xscale(xscale)).
    ax.set_xscale(xscale)
    if xlim:
        ax.set_xlim(xlim)
    else:
        ax.set_xlim(0, max(data[x_col]) * 1.1)
    # Add labels to the bars.
    # BUG FIX: the label values and offsets previously read the hard-coded
    # column 'count'; use x_col so labels work for any value column. This is
    # backward compatible: every existing caller passes x_col='count'
    # whenever bar_labels=True.
    if bar_labels:
        for index, row in data.iterrows():
            if xscale == 'linear':
                # Fixed offsets on a linear axis, scaled by lscale.
                if row[x_col] <= 1000:
                    posx = row[x_col] + 150 * lscale
                else:
                    posx = row[x_col] + 200 * lscale
            elif xscale == 'log':
                # Multiplicative offsets so labels clear the bar on a log axis.
                if row[x_col] < 11:
                    posx = row[x_col] * (1 + 0.2)
                elif 10 < row[x_col] < 101:
                    posx = row[x_col] * (1 + 0.35)
                elif 100 < row[x_col] < 1001:
                    posx = row[x_col] * (1 + 0.45)
                else:
                    posx = row[x_col] * (1 + 0.55)
            ax.text(posx,
                    index,
                    str(round(row[x_col], 2)),
                    va='center',
                    color='black',
                    ha='right')
    plt.tight_layout()
    return fig, ax
# Game platforms frequency, i.e., how many games were released on each platform.
# NOTE(review): the rename relies on older pandas where reset_index() yields
# columns 'index'/'Platform' — verify against the installed pandas version.
platforms = pd.DataFrame(data['Platform'].value_counts()).reset_index().rename(columns={'index': 'platform',
                                                                                        'Platform': 'count'})
# Log x-scale: release counts span several orders of magnitude.
create_hbarplot(data=platforms, x_col='count', y_col='platform',
                title='Distribution of the Gaming Platform Frequency',
                x_label='Frequency (Game Releases)', y_label='Gaming Platform',
                xlim=[0.6, max(platforms['count']) * 1.6],
                figsize=(15,10),
                bar_labels=True,
                xscale='log')
(<Figure size 1500x1000 with 1 Axes>, <AxesSubplot:title={'center':'Distribution of the Gaming Platform Frequency'}, xlabel='Frequency (Game Releases)', ylabel='Gaming Platform'>)
# Years frequency, i.e., how many games were released each year
years = pd.DataFrame(data['Year'].value_counts()).reset_index().rename(columns={'index': 'year',
                                                                                'Year': 'count'}).sort_values('year').reset_index()
# Delete the row with the unknown year (the 0 sentinel sorts first).
years.drop(index = 0, inplace=True)
# NOTE(review): reset_index() is called twice without drop=True, leaving the
# previous indices behind as extra columns — harmless for the plot, but untidy.
years.reset_index(inplace=True)
create_hbarplot(data=years, x_col='count', y_col='year',
                title='Distribution of the Games Releases over Years',
                x_label='Frequency (Game Releases)', y_label='Year',
                xlim=[0.6, max(years['count']) * 1.8],
                figsize=(10,12),
                bar_labels=True,
                xscale='log')
(<Figure size 1000x1200 with 1 Axes>, <AxesSubplot:title={'center':'Distribution of the Games Releases over Years'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
# Genre frequency, i.e., which game genres have the most releases.
genres = pd.DataFrame(data['Genre'].value_counts()).reset_index().rename(columns={'index': 'genre',
                                                                                  'Genre': 'count'})
# BUG FIX: the y axis shows genres, but its label said 'Year' (copy-paste
# from the previous plot).
create_hbarplot(data=genres, x_col='count', y_col='genre',
                title='Distribution of the Games Releases by Genre',
                x_label='Frequency (Game Releases)', y_label='Genre',
                xlim=[150, max(genres['count']) * 1.1],
                figsize=(12,8),
                bar_labels=True,
                xscale='linear')
(<Figure size 1200x800 with 1 Axes>, <AxesSubplot:title={'center':'Distribution of the Games Releases by Genre'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
# Publisher frequency: top 10 companies by number of releases.
publishers = pd.DataFrame(data['Publisher'].value_counts()).reset_index().rename(columns={'index': 'publisher',
                                                                                          'Publisher': 'count'}).head(10)
# lscale shrinks the fixed label offset to fit the narrower value range.
create_hbarplot(data=publishers, x_col='count', y_col='publisher',
                title='Distribution of the Games Releases by Top 10 Publishers',
                x_label='Frequency (Game Releases)', y_label='Publisher',
                xlim=[300, max(publishers['count']) * 1.1],
                figsize=(12,6),
                bar_labels=True,
                xscale='linear',
                lscale=0.4)
(<Figure size 1200x600 with 1 Axes>, <AxesSubplot:title={'center':'Distribution of the Games Releases by Top 10 Publishers'}, xlabel='Frequency (Game Releases)', ylabel='Publisher'>)
# Descriptive sales statistics for each world region (values in millions).
sales_regions = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']
for region in sales_regions:
    sales = pd.DataFrame(data[region])
    display(sales.describe().T)
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
NA_Sales | 16598.0 | 0.264667 | 0.816683 | 0.0 | 0.0 | 0.08 | 0.24 | 41.49 |
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
EU_Sales | 16598.0 | 0.146652 | 0.505351 | 0.0 | 0.0 | 0.02 | 0.11 | 29.02 |
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
JP_Sales | 16598.0 | 0.077782 | 0.309291 | 0.0 | 0.0 | 0.0 | 0.04 | 10.22 |
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Other_Sales | 16598.0 | 0.048063 | 0.188588 | 0.0 | 0.0 | 0.01 | 0.04 | 10.57 |
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
Global_Sales | 16598.0 | 0.537441 | 1.555028 | 0.01 | 0.06 | 0.17 | 0.47 | 82.74 |
# The best selling games by world region
region_names = {'NA_Sales': 'North America', 'EU_Sales': 'Europe', 'JP_Sales': 'Japan',
                'Other_Sales': 'Other Regions', 'Global_Sales': 'Globally'}
for region in sales_regions:
    # Top 5 games by sales in this region.
    region_data = data[['Name','Year',region]].sort_values(region, ascending=False).reset_index(drop=True).head(5)
    plot_title = 'The Best Selling Games: ' + region_names[region]
    fig, ax = create_hbarplot(data=region_data, x_col=region, y_col='Name',
                              title=plot_title,
                              x_label='Sales (in millions)', y_label='Game (Year of Release)',
                              xlim=[2, max(region_data[region]) * 1.05],
                              figsize=(8,4))
    # Write the sales value inside each bar and append the release year
    # to the y tick labels.
    yticks = list()
    for index, row in region_data.iterrows():
        yticks += [str(row['Name']) + ' (' + str(row['Year']) + ')']
        ax.text(row[region],
                index,
                str(round(row[region],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
    ax.set_yticks(ticks=list(region_data.index), labels=yticks)
# The best selling features like: platform, genre, year, publisher (globally)
features = ['Platform', 'Genre', 'Year', 'Publisher']
g = 'Global_Sales'  # for easier retrieval
for f in features:
    # Top 5 categories of this feature by total global sales.
    f_data = data[[f, g]].groupby(f).sum().sort_values(g, ascending=False).reset_index().head(5)
    plot_title = 'Top sellers: ' + f
    # order= preserves the sales-sorted ordering (seaborn would otherwise
    # sort the categories itself).
    fig, ax = create_hbarplot(data=f_data, x_col=g, y_col=f,
                              title=plot_title,
                              x_label='Sales (in millions)', y_label=f,
                              xlim=[min(f_data[g])*0.3, max(f_data[g]) * 1.05],
                              figsize=(8,4),
                              order=f_data[f].tolist())
    # Sales value printed inside each bar.
    for index, row in f_data.iterrows():
        ax.text(row[g],
                index,
                str(round(row[g],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
# Games released in the most recent years of the dataset.
# NOTE(review): Year >= max-5 actually spans six calendar years inclusively —
# confirm whether "last 5 years" is meant.
recent_data = data[(data['Year'] >= (max(data['Year'])-5))]
# The best selling "features" of the recent years, ranked by European sales.
EUfeatures = ['Name', 'Platform', 'Genre', 'Publisher']
# Dictionary to store the relevant data for each top 5 features
EUdata = {}
for f in EUfeatures:
    EUdata[f] = recent_data[[f,
                             'EU_Sales']].groupby(f).sum().sort_values('EU_Sales',ascending=False).reset_index().head(5)
    fig, ax = create_hbarplot(data=EUdata[f], x_col='EU_Sales', y_col=f,
                              title='The Best Selling ' + f +'s',
                              x_label='Sales (in millions)', y_label=f,
                              xlim=[0, max(EUdata[f]['EU_Sales']) * 1.05],
                              figsize=(10,4))
    # Sales value printed just inside the right end of each bar.
    for index, row in EUdata[f].iterrows():
        ax.text(row['EU_Sales']-1,
                index,
                str(round(row['EU_Sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
# Time trends of the features - time of the existence + sales
# NOTE(review): the reference year is hard-coded; the dataset's latest
# release year is 2020, so game_age = 2020 - Year. Update if the data changes.
now = 2020
# Drop the rows whose release year is unknown (the 0 sentinel).
filtered_data = data[data['Year'] != 0].copy()
# New column - game age in years.
filtered_data['game_age'] = now - filtered_data['Year']
# Distribution of the game ages
plt.figure(figsize=(12,4))
sns.histplot(filtered_data['game_age'], color='purple', binwidth=1, alpha=0.5)
plt.xlabel('Game Age')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.title('Distribution of the Game Ages')
Text(0.5, 1.0, 'Distribution of the Game Ages')
# New column - average European sales per year since release.
filtered_data['EU_mean_sales'] = filtered_data['EU_Sales'] / filtered_data['game_age']
meandata = {}
for f in EUfeatures:
    # Top 5 categories of this feature by summed per-year EU sales.
    meandata[f] = filtered_data[[f,
                                 'game_age', 'EU_mean_sales']].groupby(f).sum().sort_values('EU_mean_sales',ascending=False).reset_index().head(5)
    fig, ax = create_hbarplot(data=meandata[f], x_col='EU_mean_sales', y_col=f,
                              title='The Best Selling ' + f +'s',
                              x_label='Mean Sales (in millions)', y_label=f,
                              xlim=[0, max(meandata[f]['EU_mean_sales']) * 1.05],
                              figsize=(10,4))
    # Value label at the right end of each bar.
    for index, row in meandata[f].iterrows():
        ax.text(row['EU_mean_sales'],
                index,
                str(round(row['EU_mean_sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
# Per-year EU sales averaged over all games of each age.
meanyears = filtered_data[['EU_mean_sales', 'game_age']].groupby('game_age').mean().reset_index()
plt.figure(figsize=(10,4))
# ci=None disables the bootstrap confidence band (newer seaborn versions
# use errorbar=None instead — verify against the installed seaborn).
sns.lineplot(data=meanyears, x='game_age', y='EU_mean_sales', color='purple', marker='o', ci=None)
plt.xlabel("Game's Age")
plt.ylabel('Mean Sales in Europe (in millions)')
plt.title("Distribution of Mean Sales by the Game's Age", fontsize=18, weight='bold')
plt.grid(axis='both', linestyle='--', alpha=0.6)
print('There are two well selling groups: recent hits (age < 10) and old classics (age > 25).')
There are two well selling groups: recent hits (age < 10) and old classics (age > 25).
# Identifying the recent hits (age <= 10) by per-year EU sales.
recent_hits = filtered_data[filtered_data['game_age'] <=10 ][['Name', 'Platform','Year', 'Genre','Publisher',
                                                              'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)
display(recent_hits)
Name | Platform | Year | Genre | Publisher | game_age | EU_mean_sales | |
---|---|---|---|---|---|---|---|
16 | Grand Theft Auto V | PS3 | 2013 | Action | Take-Two Interactive | 7 | 1.324286 |
77 | FIFA 16 | PS4 | 2015 | Sports | Electronic Arts | 5 | 1.212000 |
33 | Call of Duty: Black Ops 3 | PS4 | 2015 | Shooter | Activision | 5 | 1.162000 |
44 | Grand Theft Auto V | PS4 | 2014 | Action | Take-Two Interactive | 6 | 0.968333 |
221 | FIFA 17 | PS4 | 2016 | Sports | Electronic Arts | 4 | 0.937500 |
23 | Grand Theft Auto V | X360 | 2013 | Action | Take-Two Interactive | 7 | 0.758571 |
34 | Call of Duty: Black Ops II | PS3 | 2012 | Shooter | Activision | 8 | 0.735000 |
124 | FIFA 15 | PS4 | 2014 | Sports | Electronic Arts | 6 | 0.715000 |
92 | Star Wars Battlefront (2015) | PS4 | 2015 | Shooter | Electronic Arts | 5 | 0.658000 |
37 | Call of Duty: Modern Warfare 3 | PS3 | 2011 | Shooter | Activision | 9 | 0.646667 |
# Identifying the old classics
old_classics = filtered_data[filtered_data['game_age'] >= 25 ][['Name', 'Platform','Year', 'Genre','Publisher',
'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)
display(old_classics)
Name | Platform | Year | Genre | Publisher | game_age | EU_mean_sales | |
---|---|---|---|---|---|---|---|
18 | Super Mario World | SNES | 1990 | Platform | Nintendo | 30 | 0.125000 |
22 | Super Mario Bros. 3 | NES | 1988 | Platform | Nintendo | 32 | 0.107500 |
521 | Myst | PC | 1994 | Adventure | Red Orb | 26 | 0.107308 |
1 | Super Mario Bros. | NES | 1985 | Platform | Nintendo | 35 | 0.102286 |
267 | Warcraft II: Tides of Darkness | PC | 1995 | Strategy | Activision | 25 | 0.090800 |
21 | Super Mario Land | GB | 1989 | Platform | Nintendo | 31 | 0.087419 |
57 | Super Mario All-Stars | SNES | 1993 | Platform | Nintendo | 27 | 0.079630 |
5 | Tetris | GB | 1989 | Puzzle | Nintendo | 31 | 0.072903 |
50 | Super Mario Land 2: 6 Golden Coins | GB | 1992 | Adventure | Nintendo | 28 | 0.072857 |
71 | Donkey Kong Country | SNES | 1994 | Platform | Nintendo | 26 | 0.065769 |
# Middle age games (ages 16-24; range(16, 25) excludes the upper bound).
# NOTE(review): ages 11-15 fall outside every group examined here — confirm
# that the gap is intentional.
middle_age = filtered_data[filtered_data['game_age'].isin(list(range(16,25)))][['Name', 'Platform','Year', 'Genre','Publisher',
                                                                                'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)
display(middle_age)
Name | Platform | Year | Genre | Publisher | game_age | EU_mean_sales | |
---|---|---|---|---|---|---|---|
137 | World of Warcraft | PC | 2004 | Role-Playing | Activision | 16 | 0.388125 |
4 | Pokemon Red/Pokemon Blue | GB | 1996 | Role-Playing | Nintendo | 24 | 0.370417 |
24 | Grand Theft Auto: Vice City | PS2 | 2002 | Action | Take-Two Interactive | 18 | 0.305000 |
12 | Pokemon Gold/Pokemon Silver | GB | 1999 | Role-Playing | Nintendo | 21 | 0.294286 |
28 | Gran Turismo 3: A-Spec | PS2 | 2001 | Racing | Sony Computer Entertainment | 19 | 0.267895 |
38 | Grand Theft Auto III | PS2 | 2001 | Action | Take-Two Interactive | 19 | 0.237368 |
30 | Pokémon Yellow: Special Pikachu Edition | GB | 1998 | Role-Playing | Nintendo | 22 | 0.229091 |
25 | Pokemon Ruby/Pokemon Sapphire | GBA | 2002 | Role-Playing | Nintendo | 18 | 0.216667 |
59 | Super Mario 64 | DS | 2004 | Platform | Nintendo | 16 | 0.194375 |
113 | Need for Speed Underground 2 | PS2 | 2004 | Racing | Electronic Arts | 16 | 0.188750 |
# Recap of the recent-hits table and the conclusions drawn from it.
# BUG FIX: typo "shoul" -> "should" in the user-facing message.
print("To decide what game one should produce, based on recent years, let's take one more look on the recent_hits DataFrame:")
display(recent_hits)
print('The favorable features are:\n')
print('Platform: PS4 or PS3\n')
print('Genre: Shooter, Action or Sports\n')
print('Publisher: Electronic Arts, Take-Two Interactive or Activision')
To decide what game one should produce, based on recent years, let's take one more look on the recent_hits DataFrame:
Name | Platform | Year | Genre | Publisher | game_age | EU_mean_sales | |
---|---|---|---|---|---|---|---|
16 | Grand Theft Auto V | PS3 | 2013 | Action | Take-Two Interactive | 7 | 1.324286 |
77 | FIFA 16 | PS4 | 2015 | Sports | Electronic Arts | 5 | 1.212000 |
33 | Call of Duty: Black Ops 3 | PS4 | 2015 | Shooter | Activision | 5 | 1.162000 |
44 | Grand Theft Auto V | PS4 | 2014 | Action | Take-Two Interactive | 6 | 0.968333 |
221 | FIFA 17 | PS4 | 2016 | Sports | Electronic Arts | 4 | 0.937500 |
23 | Grand Theft Auto V | X360 | 2013 | Action | Take-Two Interactive | 7 | 0.758571 |
34 | Call of Duty: Black Ops II | PS3 | 2012 | Shooter | Activision | 8 | 0.735000 |
124 | FIFA 15 | PS4 | 2014 | Sports | Electronic Arts | 6 | 0.715000 |
92 | Star Wars Battlefront (2015) | PS4 | 2015 | Shooter | Electronic Arts | 5 | 0.658000 |
37 | Call of Duty: Modern Warfare 3 | PS3 | 2011 | Shooter | Activision | 9 | 0.646667 |
The favorable features are: Platform: PS4 or PS3 Genre: Shooter, Action or Sports Publisher: Electronic Arts, Take-Two Interactive or Activision
# Which platform had the most titles released per year?
# Per platform: number of releases, number of active years (with first and
# last year) and total global sales; rows with unknown year excluded.
platform_stats = data[data['Year'] != 0].groupby('Platform').agg({
    'Year': ['count', 'nunique', 'min', 'max'], 'Global_Sales': 'sum'
}).reset_index()
# Flatten the MultiIndex column names produced by the multi-function agg.
platform_stats.columns = ['Platform', 'total_games', 'active_years', 'first_year', 'last_year', 'total_sales']
# Average releases per *active* year (years with at least one release).
platform_stats['avg_games_per_year'] = round(platform_stats['total_games'] / platform_stats['active_years'], 2)
platform_stats.sort_values('avg_games_per_year', ascending=False, inplace=True)
platform_stats.reset_index(drop=True, inplace=True)
platform_stats
Platform | total_games | active_years | first_year | last_year | total_sales | avg_games_per_year | |
---|---|---|---|---|---|---|---|
0 | PS2 | 2127 | 12 | 2000 | 2011 | 1233.46 | 177.25 |
1 | DS | 2133 | 13 | 1985 | 2020 | 818.96 | 164.08 |
2 | Wii | 1290 | 10 | 2006 | 2015 | 909.81 | 129.00 |
3 | PS | 1189 | 10 | 1994 | 2003 | 727.39 | 118.90 |
4 | PS3 | 1304 | 11 | 2006 | 2016 | 949.35 | 118.55 |
5 | X360 | 1235 | 12 | 2005 | 2016 | 969.61 | 102.92 |
6 | GBA | 811 | 8 | 2000 | 2007 | 313.56 | 101.38 |
7 | PSP | 1197 | 12 | 2004 | 2015 | 291.71 | 99.75 |
8 | XB | 803 | 9 | 2000 | 2008 | 252.09 | 89.22 |
9 | 3DS | 500 | 6 | 2011 | 2016 | 246.28 | 83.33 |
10 | GC | 542 | 7 | 2001 | 2007 | 197.14 | 77.43 |
11 | PS4 | 336 | 5 | 2013 | 2017 | 278.10 | 67.20 |
12 | PSV | 412 | 7 | 2011 | 2017 | 61.63 | 58.86 |
13 | XOne | 213 | 4 | 2013 | 2016 | 141.06 | 53.25 |
14 | N64 | 316 | 7 | 1996 | 2002 | 218.21 | 45.14 |
15 | PC | 943 | 26 | 1985 | 2016 | 255.05 | 36.27 |
16 | SAT | 173 | 6 | 1994 | 1999 | 33.59 | 28.83 |
17 | WiiU | 143 | 5 | 2012 | 2016 | 81.86 | 28.60 |
18 | SNES | 239 | 10 | 1990 | 1999 | 200.05 | 23.90 |
19 | 2600 | 116 | 10 | 1980 | 1989 | 86.57 | 11.60 |
20 | NES | 98 | 12 | 1983 | 1994 | 251.07 | 8.17 |
21 | GB | 97 | 13 | 1988 | 2001 | 254.42 | 7.46 |
22 | DC | 52 | 7 | 1998 | 2008 | 15.97 | 7.43 |
23 | GEN | 27 | 5 | 1990 | 1994 | 28.36 | 5.40 |
24 | SCD | 6 | 2 | 1993 | 1994 | 1.87 | 3.00 |
25 | NG | 12 | 4 | 1993 | 1996 | 1.44 | 3.00 |
26 | WS | 6 | 3 | 1999 | 2001 | 1.42 | 2.00 |
27 | TG16 | 2 | 1 | 1995 | 1995 | 0.16 | 2.00 |
28 | 3DO | 3 | 2 | 1994 | 1995 | 0.10 | 1.50 |
29 | GG | 1 | 1 | 1992 | 1992 | 0.04 | 1.00 |
30 | PCFX | 1 | 1 | 1996 | 1996 | 0.03 | 1.00 |
# Top 10 platforms by average game releases per active year.
fig, ax = create_hbarplot(data=platform_stats.head(10), x_col='avg_games_per_year', y_col='Platform',
                          title='Average Game Releases of Platforms per Year (Top 10)',
                          x_label='Game Releases per Year', y_label='Platform',
                          xlim=[0, max(platform_stats.head(10)['avg_games_per_year']) * 1.05],
                          figsize=(10,6))
# Value label just inside the right end of each bar.
for index, row in platform_stats.head(10).iterrows():
    ax.text(row['avg_games_per_year']-1,
            index,
            str(round(row['avg_games_per_year'],2)),
            va='center',
            color='white',
            ha='right',
            weight='bold')
# Do platforms with more games released per year also have higher total sales?
# BUG FIX: the question and the printed conclusion said "publishers", but the
# data analysed here is per-platform (platform_stats); also fixed the
# "Relesed" typo in the axis label.
plt.figure(figsize=(6,5))
sns.regplot(x=platform_stats['avg_games_per_year'], y=platform_stats['total_sales'], color='purple')
plt.title('Platform Sales vs Game Release Rate', fontsize=18, weight='bold')
plt.xlabel('Average Games Released per Year')
plt.ylabel('Total Sales (in million)')
plt.grid(axis='both', linestyle='--', alpha=0.7)
# Calculate correlation coefficient and statistical significance
from scipy.stats import pearsonr
corr_coef, p_value = pearsonr(platform_stats['avg_games_per_year'], platform_stats['total_sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes,
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('So, do platforms with more game releases always have higher total sales? \n')
print('Yes, there is a strong correlation between the two quantities.')
So, do platforms with more game releases always have higher total sales? Yes, there is a strong correlation between the two quantities.
# Are there genre-platform combinations that are especially successful?
# Top 10 (platform, genre) pairs by total global sales.
combined_stats = data.groupby(['Platform',
                               'Genre']).agg({'Global_Sales': 'sum'}).sort_values('Global_Sales', ascending=False).reset_index().head(10)
combined_stats
Platform | Genre | Global_Sales | |
---|---|---|---|
0 | PS3 | Action | 307.88 |
1 | Wii | Sports | 292.06 |
2 | X360 | Shooter | 278.55 |
3 | PS2 | Sports | 273.41 |
4 | PS2 | Action | 272.76 |
5 | X360 | Action | 242.67 |
6 | Wii | Misc | 221.06 |
7 | PS3 | Shooter | 196.04 |
8 | PS2 | Racing | 156.28 |
9 | X360 | Sports | 139.12 |
# Which platforms dominate certain genres?
# NOTE(review): the original heading asked about *publishers* (e.g. EA in
# Sports), but the code below counts games per (Genre, Platform) pair.
# 1. Domination by number of games released
genre_stats = data[['Genre', 'Platform']].groupby(['Genre', 'Platform']).value_counts()
genre_stats = pd.DataFrame(genre_stats)
# Column 0 holds the raw counts until renamed below.
genre_stats.sort_values(['Genre',0], ascending=False, inplace=True)
genre_stats.reset_index(inplace=True)
genre_stats = genre_stats.rename(columns={'Genre': 'Genre', 'Platform': 'Platform', 0: 'count'})
# List of genres, alphabetically sorted (rebinds the earlier `genres` frame).
genres = data['Genre'].unique()
genres.sort()
# Summary table: the top 3 platforms (by release count) for every genre.
summary_data = []
for g in genres:
    gdata = genre_stats[genre_stats['Genre'] == g].head(3)
    for i, row in gdata.iterrows():
        summary_data.append({
            'Genre': g,
            'Platform': row['Platform'],
            'Games': row['count']
        })
detailed_df = pd.DataFrame(summary_data)
display(detailed_df)
Genre | Platform | Games | |
---|---|---|---|
0 | Action | PS3 | 380 |
1 | Action | PS2 | 348 |
2 | Action | DS | 343 |
3 | Adventure | DS | 240 |
4 | Adventure | PSP | 213 |
5 | Adventure | PS2 | 196 |
6 | Fighting | PS2 | 150 |
7 | Fighting | PS | 108 |
8 | Fighting | PS3 | 76 |
9 | Misc | DS | 393 |
10 | Misc | Wii | 280 |
11 | Misc | PS2 | 222 |
12 | Platform | GBA | 142 |
13 | Platform | PS2 | 103 |
14 | Platform | DS | 92 |
15 | Puzzle | DS | 238 |
16 | Puzzle | Wii | 55 |
17 | Puzzle | PSP | 44 |
18 | Racing | PS2 | 216 |
19 | Racing | PS | 145 |
20 | Racing | XB | 123 |
21 | Role-Playing | DS | 200 |
22 | Role-Playing | PSP | 192 |
23 | Role-Playing | PS2 | 187 |
24 | Shooter | X360 | 203 |
25 | Shooter | PS2 | 160 |
26 | Shooter | PS3 | 156 |
27 | Simulation | DS | 285 |
28 | Simulation | PC | 115 |
29 | Simulation | PS2 | 90 |
30 | Sports | PS2 | 400 |
31 | Sports | Wii | 261 |
32 | Sports | PS | 222 |
33 | Strategy | PC | 188 |
34 | Strategy | DS | 79 |
35 | Strategy | PS2 | 71 |
# Is there a correlation between NA_Sales and Global_Sales?
# NOTE(review): Global_Sales includes NA_Sales as a component, so a strong
# positive correlation is expected by construction.
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales')
plt.ylabel('Global Sales (in million)')
plt.grid(axis='both', linestyle='--', alpha=0.7)
# Calculate correlation coefficient and statistical significance
corr_coef, p_value = pearsonr(data['NA_Sales'], data['Global_Sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes,
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('Is there a correlation between NA_Sales and Global_Sales? \n')
print('Yes, there is a strong correlation between the two quantities.')
Is there a correlation between NA_Sales and Global_Sales? Yes, there is a strong correlation between the two quantities.
# Close-up of the same regression, restricted to the small-value region.
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales')
plt.ylabel('Global Sales (in million)')
plt.grid(axis='both', linestyle='--', alpha=0.7)
plt.xlim([0,5])
plt.ylim([0,9])
(0.0, 9.0)