In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import numpy as np
In [2]:
# Load CSV file
data = pd.read_csv("vgsales.csv")

Info about the data and cleansing

In [3]:
def df_info(df):
    """Display the basic info about the dataset"""
    
    pd.set_option('display.max_columns', None) # Display all the columns of the dataset
    
    print(f"\033[1m5 head rows of the data:\n\033[0m")
    display(df.head(5))
    
    print(f"\033[1mInfo about the data:\n\033[0m")
    display(df.info())
    
    print(f"\033[1mDescription of the data:\n\033[0m")
    display(df.describe().T)

def df_dupl_null(df):
    """
    Handle duplicates (dropping any found) and report the percentage of null values in each column.
    """

    # Checking out duplicates
    duplicates = df[df.duplicated()].shape[0]
    if duplicates==0:
        print(f"\033[1mThere are no duplicates in the DataFrame.\033[0m")
    else:
        print(f"\033[1mThere are {duplicates} duplicate rows in the DataFrame.\033[0m")
        df.drop_duplicates(keep='first', inplace=True, ignore_index=True)
        print("Duplicates dropped!")
    
    # Checking null values for each column
    pd.set_option('display.max_rows', None)
    rows = df.shape[0]
    print(f"\033[1m \nPercentage of null values in each column:\033[0m \n")
    return round((df.isnull().sum()/rows)*100,2)

def df_outliers(df):
    """
    Finding information about outliers in the df
    and returning it as a DataFrame
    """
    print(f"\033[1m Outlier detection: \033[0m ")
    
    # Get numeric columns excluding any columns with 'id' in their name
    numeric_cols = [col for col in df.select_dtypes(include='number').columns 
                if 'id' not in col.lower()]
    
    outliers_list = []
    # Outlier check for numeric columns
    for col in numeric_cols:
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (1.5 * iqr)
        upper_bound = q3 + (1.5 * iqr)
        outliers = df[(df[col] < lower_bound) | (df[col] > upper_bound)]
        outliers_list.append(len(outliers))
        
    rows = df.shape[0]
    # DataFrame for outliers
    outliers_df = pd.DataFrame(
        [[col, len_out, round(len_out / rows * 100, 1)] for col, len_out in zip(numeric_cols, outliers_list)],
        columns=['column_name', 'num_of_outliers', 'pctg_of_outliers']
    )

    return outliers_df

def df_summary(df):
    """
    Print a summary of the cleaned dataset (shape and unique value counts),
    dropping any column that holds only a single unique value.
    """
    print(f"\033[1mCleaning of the DataFrame completed!\033[0m")
    print(f"{df.shape[0]} rows and {df.shape[1]} columns remaining.")
    for column in df.columns:
        if df[column].nunique()==1:
            df.drop([column], inplace=True, axis=1)
        else:
            print(f"\033[1m- {column}:\033[0m {df[column].nunique()} unique values")
In [4]:
df_info(data)
First 5 rows of the data:

Rank Name Platform Year Genre Publisher NA_Sales EU_Sales JP_Sales Other_Sales Global_Sales
0 1 Wii Sports Wii 2006.0 Sports Nintendo 41.49 29.02 3.77 8.46 82.74
1 2 Super Mario Bros. NES 1985.0 Platform Nintendo 29.08 3.58 6.81 0.77 40.24
2 3 Mario Kart Wii Wii 2008.0 Racing Nintendo 15.85 12.88 3.79 3.31 35.82
3 4 Wii Sports Resort Wii 2009.0 Sports Nintendo 15.75 11.01 3.28 2.96 33.00
4 5 Pokemon Red/Pokemon Blue GB 1996.0 Role-Playing Nintendo 11.27 8.89 10.22 1.00 31.37
Info about the data:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16598 entries, 0 to 16597
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Rank          16598 non-null  int64  
 1   Name          16598 non-null  object 
 2   Platform      16598 non-null  object 
 3   Year          16327 non-null  float64
 4   Genre         16598 non-null  object 
 5   Publisher     16540 non-null  object 
 6   NA_Sales      16598 non-null  float64
 7   EU_Sales      16598 non-null  float64
 8   JP_Sales      16598 non-null  float64
 9   Other_Sales   16598 non-null  float64
 10  Global_Sales  16598 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 1.4+ MB
None
Description of the data:

count mean std min 25% 50% 75% max
Rank 16598.0 8300.605254 4791.853933 1.00 4151.25 8300.50 12449.75 16600.00
Year 16327.0 2006.406443 5.828981 1980.00 2003.00 2007.00 2010.00 2020.00
NA_Sales 16598.0 0.264667 0.816683 0.00 0.00 0.08 0.24 41.49
EU_Sales 16598.0 0.146652 0.505351 0.00 0.00 0.02 0.11 29.02
JP_Sales 16598.0 0.077782 0.309291 0.00 0.00 0.00 0.04 10.22
Other_Sales 16598.0 0.048063 0.188588 0.00 0.00 0.01 0.04 10.57
Global_Sales 16598.0 0.537441 1.555028 0.01 0.06 0.17 0.47 82.74
In [5]:
df_dupl_null(data)
There are no duplicates in the DataFrame.
 
Percentage of null values in each column: 

Out[5]:
Rank            0.00
Name            0.00
Platform        0.00
Year            1.63
Genre           0.00
Publisher       0.35
NA_Sales        0.00
EU_Sales        0.00
JP_Sales        0.00
Other_Sales     0.00
Global_Sales    0.00
dtype: float64
In [6]:
# Change the Year column type to integer and fill the nulls with zeros
data['Year'] = data['Year'].fillna(0).astype(int)

# Fill the nulls in Publisher column with 'Unknown'
data['Publisher'] = data['Publisher'].fillna('Unknown')
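
As a quick sanity check, the minimal sketch below (assuming Year and Publisher were the only columns with missing values, as the percentages above indicate) confirms that no nulls remain:

In [ ]:
# Verify that no missing values remain after the fills above
assert data.isnull().sum().sum() == 0, "There are still missing values in the DataFrame"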
In [7]:
df_outliers(data)
 Outlier detection:  
Out[7]:
column_name num_of_outliers pctg_of_outliers
0 Rank 0 0.0
1 Year 576 3.5
2 NA_Sales 1681 10.1
3 EU_Sales 2081 12.5
4 JP_Sales 2425 14.6
5 Other_Sales 1665 10.0
6 Global_Sales 1893 11.4
In [8]:
df_summary(data)
Cleaning of the DataFrame completed!
16598 rows and 11 columns remaining.
- Rank: 16598 unique values
- Name: 11493 unique values
- Platform: 31 unique values
- Year: 40 unique values
- Genre: 12 unique values
- Publisher: 578 unique values
- NA_Sales: 409 unique values
- EU_Sales: 305 unique values
- JP_Sales: 244 unique values
- Other_Sales: 157 unique values
- Global_Sales: 623 unique values

EDA

Descriptive analysis

In [9]:
# Global figure settings
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 13
In [10]:
# The most frequent game names, i.e., games released on many different platforms
top_names = pd.DataFrame(data['Name'].value_counts()).reset_index().rename(columns={'index': 'name', 'Name': 'count'})
# Adding a rank column to find top 3 names grouped by rank
top_names['rank'] = top_names['count'].rank(method='dense', 
                                            ascending=False)
# Selecting games with rank up to 3
top3_names = top_names[top_names['rank'] <= 3].sort_values(['rank', 'name'],
                                                                   ignore_index=True)
# Group by rank and aggregate the names
rank_groups = top3_names.groupby('rank').agg({
    'name': lambda x: ', '.join(sorted(x)),
    'count': 'first'  # All games at the same rank share the same count
}).reset_index()

print("🎮 GAME RELEASES FREQUENCY RANKINGS 🎮")
print("=" * 40)

for rank in sorted(top3_names['rank'].unique()):
    games_at_rank = top3_names[top3_names['rank'] == rank]
    count = games_at_rank['count'].iloc[0]
    game_names = games_at_rank['name'].tolist()
    
    print(f"\nRank {int(rank)} ({count} releases each):")
    for game in game_names:
        print(f"  • {game}")
🎮 GAME RELEASES FREQUENCY RANKINGS 🎮
========================================

Rank 1 (12 releases each):
  • Need for Speed: Most Wanted

Rank 2 (9 releases each):
  • FIFA 14
  • LEGO Marvel Super Heroes
  • Madden NFL 07
  • Ratatouille

Rank 3 (8 releases each):
  • Angry Birds Star Wars
  • Cars
  • FIFA 15
  • FIFA Soccer 13
  • LEGO Harry Potter: Years 5-7
  • LEGO Jurassic World
  • LEGO Star Wars II: The Original Trilogy
  • LEGO The Hobbit
  • Lego Batman 3: Beyond Gotham
  • Madden NFL 08
  • Monopoly
  • Terraria
  • The LEGO Movie Videogame
In [11]:
def create_hbarplot(data, x_col, y_col, title, x_label, y_label, xlim=None, 
                    figsize=(12,8), bar_labels=False, xscale='linear', lscale=1, order=None):
    """
    Create a horizontal bar plot with customized styling.
    
    Args:
        data: DataFrame containing the data
        x_col: Column name for x-axis values
        y_col: Column name for y-axis categories
        title: Plot title
        x_label: Label for the x-axis
        y_label: Label for the y-axis
        xlim: Optional tuple for x-axis limits (min, max)
        figsize: Optional tuple for figure size
        bar_labels: Optional, add value labels to the bars
        xscale: Optional, controls the bar-label placement depending on the x-axis scale
        lscale: Optional, a multiplier controlling the bar-label placement
        order: Optional. seaborn's barplot automatically sorts categorical data on the y-axis,
            so to keep the desired ordering, pass a list with the proper order
    """
    
    fig, ax = plt.subplots(figsize=figsize)
    plt.rcParams['font.family'] = 'serif'
    plt.rcParams['font.size'] = 13
    
    # Create the bar plot
    bars = sns.barplot(
        data=data, 
        x=x_col, 
        y=y_col,
        palette='plasma',
        edgecolor='gray',
        ax=ax,
        orient='h',
        order=order
    )
    
    # Customize the plot
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title, fontsize=18)
    ax.grid(axis='x', linestyle='--', alpha=0.9)
    plt.xscale(xscale)
    
    if xlim:
        ax.set_xlim(xlim)
    else:
        ax.set_xlim(0, max(data[x_col]) * 1.1)
        
    # Add labels to the bars
    if bar_labels:
        for index, row in data.iterrows():
            value = row[x_col]  # use the plotted column rather than hard-coding 'count'
            if xscale=='linear':
                if value<=1000:
                    posx = value+150*lscale
                else:
                    posx = value+200*lscale
            elif xscale=='log':
                if value<11:
                    posx = value*(1+0.2)
                elif 10<value<101:
                    posx = value*(1+0.35)
                elif 100<value<1001:
                    posx = value*(1+0.45)
                else:
                    posx = value*(1+0.55)
            ax.text(posx,
                    index,
                    str(round(value,2)),
                    va='center',
                    color='black',
                    ha='right')
        
    plt.tight_layout()
    
    return fig, ax
In [12]:
# Game platforms frequency, i.e., how many games were released on each platform
platforms = pd.DataFrame(data['Platform'].value_counts()).reset_index().rename(columns={'index': 'platform', 
                                                                                        'Platform': 'count'})

create_hbarplot(data=platforms, x_col='count', y_col='platform', 
            title='Distribution of the Gaming Platform Frequency',
            x_label='Frequency (Game Releases)', y_label='Gaming Platform',
            xlim=[0.6, max(platforms['count']) * 1.6],
            figsize=(15,10),
            bar_labels=True,
               xscale='log')
Out[12]:
(<Figure size 1500x1000 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Gaming Platform Frequency'}, xlabel='Frequency (Game Releases)', ylabel='Gaming Platform'>)
In [13]:
# Years frequency, i.e., how many games were released in each year
years = pd.DataFrame(data['Year'].value_counts()).reset_index().rename(columns={'index': 'year', 
                                                        'Year': 'count'}).sort_values('year').reset_index()
# Drop the row with unknown year (filled with 0 earlier)
years = years[years['year'] != 0].reset_index(drop=True)

create_hbarplot(data=years, x_col='count', y_col='year', 
            title='Distribution of the Games Releases over Years',
            x_label='Frequency (Game Releases)', y_label='Year',
            xlim=[0.6, max(years['count']) * 1.8],
            figsize=(10,12),
            bar_labels=True,
               xscale='log')
Out[13]:
(<Figure size 1000x1200 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases over Years'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
In [14]:
# Genre frequency, i.e., what game types are most popular
genres = pd.DataFrame(data['Genre'].value_counts()).reset_index().rename(columns={'index': 'genre', 
                                                        'Genre': 'count'})

create_hbarplot(data=genres, x_col='count', y_col='genre', 
            title='Distribution of the Games Releases by Genre',
            x_label='Frequency (Game Releases)', y_label='Genre',
            xlim=[150, max(genres['count']) * 1.1],
            figsize=(12,8),
            bar_labels=True,
               xscale='linear')
Out[14]:
(<Figure size 1200x800 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases by Genre'}, xlabel='Frequency (Game Releases)', ylabel='Year'>)
In [15]:
# Publisher frequency, i.e., which company released most games
publishers = pd.DataFrame(data['Publisher'].value_counts()).reset_index().rename(columns={'index': 'publisher', 
                                                        'Publisher': 'count'}).head(10)

create_hbarplot(data=publishers, x_col='count', y_col='publisher', 
            title='Distribution of the Games Releases by Top 10 Publishers',
            x_label='Frequency (Game Releases)', y_label='Publisher',
            xlim=[300, max(publishers['count']) * 1.1],
            figsize=(12,6),
            bar_labels=True,
            xscale='linear',
            lscale=0.4)
Out[15]:
(<Figure size 1200x600 with 1 Axes>,
 <AxesSubplot:title={'center':'Distribution of the Games Releases by Top 10 Publishers'}, xlabel='Frequency (Game Releases)', ylabel='Publisher'>)
In [16]:
# Sales statistics in a given region
sales_regions = ['NA_Sales', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales']

for region in sales_regions:
    sales = pd.DataFrame(data[region])
    
    display(sales.describe().T)
count mean std min 25% 50% 75% max
NA_Sales 16598.0 0.264667 0.816683 0.0 0.0 0.08 0.24 41.49
count mean std min 25% 50% 75% max
EU_Sales 16598.0 0.146652 0.505351 0.0 0.0 0.02 0.11 29.02
count mean std min 25% 50% 75% max
JP_Sales 16598.0 0.077782 0.309291 0.0 0.0 0.0 0.04 10.22
count mean std min 25% 50% 75% max
Other_Sales 16598.0 0.048063 0.188588 0.0 0.0 0.01 0.04 10.57
count mean std min 25% 50% 75% max
Global_Sales 16598.0 0.537441 1.555028 0.01 0.06 0.17 0.47 82.74
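
For reference, the same per-region summary can be produced in a single call by selecting all the sales columns at once; a minimal equivalent sketch:

In [ ]:
# One combined summary table instead of one display() call per region
data[sales_regions].describe().T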

Exploratory analysis

In [17]:
# The best selling games by world region
region_names = {'NA_Sales': 'North America', 'EU_Sales': 'Europe', 'JP_Sales': 'Japan',
                'Other_Sales': 'Other Regions', 'Global_Sales': 'Globally'}
for region in sales_regions:
    # Top 5 games
    region_data = data[['Name','Year',region]].sort_values(region, ascending=False).reset_index(drop=True).head(5)
    plot_title = 'The Best Selling Games: ' + region_names[region]
    
    fig, ax = create_hbarplot(data=region_data, x_col=region, y_col='Name', 
            title=plot_title,
            x_label='Sales (in millions)', y_label='Game (Year of Release)',
            xlim=[2, max(region_data[region]) * 1.05],
            figsize=(8,4))

    yticks = list()
    for index, row in region_data.iterrows():
        yticks += [str(row['Name']) + ' (' + str(row['Year']) + ')']
        ax.text(row[region],
                    index,
                    str(round(row[region],2)),
                    va='center',
                    color='white',
                    ha='right',
                    weight='bold')
    
    ax.set_yticks(ticks=list(region_data.index), labels=yticks)
In [18]:
# The best selling features like: platform, genre, year, publisher (globally)
features = ['Platform', 'Genre', 'Year', 'Publisher']
g = 'Global_Sales' # for easier retrieval

for f in features:
    f_data = data[[f, g]].groupby(f).sum().sort_values(g, ascending=False).reset_index().head(5)
    plot_title = 'Top sellers: ' + f
    
    fig, ax = create_hbarplot(data=f_data, x_col=g, y_col=f, 
            title=plot_title,
            x_label='Sales (in millions)', y_label=f,
            xlim=[min(f_data[g])*0.3, max(f_data[g]) * 1.05],
            figsize=(8,4),
            order=f_data[f].tolist())

    for index, row in f_data.iterrows():
        ax.text(row[g],
                index,
                str(round(row[g],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')

Analysis of the last 5 years in Europe: what type of game is worth creating?

In [19]:
# Games released within 5 years of the most recent year in the data
recent_data = data[data['Year'] >= (max(data['Year']) - 5)]
In [20]:
# The best selling "features" of the last 5 years
EUfeatures = ['Name', 'Platform', 'Genre', 'Publisher']
# Dictionary to store the relevant data for each top 5 features
EUdata = {}
for f in EUfeatures:
    EUdata[f] = recent_data[[f,
            'EU_Sales']].groupby(f).sum().sort_values('EU_Sales',ascending=False).reset_index().head(5)
    
    fig, ax = create_hbarplot(data=EUdata[f], x_col='EU_Sales', y_col=f, 
            title='The Best Selling ' + f +'s',
            x_label='Sales (in millions)', y_label=f,
            xlim=[0, max(EUdata[f]['EU_Sales']) * 1.05],
            figsize=(10,4))
    
    for index, row in EUdata[f].iterrows():
        ax.text(row['EU_Sales']-1,
                index,
                str(round(row['EU_Sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
In [21]:
# Time trends of the features: how long a game has been out vs. its sales
now = 2020
# Delete the rows with no year information
filtered_data = data[data['Year'] != 0].copy()
# New column - game age
filtered_data['game_age'] = now - filtered_data['Year']

# Distribution of the game ages
plt.figure(figsize=(12,4))
sns.histplot(filtered_data['game_age'], color='purple', binwidth=1, alpha=0.5)
plt.xlabel('Game Age')
plt.grid(axis='y', linestyle='--', alpha=0.6)
plt.title('Distribution of the Game Ages')
Out[21]:
Text(0.5, 1.0, 'Distribution of the Game Ages')
In [22]:
# New column: average EU sales per year since release (EU_Sales divided by game_age)
filtered_data['EU_mean_sales'] = filtered_data['EU_Sales'] / filtered_data['game_age']

meandata = {}
for f in EUfeatures:
    meandata[f] = filtered_data[[f, 
        'game_age', 'EU_mean_sales']].groupby(f).sum().sort_values('EU_mean_sales',ascending=False).reset_index().head(5)
    
    fig, ax = create_hbarplot(data=meandata[f], x_col='EU_mean_sales', y_col=f, 
            title='The Best Selling ' + f +'s',
            x_label='Mean Sales (in millions)', y_label=f,
            xlim=[0, max(meandata[f]['EU_mean_sales']) * 1.05],
            figsize=(10,4))
    
    for index, row in meandata[f].iterrows():
        ax.text(row['EU_mean_sales'],
                index,
                str(round(row['EU_mean_sales'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
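
As a worked check of the EU_mean_sales definition, the sketch below looks up Grand Theft Auto V on PS3: its 9.27 million EU sales over an age of 7 years give roughly 1.32 million per year, matching the value shown for it in the recent-hits table below.

In [ ]:
# Worked check of EU_mean_sales for a single title:
# EU_Sales 9.27 / game_age 7 ≈ 1.32 million copies per year
gta_ps3 = filtered_data[(filtered_data['Name'] == 'Grand Theft Auto V') &
                        (filtered_data['Platform'] == 'PS3')]
gta_ps3[['Name', 'Platform', 'EU_Sales', 'game_age', 'EU_mean_sales']]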
In [23]:
meanyears = filtered_data[['EU_mean_sales', 'game_age']].groupby('game_age').mean().reset_index()

plt.figure(figsize=(10,4))
sns.lineplot(data=meanyears, x='game_age', y='EU_mean_sales', color='purple', marker='o', ci=None)
plt.xlabel("Game's Age")
plt.ylabel('Mean Sales in Europe (in millions)')
plt.title("Distribution of Mean Sales by the Game's Age", fontsize=18, weight='bold')
plt.grid(axis='both', linestyle='--', alpha=0.6)

print('There are two groups that sell well: recent hits (age < 10) and old classics (age > 25).')
There are two groups that sell well: recent hits (age < 10) and old classics (age > 25).
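
A minimal sketch that quantifies this observation, banding games by age and comparing mean per-year EU sales across the bands:

In [ ]:
# Mean per-year EU sales per age band (bands chosen to bracket the two groups above)
age_bands = pd.cut(filtered_data['game_age'], bins=[-1, 10, 24, 40],
                   labels=['recent hits (<=10)', 'in between (11-24)', 'old classics (>=25)'])
filtered_data.groupby(age_bands)['EU_mean_sales'].mean()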
In [24]:
# Identifying the recent hits
recent_hits = filtered_data[filtered_data['game_age'] <=10 ][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(recent_hits)
Name Platform Year Genre Publisher game_age EU_mean_sales
16 Grand Theft Auto V PS3 2013 Action Take-Two Interactive 7 1.324286
77 FIFA 16 PS4 2015 Sports Electronic Arts 5 1.212000
33 Call of Duty: Black Ops 3 PS4 2015 Shooter Activision 5 1.162000
44 Grand Theft Auto V PS4 2014 Action Take-Two Interactive 6 0.968333
221 FIFA 17 PS4 2016 Sports Electronic Arts 4 0.937500
23 Grand Theft Auto V X360 2013 Action Take-Two Interactive 7 0.758571
34 Call of Duty: Black Ops II PS3 2012 Shooter Activision 8 0.735000
124 FIFA 15 PS4 2014 Sports Electronic Arts 6 0.715000
92 Star Wars Battlefront (2015) PS4 2015 Shooter Electronic Arts 5 0.658000
37 Call of Duty: Modern Warfare 3 PS3 2011 Shooter Activision 9 0.646667
In [25]:
# Identifying the old classics
old_classics = filtered_data[filtered_data['game_age'] >= 25 ][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(old_classics)
Name Platform Year Genre Publisher game_age EU_mean_sales
18 Super Mario World SNES 1990 Platform Nintendo 30 0.125000
22 Super Mario Bros. 3 NES 1988 Platform Nintendo 32 0.107500
521 Myst PC 1994 Adventure Red Orb 26 0.107308
1 Super Mario Bros. NES 1985 Platform Nintendo 35 0.102286
267 Warcraft II: Tides of Darkness PC 1995 Strategy Activision 25 0.090800
21 Super Mario Land GB 1989 Platform Nintendo 31 0.087419
57 Super Mario All-Stars SNES 1993 Platform Nintendo 27 0.079630
5 Tetris GB 1989 Puzzle Nintendo 31 0.072903
50 Super Mario Land 2: 6 Golden Coins GB 1992 Adventure Nintendo 28 0.072857
71 Donkey Kong Country SNES 1994 Platform Nintendo 26 0.065769
In [26]:
# Middle age games
middle_age = filtered_data[filtered_data['game_age'].isin(list(range(16,25)))][['Name', 'Platform','Year', 'Genre','Publisher',
    'game_age', 'EU_mean_sales']].sort_values('EU_mean_sales', ascending=False).head(10)

display(middle_age)
Name Platform Year Genre Publisher game_age EU_mean_sales
137 World of Warcraft PC 2004 Role-Playing Activision 16 0.388125
4 Pokemon Red/Pokemon Blue GB 1996 Role-Playing Nintendo 24 0.370417
24 Grand Theft Auto: Vice City PS2 2002 Action Take-Two Interactive 18 0.305000
12 Pokemon Gold/Pokemon Silver GB 1999 Role-Playing Nintendo 21 0.294286
28 Gran Turismo 3: A-Spec PS2 2001 Racing Sony Computer Entertainment 19 0.267895
38 Grand Theft Auto III PS2 2001 Action Take-Two Interactive 19 0.237368
30 Pokémon Yellow: Special Pikachu Edition GB 1998 Role-Playing Nintendo 22 0.229091
25 Pokemon Ruby/Pokemon Sapphire GBA 2002 Role-Playing Nintendo 18 0.216667
59 Super Mario 64 DS 2004 Platform Nintendo 16 0.194375
113 Need for Speed Underground 2 PS2 2004 Racing Electronic Arts 16 0.188750
In [27]:
print("To decide what game one should produce, based on recent years, let's take one more look at the recent_hits DataFrame:")
display(recent_hits)
print('The favorable features are:\n')
print('Platform: PS4 or PS3\n')
print('Genre: Shooter, Action or Sports\n')
print('Publisher: Electronic Arts, Take-Two Interactive or Activision')
To decide what game one should produce, based on recent years, let's take one more look at the recent_hits DataFrame:
Name Platform Year Genre Publisher game_age EU_mean_sales
16 Grand Theft Auto V PS3 2013 Action Take-Two Interactive 7 1.324286
77 FIFA 16 PS4 2015 Sports Electronic Arts 5 1.212000
33 Call of Duty: Black Ops 3 PS4 2015 Shooter Activision 5 1.162000
44 Grand Theft Auto V PS4 2014 Action Take-Two Interactive 6 0.968333
221 FIFA 17 PS4 2016 Sports Electronic Arts 4 0.937500
23 Grand Theft Auto V X360 2013 Action Take-Two Interactive 7 0.758571
34 Call of Duty: Black Ops II PS3 2012 Shooter Activision 8 0.735000
124 FIFA 15 PS4 2014 Sports Electronic Arts 6 0.715000
92 Star Wars Battlefront (2015) PS4 2015 Shooter Electronic Arts 5 0.658000
37 Call of Duty: Modern Warfare 3 PS3 2011 Shooter Activision 9 0.646667
The favorable features are:

Platform: PS4 or PS3

Genre: Shooter, Action or Sports

Publisher: Electronic Arts, Take-Two Interactive or Activision
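
As a rough cross-check of this recommendation, a minimal sketch that summarizes per-year EU sales for recent games combining the suggested platforms, genres and publishers:

In [ ]:
# Per-year EU sales of recent games matching the suggested feature combination
fav = filtered_data[(filtered_data['game_age'] <= 10) &
                    (filtered_data['Platform'].isin(['PS4', 'PS3'])) &
                    (filtered_data['Genre'].isin(['Shooter', 'Action', 'Sports'])) &
                    (filtered_data['Publisher'].isin(['Electronic Arts',
                                                      'Take-Two Interactive', 'Activision']))]
fav['EU_mean_sales'].describe()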

Other Questions

In [98]:
# Which platform had the most titles released per year?

# Platforms with total game releases, active years (with start and end year) and total sales
platform_stats = data[data['Year'] != 0].groupby('Platform').agg({
    'Year': ['count', 'nunique', 'min', 'max'], 'Global_Sales': 'sum'
}).reset_index()
# Flatten column names
platform_stats.columns = ['Platform', 'total_games', 'active_years', 'first_year', 'last_year', 'total_sales']
# Calculate average games per year
platform_stats['avg_games_per_year'] = round(platform_stats['total_games'] / platform_stats['active_years'], 2)
platform_stats.sort_values('avg_games_per_year', ascending=False, inplace=True)
platform_stats.reset_index(drop=True, inplace=True)
platform_stats
Out[98]:
Platform total_games active_years first_year last_year total_sales avg_games_per_year
0 PS2 2127 12 2000 2011 1233.46 177.25
1 DS 2133 13 1985 2020 818.96 164.08
2 Wii 1290 10 2006 2015 909.81 129.00
3 PS 1189 10 1994 2003 727.39 118.90
4 PS3 1304 11 2006 2016 949.35 118.55
5 X360 1235 12 2005 2016 969.61 102.92
6 GBA 811 8 2000 2007 313.56 101.38
7 PSP 1197 12 2004 2015 291.71 99.75
8 XB 803 9 2000 2008 252.09 89.22
9 3DS 500 6 2011 2016 246.28 83.33
10 GC 542 7 2001 2007 197.14 77.43
11 PS4 336 5 2013 2017 278.10 67.20
12 PSV 412 7 2011 2017 61.63 58.86
13 XOne 213 4 2013 2016 141.06 53.25
14 N64 316 7 1996 2002 218.21 45.14
15 PC 943 26 1985 2016 255.05 36.27
16 SAT 173 6 1994 1999 33.59 28.83
17 WiiU 143 5 2012 2016 81.86 28.60
18 SNES 239 10 1990 1999 200.05 23.90
19 2600 116 10 1980 1989 86.57 11.60
20 NES 98 12 1983 1994 251.07 8.17
21 GB 97 13 1988 2001 254.42 7.46
22 DC 52 7 1998 2008 15.97 7.43
23 GEN 27 5 1990 1994 28.36 5.40
24 SCD 6 2 1993 1994 1.87 3.00
25 NG 12 4 1993 1996 1.44 3.00
26 WS 6 3 1999 2001 1.42 2.00
27 TG16 2 1 1995 1995 0.16 2.00
28 3DO 3 2 1994 1995 0.10 1.50
29 GG 1 1 1992 1992 0.04 1.00
30 PCFX 1 1 1996 1996 0.03 1.00
In [95]:
# Top 10 platforms by game releases per year
fig, ax = create_hbarplot(data=platform_stats.head(10), x_col='avg_games_per_year', y_col='Platform', 
            title='Average Game Releases of Platforms per Year (Top 10)',
            x_label='Game Releases per Year', y_label='Platform',
            xlim=[0, max(platform_stats.head(10)['avg_games_per_year']) * 1.05],
            figsize=(10,6))
    
for index, row in platform_stats.head(10).iterrows():
    ax.text(row['avg_games_per_year']-1,
                index,
                str(round(row['avg_games_per_year'],2)),
                va='center',
                color='white',
                ha='right',
                weight='bold')
In [133]:
# Do platforms with more game releases always have higher total sales?
plt.figure(figsize=(6,5))
sns.regplot(x=platform_stats['avg_games_per_year'], y=platform_stats['total_sales'], color='purple')
plt.title('Platform Sales vs Game Release Rate', fontsize=18, weight='bold')
plt.xlabel('Average Games Released per Year')
plt.ylabel('Total Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Calculate correlation coefficient and statistical significance
from scipy.stats import pearsonr
corr_coef, p_value = pearsonr(platform_stats['avg_games_per_year'], platform_stats['total_sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes, 
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('So, do platforms with more game releases always have higher total sales? \n')
print('Not strictly always, but there is a strong positive correlation between the two quantities.')
So, do platforms with more game releases always have higher total sales? 

Not strictly always, but there is a strong positive correlation between the two quantities.
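
Since the sales distribution is heavy-tailed, a rank-based correlation is a useful robustness check; a minimal sketch:

In [ ]:
# Rank-based (Spearman) check, less sensitive to the heavy tail of the sales distribution
from scipy.stats import spearmanr
rho, p = spearmanr(platform_stats['avg_games_per_year'], platform_stats['total_sales'])
print(f'Spearman correlation: {rho:.3f}, p-value: {p:.3f}')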
In [142]:
# Are there genre-platform combinations that are especially successful?

combined_stats = data.groupby(['Platform',
    'Genre']).agg({'Global_Sales': 'sum'}).sort_values('Global_Sales', ascending=False).reset_index().head(10)
combined_stats
Out[142]:
Platform Genre Global_Sales
0 PS3 Action 307.88
1 Wii Sports 292.06
2 X360 Shooter 278.55
3 PS2 Sports 273.41
4 PS2 Action 272.76
5 X360 Action 242.67
6 Wii Misc 221.06
7 PS3 Shooter 196.04
8 PS2 Racing 156.28
9 X360 Sports 139.12
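
The top-10 list above picks out the largest cells of the full genre-by-platform grid; a minimal sketch of that grid as a heatmap:

In [ ]:
# Full Genre x Platform grid of global sales; the table above lists its largest cells
sales_grid = data.pivot_table(index='Genre', columns='Platform',
                              values='Global_Sales', aggfunc='sum', fill_value=0)
plt.figure(figsize=(14, 6))
sns.heatmap(sales_grid, cmap='plasma', linewidths=0.3)
plt.title('Global Sales by Genre and Platform', fontsize=18)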
In [220]:
# Which publishers dominate certain genres (e.g., EA in Sports)?

# 1. Domination by number of games released
# Note: the table below breaks each genre down by platform; a publisher-level view follows after it
genre_stats = data.groupby(['Genre', 'Platform']).size().rename('count').reset_index()
genre_stats.sort_values(['Genre', 'count'], ascending=[True, False], inplace=True)

# List of genres
genres = data['Genre'].unique()
genres.sort()
# Create a detailed summary table
summary_data = []
for g in genres:
    gdata = genre_stats[genre_stats['Genre'] == g].head(3)
    for i, row in gdata.iterrows():
        summary_data.append({
            'Genre': g, 
            'Platform': row['Platform'], 
            'Games': row['count']
        })

detailed_df = pd.DataFrame(summary_data)
display(detailed_df)
Genre Platform Games
0 Action PS3 380
1 Action PS2 348
2 Action DS 343
3 Adventure DS 240
4 Adventure PSP 213
5 Adventure PS2 196
6 Fighting PS2 150
7 Fighting PS 108
8 Fighting PS3 76
9 Misc DS 393
10 Misc Wii 280
11 Misc PS2 222
12 Platform GBA 142
13 Platform PS2 103
14 Platform DS 92
15 Puzzle DS 238
16 Puzzle Wii 55
17 Puzzle PSP 44
18 Racing PS2 216
19 Racing PS 145
20 Racing XB 123
21 Role-Playing DS 200
22 Role-Playing PSP 192
23 Role-Playing PS2 187
24 Shooter X360 203
25 Shooter PS2 160
26 Shooter PS3 156
27 Simulation DS 285
28 Simulation PC 115
29 Simulation PS2 90
30 Sports PS2 400
31 Sports Wii 261
32 Sports PS 222
33 Strategy PC 188
34 Strategy DS 79
35 Strategy PS2 71
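
The table above breaks each genre down by platform; a minimal sketch of the publisher view the opening question asks about (e.g., EA in Sports), by number of games released:

In [ ]:
# Top publisher per genre by number of games released
pub_counts = data.groupby(['Genre', 'Publisher']).size().rename('games').reset_index()
top_pub_per_genre = pub_counts.loc[pub_counts.groupby('Genre')['games'].idxmax()].reset_index(drop=True)
top_pub_per_genre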
In [236]:
# Is there a correlation between NA_Sales and Global_Sales?
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('Global Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)

# Calculate correlation coefficient and statistical significance
corr_coef, p_value = pearsonr(data['NA_Sales'], data['Global_Sales'])
plt.text(0.05, 0.85, f'correlation: {corr_coef:.3f} \np-value: {p_value:.3f}', transform=plt.gca().transAxes, 
         fontsize=13, bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
print('Is there a correlation between NA_Sales and Global_Sales? \n')
print('Yes, there is a strong positive correlation, which is expected since North American sales are a component of global sales.')
Is there a correlation between NA_Sales and Global_Sales? 

Yes, there is a strong positive correlation, which is expected since North American sales are a component of global sales.
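
For context, a minimal sketch comparing how strongly each regional sales column correlates with Global_Sales:

In [ ]:
# Correlation of each sales column with Global_Sales
data[sales_regions].corr()['Global_Sales'].sort_values(ascending=False)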
In [247]:
# Close-up for smaller values
plt.figure(figsize=(15,5))
sns.regplot(x=data['NA_Sales'], y=data['Global_Sales'], color='purple')
plt.title('North America Sales vs Global Sales', fontsize=18, weight='bold')
plt.xlabel('NA Sales (in millions)')
plt.ylabel('Global Sales (in millions)')
plt.grid(axis='both', linestyle='--', alpha=0.7)
plt.xlim([0,5])
plt.ylim([0,9])
Out[247]:
(0.0, 9.0)
In [ ]: