In [82]:
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import re
import numpy as np
import ast
In [2]:
conn = sqlite3.connect("florence_airbnb.db")

# Load cleaned CSV files
listings = pd.read_csv("listings_cleaned.csv", parse_dates=True)
calendar = pd.read_csv("calendar_cleaned.csv", parse_dates=True)
reviews = pd.read_csv("reviews_cleaned.csv", parse_dates=True)

listings.to_sql("listings", conn, if_exists="replace", index=False)
calendar.to_sql("calendar", conn, if_exists="replace", index=False)
reviews.to_sql("reviews", conn, if_exists="replace", index=False)
Out[2]:
904945
In [3]:
# PRAGMA queries for basic information about the data
pd.set_option('display.max_rows', 100)
tables = ['listings', 'calendar', 'reviews']
for table in tables:
    print(f"\nTable: {table}")
    df = pd.read_sql_query(f"PRAGMA table_info({table})", conn)
    display(df)
Table: listings
cid name type notnull dflt_value pk
0 0 id INTEGER 0 None 0
1 1 last_scraped TEXT 0 None 0
2 2 source TEXT 0 None 0
3 3 name TEXT 0 None 0
4 4 description TEXT 0 None 0
5 5 neighborhood_overview TEXT 0 None 0
6 6 host_id INTEGER 0 None 0
7 7 host_name TEXT 0 None 0
8 8 host_since TEXT 0 None 0
9 9 host_location TEXT 0 None 0
10 10 host_about TEXT 0 None 0
11 11 host_response_time TEXT 0 None 0
12 12 host_response_rate TEXT 0 None 0
13 13 host_acceptance_rate TEXT 0 None 0
14 14 host_is_superhost TEXT 0 None 0
15 15 host_neighbourhood TEXT 0 None 0
16 16 host_listings_count INTEGER 0 None 0
17 17 host_total_listings_count INTEGER 0 None 0
18 18 host_verifications TEXT 0 None 0
19 19 host_has_profile_pic TEXT 0 None 0
20 20 host_identity_verified TEXT 0 None 0
21 21 neighbourhood TEXT 0 None 0
22 22 neighbourhood_cleansed TEXT 0 None 0
23 23 latitude REAL 0 None 0
24 24 longitude REAL 0 None 0
25 25 property_type TEXT 0 None 0
26 26 room_type TEXT 0 None 0
27 27 accommodates INTEGER 0 None 0
28 28 bathrooms REAL 0 None 0
29 29 bathrooms_text TEXT 0 None 0
30 30 bedrooms REAL 0 None 0
31 31 beds REAL 0 None 0
32 32 amenities TEXT 0 None 0
33 33 price REAL 0 None 0
34 34 minimum_nights INTEGER 0 None 0
35 35 maximum_nights INTEGER 0 None 0
36 36 minimum_minimum_nights INTEGER 0 None 0
37 37 maximum_minimum_nights INTEGER 0 None 0
38 38 minimum_maximum_nights INTEGER 0 None 0
39 39 maximum_maximum_nights INTEGER 0 None 0
40 40 minimum_nights_avg_ntm REAL 0 None 0
41 41 maximum_nights_avg_ntm REAL 0 None 0
42 42 availability_30 INTEGER 0 None 0
43 43 availability_60 INTEGER 0 None 0
44 44 availability_90 INTEGER 0 None 0
45 45 availability_365 INTEGER 0 None 0
46 46 calendar_last_scraped TEXT 0 None 0
47 47 number_of_reviews INTEGER 0 None 0
48 48 number_of_reviews_ltm INTEGER 0 None 0
49 49 number_of_reviews_l30d INTEGER 0 None 0
50 50 availability_eoy INTEGER 0 None 0
51 51 number_of_reviews_ly INTEGER 0 None 0
52 52 estimated_occupancy_l365d INTEGER 0 None 0
53 53 estimated_revenue_l365d REAL 0 None 0
54 54 first_review TEXT 0 None 0
55 55 last_review TEXT 0 None 0
56 56 review_scores_rating REAL 0 None 0
57 57 review_scores_accuracy REAL 0 None 0
58 58 review_scores_cleanliness REAL 0 None 0
59 59 review_scores_checkin REAL 0 None 0
60 60 review_scores_communication REAL 0 None 0
61 61 review_scores_location REAL 0 None 0
62 62 review_scores_value REAL 0 None 0
63 63 license TEXT 0 None 0
64 64 instant_bookable TEXT 0 None 0
65 65 calculated_host_listings_count INTEGER 0 None 0
66 66 calculated_host_listings_count_entire_homes INTEGER 0 None 0
67 67 calculated_host_listings_count_private_rooms INTEGER 0 None 0
68 68 calculated_host_listings_count_shared_rooms INTEGER 0 None 0
69 69 reviews_per_month REAL 0 None 0
Table: calendar
cid name type notnull dflt_value pk
0 0 listing_id INTEGER 0 None 0
1 1 date TEXT 0 None 0
2 2 available INTEGER 0 None 0
3 3 price REAL 0 None 0
4 4 minimum_nights REAL 0 None 0
5 5 maximum_nights REAL 0 None 0
Table: reviews
cid name type notnull dflt_value pk
0 0 listing_id INTEGER 0 None 0
1 1 id INTEGER 0 None 0
2 2 date TEXT 0 None 0
3 3 reviewer_id INTEGER 0 None 0
4 4 reviewer_name TEXT 0 None 0
5 5 comments TEXT 0 None 0

Price-Focused Analysis of Available Listings

Distribution of Rental Price Ranges

In [74]:
q1 = """WITH top_listings AS (
    SELECT 
        listing_id, 
        ROUND(AVG(price), 0) AS avg_price 
    FROM calendar 
    WHERE available = True 
    GROUP BY listing_id 
    ORDER BY avg_price DESC 
) 
 
SELECT 
    COUNT(listing_id) AS num_of_listings, 
    CASE WHEN avg_price >= 10000 THEN '>=10k'
        WHEN avg_price >= 5000 THEN '5k-10k'
        WHEN avg_price >= 1000 THEN '1k-5k'
        WHEN avg_price >= 500 THEN '500-1k'
        WHEN avg_price >= 100 THEN '100-500'
        ELSE '<100' END AS price_bucket
FROM top_listings
GROUP BY 2 
ORDER BY 
    CASE price_bucket
        WHEN '>=10k' THEN 1
        WHEN '5k-10k' THEN 2
        WHEN '1k-5k' THEN 3
        WHEN '500-1k' THEN 4
        WHEN '100-500' THEN 5
        ELSE 6
    END
"""

listings_price = pd.read_sql_query(q1, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=listings_price['price_bucket'], y=listings_price['num_of_listings'], 
            palette='RdPu', edgecolor='black', linewidth=0.5)

plt.yscale('log', base=10)
plt.xlabel('Listing Price Range per Night ($)', labelpad=20)
plt.ylabel('Number of Listings')
plt.title('Distribution of Rental Price Ranges', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

yticks = [50, 100, 500, 1000, 5000]
ylabels = [str(y) for y in yticks]
plt.yticks(yticks, labels=ylabels)

# Adding labels to the bars
for index, row in listings_price.iterrows():
        plt.text(index,
            row['num_of_listings']*(1+0.2),
            str(round(row['num_of_listings'],2)),
            va='center',
            color='black',
            ha='center'
            )
# Adding space for the labels
plt.ylim(30, max(listings_price['num_of_listings']) * 1.7)
        
plt.tight_layout()
#plt.show()

plt.savefig('price_ranges.png')

Average Price of Rent per Month

In [20]:
q2 = """
SELECT 
    strftime('%Y-%m', date) AS month, 
    ROUND(AVG(price), 2) AS avg_price 
FROM calendar 
WHERE price IS NOT NULL AND available = True 
GROUP BY month 
ORDER BY month
"""

monthly_price = pd.read_sql_query(q2, conn)

# Formatting the date
monthly_price['formatted_month'] = pd.to_datetime(monthly_price['month']).dt.strftime('%m.%y')

# Plotting the results
plt.figure(figsize=(20,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26

sns.lineplot(x=monthly_price['formatted_month'], y=monthly_price['avg_price'],
             color='purple', marker='o', linewidth=3, markersize=10)

plt.xlabel('Date (month.year)', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Average Price of Rent by Month', fontsize=32)
plt.grid(True, alpha=0.3)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_month.png')

Distribution of Rental Prices by Room Type

In [75]:
q3 = """
SELECT 
    l.room_type AS room_type,
    ROUND(AVG(c.price), 2) AS avg_price
FROM calendar c
JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""

room_price = pd.read_sql_query(q3, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=room_price['room_type'], y=room_price['avg_price'],
           palette='RdPu', edgecolor='black', linewidth=0.5)

plt.xlabel('Room Type', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Room Type', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Adding labels to the bars
for index, row in room_price.iterrows():
        plt.text(index,
            row['avg_price']+20,
            str(round(row['avg_price'],2)),
            va='center',
            color='black',
            ha='center'
            )
        
# Adding space for the labels
plt.ylim(50, max(room_price['avg_price']) * 1.1)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_roomtype.png')
In [ ]:
 

Distribution of Rental Prices by Neighbourhood

In [76]:
q4 = """
SELECT
    l.neighbourhood_cleansed AS nieghbourhood,
    AVG(c.price) AS avg_price
FROM calendar c
    JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""

neighb_price = pd.read_sql_query(q4, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=neighb_price['nieghbourhood'], y=neighb_price['avg_price'],
           palette='RdPu', edgecolor='black', linewidth=0.5)

plt.xlabel('Nieghbourhood', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Neighbourhood', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Adjust x ticks labels
xlabels = []
for label in neighb_price['nieghbourhood']:
    xlabels.append(re.sub(r"\s",r"\n", label))
positions = range(len(xlabels))
plt.xticks(positions, xlabels)

# Adding labels to the bars
for index, row in neighb_price.iterrows():
        plt.text(index,
            row['avg_price']+10,
            str(round(row['avg_price'],2)),
            va='center',
            color='black',
            ha='center'
            )
# Adding space for the labels
plt.ylim(300, max(neighb_price['avg_price']) * 1.05)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_neighbourhood.png')

Distribution of Rental Prices by Room Type and Neighbourhood

In [78]:
q5 = """
SELECT 
    l.room_type AS "Room Type",
    l.neighbourhood_cleansed AS Nieghbourhood,
    ROUND(AVG(l.accommodates),1) AS "Accommodates (avg)",
    COUNT(l.id) AS "Num. of Listings",
    ROUND(AVG(c.price),2) AS "Price per Night (avg)"
FROM listings l
    JOIN calendar c ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1, 2
ORDER BY 1, 5 DESC
"""

room_neighb_price = pd.read_sql_query(q5, conn)
room_neighb_price
Out[78]:
Room Type Nieghbourhood Accommodates (avg) Num. of Listings Price per Night (avg)
0 Entire home/apt Isolotto Legnaia 4.3 89879 643.13
1 Entire home/apt Campo di Marte 4.1 192669 564.73
2 Entire home/apt Centro Storico 4.2 1559858 556.27
3 Entire home/apt Gavinana Galluzzo 4.2 71756 459.72
4 Entire home/apt Rifredi 4.1 163307 385.66
5 Hotel room Centro Storico 2.4 8688 372.94
6 Hotel room Gavinana Galluzzo 2.6 1821 263.56
7 Hotel room Rifredi 2.0 313 128.11
8 Hotel room Campo di Marte 2.0 906 65.71
9 Private room Centro Storico 2.3 218473 403.93
10 Private room Rifredi 2.4 47879 275.99
11 Private room Isolotto Legnaia 2.1 20926 215.32
12 Private room Campo di Marte 2.1 53124 202.14
13 Private room Gavinana Galluzzo 2.2 18049 133.22
14 Shared room Centro Storico 1.0 626 999.00
15 Shared room Campo di Marte 2.0 329 55.00
16 Shared room Rifredi 1.0 1181 12.48
In [79]:
room_types = room_neighb_price['Room Type'].unique()

# Create figure with subplots in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False, sharey=True)
axes = axes.flatten()  # Flatten to easily iterate

# Define a maximum y-value for all plots to share the same scale
max_price = room_neighb_price["Price per Night (avg)"].max() * 1.1  # Add 10% margin

# Set a standard width for the bars
bar_width = 0.7

# Set color palette for the bars
colors = sns.color_palette('RdPu', n_colors=len(room_types))

# Iterate through each room type to create individual plots
for i, room_type in enumerate(room_types):
    ax = axes[i]
    room_data = room_neighb_price[room_neighb_price["Room Type"] == room_type]
    
    # Sort by price in descending order
    room_data = room_data.sort_values("Price per Night (avg)", ascending=False)
    
    # Get neighborhoods and prices
    neighborhoods = room_data["Nieghbourhood"].tolist()
    prices = room_data["Price per Night (avg)"].tolist()
    
    # Create positions for bars
    positions = np.arange(len(neighborhoods))
    
    rcParams['font.family'] = 'serif'
    rcParams['font.size'] = 18
    
    # Create bars
    bars = ax.bar(positions, prices, bar_width, color=colors, edgecolor='black', linewidth=0.5)
    
    # Add price labels on top of bars
    for bar, price in zip(bars, prices):
        ax.text(bar.get_x() + bar.get_width()/2., 
                bar.get_height() + max_price * 0.02,  # Small offset from top
                f'${price:.2f}', 
                ha='center', va='bottom', 
                fontsize=14, rotation=0)
    
    # Set title and labels
    ax.set_title(f'{room_type}', fontsize=20, fontweight='bold')
    ax.set_ylabel('Price per Night ($)' if i % 2 == 0 else '')  # Y-label only for left plots
    ax.set_xlabel('Neighborhood' if i >= 2 else '', labelpad=10)  # X-label only for bottom plots
    
    # Adjust x ticks labels
    xlabels = []
    for label in neighborhoods:
        xlabels.append(re.sub(r"\s",r"\n", label))
    
    # Set tick positions and labels
    ax.set_xticks(positions)
    ax.set_xticklabels(xlabels, ha='center')
    
    # Set y-axis limit to be the same for all plots
    ax.set_ylim(0, max_price)
    
    # Add grid lines for better readability
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    
# Add a main title for the entire figure
plt.suptitle('Price Distribution by Neighborhood for Different Room Types', fontsize=26, y=1)

# Adjust layout to prevent overlapping
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.savefig('price_by_room_neighb.png')

Amenities Impact on Price

In [87]:
#  Get a list of all unique amenities across all listings
all_amenities = []
for amenities_list in listings['amenities']:
    all_amenities.extend(ast.literal_eval(amenities_list))

unique_amenities = sorted(list(set(all_amenities)))
print(f"Total number of unique amenities: {len(unique_amenities)}")

print("\nFinding the most frequent amenities.")
# Create a dictionary to count occurrences
amenity_counts = {}
for amenities_list in listings['amenities']:
    for amenity in ast.literal_eval(amenities_list):
        if amenity in amenity_counts:
            amenity_counts[amenity] += 1
        else:
            amenity_counts[amenity] = 1

# Convert to dataframe
amenity_df = pd.DataFrame({
    'Amenity': list(amenity_counts.keys()),
    'Count': list(amenity_counts.values())
})

# Sort by frequency
amenity_df = amenity_df.sort_values('Count', ascending=False).reset_index(drop=True)

# Display the top 20 most common amenities
print("\nTop 20 most common amenities:")
amenity_df.head(20)
Total number of unique amenities: 2924

Finding the most frequent amenities.

Top 20 most common amenities:
Out[87]:
Amenity Count
0 Hair dryer 11464
1 Wifi 11194
2 Kitchen 10916
3 Hot water 10911
4 Hangers 10316
5 Essentials 10268
6 Bed linens 10007
7 Refrigerator 9818
8 Dishes and silverware 9726
9 Iron 9542
10 Cooking basics 8772
11 Bidet 7901
12 Heating 7898
13 TV 7665
14 Shampoo 7631
15 Air conditioning 7327
16 Dining table 7013
17 Microwave 6897
18 Carbon monoxide alarm 6631
19 Freezer 6518
In [94]:
# Retrieve a DataFrame with average price per night by listings
q6 = """
    SELECT 
        c.listing_id, 
        ROUND(AVG(c.price), 2) AS avg_price,
        l.amenities
    FROM calendar c
        JOIN listings l ON c.listing_id = l.id
    WHERE c.available = True
    GROUP BY 1
    ORDER BY 2 DESC
    """

listings_df = pd.read_sql_query(q6, conn)
listings_df['amenities'] = listings_df['amenities'].apply(ast.literal_eval)

top_amenities = amenity_df.head(20)['Amenity'].tolist()

for amenity in top_amenities:
        # Check if each listing has this amenity
        listings_df[f'has_{amenity.lower().replace(" ", "_")}'] = listings_df['amenities'].apply(
            lambda x: 1 if amenity in x else 0)
    
# Calculate average price for listings with and without each amenity
price_impact = {}
for amenity in top_amenities:
    col_name = f'has_{amenity.lower().replace(" ", "_")}'
    with_amenity = listings_df[listings_df[col_name] == 1]['avg_price'].mean()
    without_amenity = listings_df[listings_df[col_name] == 0]['avg_price'].mean()
    price_diff = with_amenity - without_amenity
    price_impact[amenity] = {
            'with': with_amenity,
            'without': without_amenity,
            'difference': price_diff,
            'percentage': ((price_diff) / without_amenity) * 100 if without_amenity > 0 else float('nan')
        }
    
# Convert to dataframe
impact_df = pd.DataFrame(price_impact).T.reset_index()
impact_df.columns = ['Amenity', 'With Amenity', 'Without Amenity', 'Price Difference', 'Percentage Change']
impact_df = impact_df.sort_values('Percentage Change', ascending=False, ignore_index=True)
    
# Plot amenities with biggest price impact (top 20)
plt.figure(figsize=(15, 12))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 22

print(f"Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%")

sns.barplot(x='Percentage Change', y='Amenity', data=impact_df.head(20),
            palette='RdPu', edgecolor='black', linewidth=0.5)

plt.title('Top 20 Amenities by Price Impact', fontsize=32)
plt.grid(axis='both', alpha=0.3, linestyle='--')
plt.tight_layout()
#plt.show()

plt.savefig('amenities_impact.png')
Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%
In [ ]: