import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import re
import numpy as np
import ast


conn = sqlite3.connect("florence_airbnb.db")

# Load cleaned CSV files
listings = pd.read_csv("listings_cleaned.csv", parse_dates=True)
calendar = pd.read_csv("calendar_cleaned.csv", parse_dates=True)
reviews = pd.read_csv("reviews_cleaned.csv", parse_dates=True)

listings.to_sql("listings", conn, if_exists="replace", index=False)
calendar.to_sql("calendar", conn, if_exists="replace", index=False)
reviews.to_sql("reviews", conn, if_exists="replace", index=False)

904945


# PRAGMA queries for basic information about the data
pd.set_option('display.max_rows', 100)
tables = ['listings', 'calendar', 'reviews']
for table in tables:
    print(f"\nTable: {table}")
    df = pd.read_sql_query(f"PRAGMA table_info({table})", conn)
    display(df)

Table: listings

Table: calendar

Table: reviews


q1 = """WITH top_listings AS (
    SELECT 
        listing_id, 
        ROUND(AVG(price), 0) AS avg_price 
    FROM calendar 
    WHERE available = True 
    GROUP BY listing_id 
    ORDER BY avg_price DESC 
) 
 
SELECT 
    COUNT(listing_id) AS num_of_listings, 
    CASE WHEN avg_price >= 10000 THEN '>=10k'
        WHEN avg_price >= 5000 THEN '5k-10k'
        WHEN avg_price >= 1000 THEN '1k-5k'
        WHEN avg_price >= 500 THEN '500-1k'
        WHEN avg_price >= 100 THEN '100-500'
        ELSE '<100' END AS price_bucket
FROM top_listings
GROUP BY 2 
ORDER BY 
    CASE price_bucket
        WHEN '>=10k' THEN 1
        WHEN '5k-10k' THEN 2
        WHEN '1k-5k' THEN 3
        WHEN '500-1k' THEN 4
        WHEN '100-500' THEN 5
        ELSE 6
    END
"""

listings_price = pd.read_sql_query(q1, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=listings_price['price_bucket'], y=listings_price['num_of_listings'], 
            palette='RdPu', edgecolor='black', linewidth=0.5)

plt.yscale('log', base=10)
plt.xlabel('Listing Price Range per Night ($)', labelpad=20)
plt.ylabel('Number of Listings')
plt.title('Distribution of Rental Price Ranges', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

yticks = [50, 100, 500, 1000, 5000]
ylabels = [str(y) for y in yticks]
plt.yticks(yticks, labels=ylabels)

# Adding labels to the bars
for index, row in listings_price.iterrows():
        plt.text(index,
            row['num_of_listings']*(1+0.2),
            str(round(row['num_of_listings'],2)),
            va='center',
            color='black',
            ha='center'
            )
# Adding space for the labels
plt.ylim(30, max(listings_price['num_of_listings']) * 1.7)
        
plt.tight_layout()
#plt.show()

plt.savefig('price_ranges.png')


q2 = """
SELECT 
    strftime('%Y-%m', date) AS month, 
    ROUND(AVG(price), 2) AS avg_price 
FROM calendar 
WHERE price IS NOT NULL AND available = True 
GROUP BY month 
ORDER BY month
"""

monthly_price = pd.read_sql_query(q2, conn)

# Formatting the date
monthly_price['formatted_month'] = pd.to_datetime(monthly_price['month']).dt.strftime('%m.%y')

# Plotting the results
plt.figure(figsize=(20,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26

sns.lineplot(x=monthly_price['formatted_month'], y=monthly_price['avg_price'],
             color='purple', marker='o', linewidth=3, markersize=10)

plt.xlabel('Date (month.year)', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Average Price of Rent by Month', fontsize=32)
plt.grid(True, alpha=0.3)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_month.png')


q3 = """
SELECT 
    l.room_type AS room_type,
    ROUND(AVG(c.price), 2) AS avg_price
FROM calendar c
JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""

room_price = pd.read_sql_query(q3, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=room_price['room_type'], y=room_price['avg_price'],
           palette='RdPu', edgecolor='black', linewidth=0.5)

plt.xlabel('Room Type', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Room Type', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Adding labels to the bars
for index, row in room_price.iterrows():
        plt.text(index,
            row['avg_price']+20,
            str(round(row['avg_price'],2)),
            va='center',
            color='black',
            ha='center'
            )
        
# Adding space for the labels
plt.ylim(50, max(room_price['avg_price']) * 1.1)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_roomtype.png')


q4 = """
SELECT
    l.neighbourhood_cleansed AS nieghbourhood,
    AVG(c.price) AS avg_price
FROM calendar c
    JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""

neighb_price = pd.read_sql_query(q4, conn)

# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
    
sns.barplot(x=neighb_price['nieghbourhood'], y=neighb_price['avg_price'],
           palette='RdPu', edgecolor='black', linewidth=0.5)

plt.xlabel('Nieghbourhood', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Neighbourhood', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Adjust x ticks labels
xlabels = []
for label in neighb_price['nieghbourhood']:
    xlabels.append(re.sub(r"\s",r"\n", label))
positions = range(len(xlabels))
plt.xticks(positions, xlabels)

# Adding labels to the bars
for index, row in neighb_price.iterrows():
        plt.text(index,
            row['avg_price']+10,
            str(round(row['avg_price'],2)),
            va='center',
            color='black',
            ha='center'
            )
# Adding space for the labels
plt.ylim(300, max(neighb_price['avg_price']) * 1.05)

plt.tight_layout()
#plt.show()

plt.savefig('price_by_neighbourhood.png')


q5 = """
SELECT 
    l.room_type AS "Room Type",
    l.neighbourhood_cleansed AS Nieghbourhood,
    ROUND(AVG(l.accommodates),1) AS "Accommodates (avg)",
    COUNT(l.id) AS "Num. of Listings",
    ROUND(AVG(c.price),2) AS "Price per Night (avg)"
FROM listings l
    JOIN calendar c ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1, 2
ORDER BY 1, 5 DESC
"""

room_neighb_price = pd.read_sql_query(q5, conn)
room_neighb_price


room_types = room_neighb_price['Room Type'].unique()

# Create figure with subplots in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False, sharey=True)
axes = axes.flatten()  # Flatten to easily iterate

# Define a maximum y-value for all plots to share the same scale
max_price = room_neighb_price["Price per Night (avg)"].max() * 1.1  # Add 10% margin

# Set a standard width for the bars
bar_width = 0.7

# Set color palette for the bars
colors = sns.color_palette('RdPu', n_colors=len(room_types))

# Iterate through each room type to create individual plots
for i, room_type in enumerate(room_types):
    ax = axes[i]
    room_data = room_neighb_price[room_neighb_price["Room Type"] == room_type]
    
    # Sort by price in descending order
    room_data = room_data.sort_values("Price per Night (avg)", ascending=False)
    
    # Get neighborhoods and prices
    neighborhoods = room_data["Nieghbourhood"].tolist()
    prices = room_data["Price per Night (avg)"].tolist()
    
    # Create positions for bars
    positions = np.arange(len(neighborhoods))
    
    rcParams['font.family'] = 'serif'
    rcParams['font.size'] = 18
    
    # Create bars
    bars = ax.bar(positions, prices, bar_width, color=colors, edgecolor='black', linewidth=0.5)
    
    # Add price labels on top of bars
    for bar, price in zip(bars, prices):
        ax.text(bar.get_x() + bar.get_width()/2., 
                bar.get_height() + max_price * 0.02,  # Small offset from top
                f'${price:.2f}', 
                ha='center', va='bottom', 
                fontsize=14, rotation=0)
    
    # Set title and labels
    ax.set_title(f'{room_type}', fontsize=20, fontweight='bold')
    ax.set_ylabel('Price per Night ($)' if i % 2 == 0 else '')  # Y-label only for left plots
    ax.set_xlabel('Neighborhood' if i >= 2 else '', labelpad=10)  # X-label only for bottom plots
    
    # Adjust x ticks labels
    xlabels = []
    for label in neighborhoods:
        xlabels.append(re.sub(r"\s",r"\n", label))
    
    # Set tick positions and labels
    ax.set_xticks(positions)
    ax.set_xticklabels(xlabels, ha='center')
    
    # Set y-axis limit to be the same for all plots
    ax.set_ylim(0, max_price)
    
    # Add grid lines for better readability
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    
# Add a main title for the entire figure
plt.suptitle('Price Distribution by Neighborhood for Different Room Types', fontsize=26, y=1)

# Adjust layout to prevent overlapping
plt.tight_layout()
plt.subplots_adjust(top=0.9)

plt.savefig('price_by_room_neighb.png')


#  Get a list of all unique amenities across all listings
all_amenities = []
for amenities_list in listings['amenities']:
    all_amenities.extend(ast.literal_eval(amenities_list))

unique_amenities = sorted(list(set(all_amenities)))
print(f"Total number of unique amenities: {len(unique_amenities)}")

print("\nFinding the most frequent amenities.")
# Create a dictionary to count occurrences
amenity_counts = {}
for amenities_list in listings['amenities']:
    for amenity in ast.literal_eval(amenities_list):
        if amenity in amenity_counts:
            amenity_counts[amenity] += 1
        else:
            amenity_counts[amenity] = 1

# Convert to dataframe
amenity_df = pd.DataFrame({
    'Amenity': list(amenity_counts.keys()),
    'Count': list(amenity_counts.values())
})

# Sort by frequency
amenity_df = amenity_df.sort_values('Count', ascending=False).reset_index(drop=True)

# Display the top 20 most common amenities
print("\nTop 20 most common amenities:")
amenity_df.head(20)

Total number of unique amenities: 2924

Finding the most frequent amenities.

Top 20 most common amenities:


# Retrieve a DataFrame with average price per night by listings
q6 = """
    SELECT 
        c.listing_id, 
        ROUND(AVG(c.price), 2) AS avg_price,
        l.amenities
    FROM calendar c
        JOIN listings l ON c.listing_id = l.id
    WHERE c.available = True
    GROUP BY 1
    ORDER BY 2 DESC
    """

listings_df = pd.read_sql_query(q6, conn)
listings_df['amenities'] = listings_df['amenities'].apply(ast.literal_eval)

top_amenities = amenity_df.head(20)['Amenity'].tolist()

for amenity in top_amenities:
        # Check if each listing has this amenity
        listings_df[f'has_{amenity.lower().replace(" ", "_")}'] = listings_df['amenities'].apply(
            lambda x: 1 if amenity in x else 0)
    
# Calculate average price for listings with and without each amenity
price_impact = {}
for amenity in top_amenities:
    col_name = f'has_{amenity.lower().replace(" ", "_")}'
    with_amenity = listings_df[listings_df[col_name] == 1]['avg_price'].mean()
    without_amenity = listings_df[listings_df[col_name] == 0]['avg_price'].mean()
    price_diff = with_amenity - without_amenity
    price_impact[amenity] = {
            'with': with_amenity,
            'without': without_amenity,
            'difference': price_diff,
            'percentage': ((price_diff) / without_amenity) * 100 if without_amenity > 0 else float('nan')
        }
    
# Convert to dataframe
impact_df = pd.DataFrame(price_impact).T.reset_index()
impact_df.columns = ['Amenity', 'With Amenity', 'Without Amenity', 'Price Difference', 'Percentage Change']
impact_df = impact_df.sort_values('Percentage Change', ascending=False, ignore_index=True)
    
# Plot amenities with biggest price impact (top 20)
plt.figure(figsize=(15, 12))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 22

print(f"Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%")

sns.barplot(x='Percentage Change', y='Amenity', data=impact_df.head(20),
            palette='RdPu', edgecolor='black', linewidth=0.5)

plt.title('Top 20 Amenities by Price Impact', fontsize=32)
plt.grid(axis='both', alpha=0.3, linestyle='--')
plt.tight_layout()
#plt.show()

plt.savefig('amenities_impact.png')

Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%

	cid	name	type	dflt_value
0	0	id	INTEGER	None
1	1	last_scraped	TEXT	None
2	2	source	TEXT	None
3	3	name	TEXT	None
4	4	description	TEXT	None
5	5	neighborhood_overview	TEXT	None
6	6	host_id	INTEGER	None
7	7	host_name	TEXT	None
8	8	host_since	TEXT	None
9	9	host_location	TEXT	None
10	10	host_about	TEXT	None
11	11	host_response_time	TEXT	None
12	12	host_response_rate	TEXT	None
13	13	host_acceptance_rate	TEXT	None
14	14	host_is_superhost	TEXT	None
15	15	host_neighbourhood	TEXT	None
16	16	host_listings_count	INTEGER	None
17	17	host_total_listings_count	INTEGER	None
18	18	host_verifications	TEXT	None
19	19	host_has_profile_pic	TEXT	None
20	20	host_identity_verified	TEXT	None
21	21	neighbourhood	TEXT	None
22	22	neighbourhood_cleansed	TEXT	None
23	23	latitude	REAL	None
24	24	longitude	REAL	None
25	25	property_type	TEXT	None
26	26	room_type	TEXT	None
27	27	accommodates	INTEGER	None
28	28	bathrooms	REAL	None
29	29	bathrooms_text	TEXT	None
30	30	bedrooms	REAL	None
31	31	beds	REAL	None
32	32	amenities	TEXT	None
33	33	price	REAL	None
34	34	minimum_nights	INTEGER	None
35	35	maximum_nights	INTEGER	None
36	36	minimum_minimum_nights	INTEGER	None
37	37	maximum_minimum_nights	INTEGER	None
38	38	minimum_maximum_nights	INTEGER	None
39	39	maximum_maximum_nights	INTEGER	None
40	40	minimum_nights_avg_ntm	REAL	None
41	41	maximum_nights_avg_ntm	REAL	None
42	42	availability_30	INTEGER	None
43	43	availability_60	INTEGER	None
44	44	availability_90	INTEGER	None
45	45	availability_365	INTEGER	None
46	46	calendar_last_scraped	TEXT	None
47	47	number_of_reviews	INTEGER	None
48	48	number_of_reviews_ltm	INTEGER	None
49	49	number_of_reviews_l30d	INTEGER	None
50	50	availability_eoy	INTEGER	None
51	51	number_of_reviews_ly	INTEGER	None
52	52	estimated_occupancy_l365d	INTEGER	None
53	53	estimated_revenue_l365d	REAL	None
54	54	first_review	TEXT	None
55	55	last_review	TEXT	None
56	56	review_scores_rating	REAL	None
57	57	review_scores_accuracy	REAL	None
58	58	review_scores_cleanliness	REAL	None
59	59	review_scores_checkin	REAL	None
60	60	review_scores_communication	REAL	None
61	61	review_scores_location	REAL	None
62	62	review_scores_value	REAL	None
63	63	license	TEXT	None
64	64	instant_bookable	TEXT	None
65	65	calculated_host_listings_count	INTEGER	None
66	66	calculated_host_listings_count_entire_homes	INTEGER	None
67	67	calculated_host_listings_count_private_rooms	INTEGER	None
68	68	calculated_host_listings_count_shared_rooms	INTEGER	None
69	69	reviews_per_month	REAL	None

	cid	name	type	dflt_value
0	0	listing_id	INTEGER	None
1	1	date	TEXT	None
2	2	available	INTEGER	None
3	3	price	REAL	None
4	4	minimum_nights	REAL	None
5	5	maximum_nights	REAL	None

	cid	name	type	dflt_value
0	0	listing_id	INTEGER	None
1	1	id	INTEGER	None
2	2	date	TEXT	None
3	3	reviewer_id	INTEGER	None
4	4	reviewer_name	TEXT	None
5	5	comments	TEXT	None

	Room Type	Nieghbourhood	Accommodates (avg)	Num. of Listings	Price per Night (avg)
0	Entire home/apt	Isolotto Legnaia	4.3	89879	643.13
1	Entire home/apt	Campo di Marte	4.1	192669	564.73
2	Entire home/apt	Centro Storico	4.2	1559858	556.27
3	Entire home/apt	Gavinana Galluzzo	4.2	71756	459.72
4	Entire home/apt	Rifredi	4.1	163307	385.66
5	Hotel room	Centro Storico	2.4	8688	372.94
6	Hotel room	Gavinana Galluzzo	2.6	1821	263.56
7	Hotel room	Rifredi	2.0	313	128.11
8	Hotel room	Campo di Marte	2.0	906	65.71
9	Private room	Centro Storico	2.3	218473	403.93
10	Private room	Rifredi	2.4	47879	275.99
11	Private room	Isolotto Legnaia	2.1	20926	215.32
12	Private room	Campo di Marte	2.1	53124	202.14
13	Private room	Gavinana Galluzzo	2.2	18049	133.22
14	Shared room	Centro Storico	1.0	626	999.00
15	Shared room	Campo di Marte	2.0	329	55.00
16	Shared room	Rifredi	1.0	1181	12.48

	Amenity	Count
0	Hair dryer	11464
1	Wifi	11194
2	Kitchen	10916
3	Hot water	10911
4	Hangers	10316
5	Essentials	10268
6	Bed linens	10007
7	Refrigerator	9818
8	Dishes and silverware	9726
9	Iron	9542
10	Cooking basics	8772
11	Bidet	7901
12	Heating	7898
13	TV	7665
14	Shampoo	7631
15	Air conditioning	7327
16	Dining table	7013
17	Microwave	6897
18	Carbon monoxide alarm	6631
19	Freezer	6518

Price-Focused Analysis of Available Listings

Distribution of Rental Price Ranges

Average Price of Rent per Month

Distribution of Rental Prices by Room Type

Distribution of Rental Prices by Neighbourhood

Distribution of Rental Prices by Room Type and Neighbourhood

Amenities Impact on Price