import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
import seaborn as sns
import re
import numpy as np
import ast
conn = sqlite3.connect("florence_airbnb.db")
# Load cleaned CSV files
listings = pd.read_csv("listings_cleaned.csv", parse_dates=True)
calendar = pd.read_csv("calendar_cleaned.csv", parse_dates=True)
reviews = pd.read_csv("reviews_cleaned.csv", parse_dates=True)
listings.to_sql("listings", conn, if_exists="replace", index=False)
calendar.to_sql("calendar", conn, if_exists="replace", index=False)
reviews.to_sql("reviews", conn, if_exists="replace", index=False)
904945
# PRAGMA queries for basic information about the data
pd.set_option('display.max_rows', 100)
tables = ['listings', 'calendar', 'reviews']
for table in tables:
print(f"\nTable: {table}")
df = pd.read_sql_query(f"PRAGMA table_info({table})", conn)
display(df)
Table: listings
cid | name | type | notnull | dflt_value | pk | |
---|---|---|---|---|---|---|
0 | 0 | id | INTEGER | 0 | None | 0 |
1 | 1 | last_scraped | TEXT | 0 | None | 0 |
2 | 2 | source | TEXT | 0 | None | 0 |
3 | 3 | name | TEXT | 0 | None | 0 |
4 | 4 | description | TEXT | 0 | None | 0 |
5 | 5 | neighborhood_overview | TEXT | 0 | None | 0 |
6 | 6 | host_id | INTEGER | 0 | None | 0 |
7 | 7 | host_name | TEXT | 0 | None | 0 |
8 | 8 | host_since | TEXT | 0 | None | 0 |
9 | 9 | host_location | TEXT | 0 | None | 0 |
10 | 10 | host_about | TEXT | 0 | None | 0 |
11 | 11 | host_response_time | TEXT | 0 | None | 0 |
12 | 12 | host_response_rate | TEXT | 0 | None | 0 |
13 | 13 | host_acceptance_rate | TEXT | 0 | None | 0 |
14 | 14 | host_is_superhost | TEXT | 0 | None | 0 |
15 | 15 | host_neighbourhood | TEXT | 0 | None | 0 |
16 | 16 | host_listings_count | INTEGER | 0 | None | 0 |
17 | 17 | host_total_listings_count | INTEGER | 0 | None | 0 |
18 | 18 | host_verifications | TEXT | 0 | None | 0 |
19 | 19 | host_has_profile_pic | TEXT | 0 | None | 0 |
20 | 20 | host_identity_verified | TEXT | 0 | None | 0 |
21 | 21 | neighbourhood | TEXT | 0 | None | 0 |
22 | 22 | neighbourhood_cleansed | TEXT | 0 | None | 0 |
23 | 23 | latitude | REAL | 0 | None | 0 |
24 | 24 | longitude | REAL | 0 | None | 0 |
25 | 25 | property_type | TEXT | 0 | None | 0 |
26 | 26 | room_type | TEXT | 0 | None | 0 |
27 | 27 | accommodates | INTEGER | 0 | None | 0 |
28 | 28 | bathrooms | REAL | 0 | None | 0 |
29 | 29 | bathrooms_text | TEXT | 0 | None | 0 |
30 | 30 | bedrooms | REAL | 0 | None | 0 |
31 | 31 | beds | REAL | 0 | None | 0 |
32 | 32 | amenities | TEXT | 0 | None | 0 |
33 | 33 | price | REAL | 0 | None | 0 |
34 | 34 | minimum_nights | INTEGER | 0 | None | 0 |
35 | 35 | maximum_nights | INTEGER | 0 | None | 0 |
36 | 36 | minimum_minimum_nights | INTEGER | 0 | None | 0 |
37 | 37 | maximum_minimum_nights | INTEGER | 0 | None | 0 |
38 | 38 | minimum_maximum_nights | INTEGER | 0 | None | 0 |
39 | 39 | maximum_maximum_nights | INTEGER | 0 | None | 0 |
40 | 40 | minimum_nights_avg_ntm | REAL | 0 | None | 0 |
41 | 41 | maximum_nights_avg_ntm | REAL | 0 | None | 0 |
42 | 42 | availability_30 | INTEGER | 0 | None | 0 |
43 | 43 | availability_60 | INTEGER | 0 | None | 0 |
44 | 44 | availability_90 | INTEGER | 0 | None | 0 |
45 | 45 | availability_365 | INTEGER | 0 | None | 0 |
46 | 46 | calendar_last_scraped | TEXT | 0 | None | 0 |
47 | 47 | number_of_reviews | INTEGER | 0 | None | 0 |
48 | 48 | number_of_reviews_ltm | INTEGER | 0 | None | 0 |
49 | 49 | number_of_reviews_l30d | INTEGER | 0 | None | 0 |
50 | 50 | availability_eoy | INTEGER | 0 | None | 0 |
51 | 51 | number_of_reviews_ly | INTEGER | 0 | None | 0 |
52 | 52 | estimated_occupancy_l365d | INTEGER | 0 | None | 0 |
53 | 53 | estimated_revenue_l365d | REAL | 0 | None | 0 |
54 | 54 | first_review | TEXT | 0 | None | 0 |
55 | 55 | last_review | TEXT | 0 | None | 0 |
56 | 56 | review_scores_rating | REAL | 0 | None | 0 |
57 | 57 | review_scores_accuracy | REAL | 0 | None | 0 |
58 | 58 | review_scores_cleanliness | REAL | 0 | None | 0 |
59 | 59 | review_scores_checkin | REAL | 0 | None | 0 |
60 | 60 | review_scores_communication | REAL | 0 | None | 0 |
61 | 61 | review_scores_location | REAL | 0 | None | 0 |
62 | 62 | review_scores_value | REAL | 0 | None | 0 |
63 | 63 | license | TEXT | 0 | None | 0 |
64 | 64 | instant_bookable | TEXT | 0 | None | 0 |
65 | 65 | calculated_host_listings_count | INTEGER | 0 | None | 0 |
66 | 66 | calculated_host_listings_count_entire_homes | INTEGER | 0 | None | 0 |
67 | 67 | calculated_host_listings_count_private_rooms | INTEGER | 0 | None | 0 |
68 | 68 | calculated_host_listings_count_shared_rooms | INTEGER | 0 | None | 0 |
69 | 69 | reviews_per_month | REAL | 0 | None | 0 |
Table: calendar
cid | name | type | notnull | dflt_value | pk | |
---|---|---|---|---|---|---|
0 | 0 | listing_id | INTEGER | 0 | None | 0 |
1 | 1 | date | TEXT | 0 | None | 0 |
2 | 2 | available | INTEGER | 0 | None | 0 |
3 | 3 | price | REAL | 0 | None | 0 |
4 | 4 | minimum_nights | REAL | 0 | None | 0 |
5 | 5 | maximum_nights | REAL | 0 | None | 0 |
Table: reviews
cid | name | type | notnull | dflt_value | pk | |
---|---|---|---|---|---|---|
0 | 0 | listing_id | INTEGER | 0 | None | 0 |
1 | 1 | id | INTEGER | 0 | None | 0 |
2 | 2 | date | TEXT | 0 | None | 0 |
3 | 3 | reviewer_id | INTEGER | 0 | None | 0 |
4 | 4 | reviewer_name | TEXT | 0 | None | 0 |
5 | 5 | comments | TEXT | 0 | None | 0 |
q1 = """WITH top_listings AS (
SELECT
listing_id,
ROUND(AVG(price), 0) AS avg_price
FROM calendar
WHERE available = True
GROUP BY listing_id
ORDER BY avg_price DESC
)
SELECT
COUNT(listing_id) AS num_of_listings,
CASE WHEN avg_price >= 10000 THEN '>=10k'
WHEN avg_price >= 5000 THEN '5k-10k'
WHEN avg_price >= 1000 THEN '1k-5k'
WHEN avg_price >= 500 THEN '500-1k'
WHEN avg_price >= 100 THEN '100-500'
ELSE '<100' END AS price_bucket
FROM top_listings
GROUP BY 2
ORDER BY
CASE price_bucket
WHEN '>=10k' THEN 1
WHEN '5k-10k' THEN 2
WHEN '1k-5k' THEN 3
WHEN '500-1k' THEN 4
WHEN '100-500' THEN 5
ELSE 6
END
"""
listings_price = pd.read_sql_query(q1, conn)
# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
sns.barplot(x=listings_price['price_bucket'], y=listings_price['num_of_listings'],
palette='RdPu', edgecolor='black', linewidth=0.5)
plt.yscale('log', base=10)
plt.xlabel('Listing Price Range per Night ($)', labelpad=20)
plt.ylabel('Number of Listings')
plt.title('Distribution of Rental Price Ranges', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')
yticks = [50, 100, 500, 1000, 5000]
ylabels = [str(y) for y in yticks]
plt.yticks(yticks, labels=ylabels)
# Adding labels to the bars
for index, row in listings_price.iterrows():
plt.text(index,
row['num_of_listings']*(1+0.2),
str(round(row['num_of_listings'],2)),
va='center',
color='black',
ha='center'
)
# Adding space for the labels
plt.ylim(30, max(listings_price['num_of_listings']) * 1.7)
plt.tight_layout()
#plt.show()
plt.savefig('price_ranges.png')
q2 = """
SELECT
strftime('%Y-%m', date) AS month,
ROUND(AVG(price), 2) AS avg_price
FROM calendar
WHERE price IS NOT NULL AND available = True
GROUP BY month
ORDER BY month
"""
monthly_price = pd.read_sql_query(q2, conn)
# Formatting the date
monthly_price['formatted_month'] = pd.to_datetime(monthly_price['month']).dt.strftime('%m.%y')
# Plotting the results
plt.figure(figsize=(20,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
sns.lineplot(x=monthly_price['formatted_month'], y=monthly_price['avg_price'],
color='purple', marker='o', linewidth=3, markersize=10)
plt.xlabel('Date (month.year)', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Average Price of Rent by Month', fontsize=32)
plt.grid(True, alpha=0.3)
plt.tight_layout()
#plt.show()
plt.savefig('price_by_month.png')
q3 = """
SELECT
l.room_type AS room_type,
ROUND(AVG(c.price), 2) AS avg_price
FROM calendar c
JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""
room_price = pd.read_sql_query(q3, conn)
# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
sns.barplot(x=room_price['room_type'], y=room_price['avg_price'],
palette='RdPu', edgecolor='black', linewidth=0.5)
plt.xlabel('Room Type', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Room Type', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')
# Adding labels to the bars
for index, row in room_price.iterrows():
plt.text(index,
row['avg_price']+20,
str(round(row['avg_price'],2)),
va='center',
color='black',
ha='center'
)
# Adding space for the labels
plt.ylim(50, max(room_price['avg_price']) * 1.1)
plt.tight_layout()
#plt.show()
plt.savefig('price_by_roomtype.png')
q4 = """
SELECT
l.neighbourhood_cleansed AS nieghbourhood,
AVG(c.price) AS avg_price
FROM calendar c
JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""
neighb_price = pd.read_sql_query(q4, conn)
# Plotting the results
plt.figure(figsize=(15,8))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 26
sns.barplot(x=neighb_price['nieghbourhood'], y=neighb_price['avg_price'],
palette='RdPu', edgecolor='black', linewidth=0.5)
plt.xlabel('Nieghbourhood', labelpad=20)
plt.ylabel('Average Price per Night ($)')
plt.title('Distribution of Rental Prices by Neighbourhood', fontsize=32)
plt.grid(axis='y', alpha=0.3, linestyle='--')
# Adjust x ticks labels
xlabels = []
for label in neighb_price['nieghbourhood']:
xlabels.append(re.sub(r"\s",r"\n", label))
positions = range(len(xlabels))
plt.xticks(positions, xlabels)
# Adding labels to the bars
for index, row in neighb_price.iterrows():
plt.text(index,
row['avg_price']+10,
str(round(row['avg_price'],2)),
va='center',
color='black',
ha='center'
)
# Adding space for the labels
plt.ylim(300, max(neighb_price['avg_price']) * 1.05)
plt.tight_layout()
#plt.show()
plt.savefig('price_by_neighbourhood.png')
q5 = """
SELECT
l.room_type AS "Room Type",
l.neighbourhood_cleansed AS Nieghbourhood,
ROUND(AVG(l.accommodates),1) AS "Accommodates (avg)",
COUNT(l.id) AS "Num. of Listings",
ROUND(AVG(c.price),2) AS "Price per Night (avg)"
FROM listings l
JOIN calendar c ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1, 2
ORDER BY 1, 5 DESC
"""
room_neighb_price = pd.read_sql_query(q5, conn)
room_neighb_price
Room Type | Nieghbourhood | Accommodates (avg) | Num. of Listings | Price per Night (avg) | |
---|---|---|---|---|---|
0 | Entire home/apt | Isolotto Legnaia | 4.3 | 89879 | 643.13 |
1 | Entire home/apt | Campo di Marte | 4.1 | 192669 | 564.73 |
2 | Entire home/apt | Centro Storico | 4.2 | 1559858 | 556.27 |
3 | Entire home/apt | Gavinana Galluzzo | 4.2 | 71756 | 459.72 |
4 | Entire home/apt | Rifredi | 4.1 | 163307 | 385.66 |
5 | Hotel room | Centro Storico | 2.4 | 8688 | 372.94 |
6 | Hotel room | Gavinana Galluzzo | 2.6 | 1821 | 263.56 |
7 | Hotel room | Rifredi | 2.0 | 313 | 128.11 |
8 | Hotel room | Campo di Marte | 2.0 | 906 | 65.71 |
9 | Private room | Centro Storico | 2.3 | 218473 | 403.93 |
10 | Private room | Rifredi | 2.4 | 47879 | 275.99 |
11 | Private room | Isolotto Legnaia | 2.1 | 20926 | 215.32 |
12 | Private room | Campo di Marte | 2.1 | 53124 | 202.14 |
13 | Private room | Gavinana Galluzzo | 2.2 | 18049 | 133.22 |
14 | Shared room | Centro Storico | 1.0 | 626 | 999.00 |
15 | Shared room | Campo di Marte | 2.0 | 329 | 55.00 |
16 | Shared room | Rifredi | 1.0 | 1181 | 12.48 |
room_types = room_neighb_price['Room Type'].unique()
# Create figure with subplots in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(15, 10), sharex=False, sharey=True)
axes = axes.flatten() # Flatten to easily iterate
# Define a maximum y-value for all plots to share the same scale
max_price = room_neighb_price["Price per Night (avg)"].max() * 1.1 # Add 10% margin
# Set a standard width for the bars
bar_width = 0.7
# Set color palette for the bars
colors = sns.color_palette('RdPu', n_colors=len(room_types))
# Iterate through each room type to create individual plots
for i, room_type in enumerate(room_types):
ax = axes[i]
room_data = room_neighb_price[room_neighb_price["Room Type"] == room_type]
# Sort by price in descending order
room_data = room_data.sort_values("Price per Night (avg)", ascending=False)
# Get neighborhoods and prices
neighborhoods = room_data["Nieghbourhood"].tolist()
prices = room_data["Price per Night (avg)"].tolist()
# Create positions for bars
positions = np.arange(len(neighborhoods))
rcParams['font.family'] = 'serif'
rcParams['font.size'] = 18
# Create bars
bars = ax.bar(positions, prices, bar_width, color=colors, edgecolor='black', linewidth=0.5)
# Add price labels on top of bars
for bar, price in zip(bars, prices):
ax.text(bar.get_x() + bar.get_width()/2.,
bar.get_height() + max_price * 0.02, # Small offset from top
f'${price:.2f}',
ha='center', va='bottom',
fontsize=14, rotation=0)
# Set title and labels
ax.set_title(f'{room_type}', fontsize=20, fontweight='bold')
ax.set_ylabel('Price per Night ($)' if i % 2 == 0 else '') # Y-label only for left plots
ax.set_xlabel('Neighborhood' if i >= 2 else '', labelpad=10) # X-label only for bottom plots
# Adjust x ticks labels
xlabels = []
for label in neighborhoods:
xlabels.append(re.sub(r"\s",r"\n", label))
# Set tick positions and labels
ax.set_xticks(positions)
ax.set_xticklabels(xlabels, ha='center')
# Set y-axis limit to be the same for all plots
ax.set_ylim(0, max_price)
# Add grid lines for better readability
ax.grid(axis='y', linestyle='--', alpha=0.7)
# Add a main title for the entire figure
plt.suptitle('Price Distribution by Neighborhood for Different Room Types', fontsize=26, y=1)
# Adjust layout to prevent overlapping
plt.tight_layout()
plt.subplots_adjust(top=0.9)
plt.savefig('price_by_room_neighb.png')
# Get a list of all unique amenities across all listings
all_amenities = []
for amenities_list in listings['amenities']:
all_amenities.extend(ast.literal_eval(amenities_list))
unique_amenities = sorted(list(set(all_amenities)))
print(f"Total number of unique amenities: {len(unique_amenities)}")
print("\nFinding the most frequent amenities.")
# Create a dictionary to count occurrences
amenity_counts = {}
for amenities_list in listings['amenities']:
for amenity in ast.literal_eval(amenities_list):
if amenity in amenity_counts:
amenity_counts[amenity] += 1
else:
amenity_counts[amenity] = 1
# Convert to dataframe
amenity_df = pd.DataFrame({
'Amenity': list(amenity_counts.keys()),
'Count': list(amenity_counts.values())
})
# Sort by frequency
amenity_df = amenity_df.sort_values('Count', ascending=False).reset_index(drop=True)
# Display the top 20 most common amenities
print("\nTop 20 most common amenities:")
amenity_df.head(20)
Total number of unique amenities: 2924 Finding the most frequent amenities. Top 20 most common amenities:
Amenity | Count | |
---|---|---|
0 | Hair dryer | 11464 |
1 | Wifi | 11194 |
2 | Kitchen | 10916 |
3 | Hot water | 10911 |
4 | Hangers | 10316 |
5 | Essentials | 10268 |
6 | Bed linens | 10007 |
7 | Refrigerator | 9818 |
8 | Dishes and silverware | 9726 |
9 | Iron | 9542 |
10 | Cooking basics | 8772 |
11 | Bidet | 7901 |
12 | Heating | 7898 |
13 | TV | 7665 |
14 | Shampoo | 7631 |
15 | Air conditioning | 7327 |
16 | Dining table | 7013 |
17 | Microwave | 6897 |
18 | Carbon monoxide alarm | 6631 |
19 | Freezer | 6518 |
# Retrieve a DataFrame with average price per night by listings
q6 = """
SELECT
c.listing_id,
ROUND(AVG(c.price), 2) AS avg_price,
l.amenities
FROM calendar c
JOIN listings l ON c.listing_id = l.id
WHERE c.available = True
GROUP BY 1
ORDER BY 2 DESC
"""
listings_df = pd.read_sql_query(q6, conn)
listings_df['amenities'] = listings_df['amenities'].apply(ast.literal_eval)
top_amenities = amenity_df.head(20)['Amenity'].tolist()
for amenity in top_amenities:
# Check if each listing has this amenity
listings_df[f'has_{amenity.lower().replace(" ", "_")}'] = listings_df['amenities'].apply(
lambda x: 1 if amenity in x else 0)
# Calculate average price for listings with and without each amenity
price_impact = {}
for amenity in top_amenities:
col_name = f'has_{amenity.lower().replace(" ", "_")}'
with_amenity = listings_df[listings_df[col_name] == 1]['avg_price'].mean()
without_amenity = listings_df[listings_df[col_name] == 0]['avg_price'].mean()
price_diff = with_amenity - without_amenity
price_impact[amenity] = {
'with': with_amenity,
'without': without_amenity,
'difference': price_diff,
'percentage': ((price_diff) / without_amenity) * 100 if without_amenity > 0 else float('nan')
}
# Convert to dataframe
impact_df = pd.DataFrame(price_impact).T.reset_index()
impact_df.columns = ['Amenity', 'With Amenity', 'Without Amenity', 'Price Difference', 'Percentage Change']
impact_df = impact_df.sort_values('Percentage Change', ascending=False, ignore_index=True)
# Plot amenities with biggest price impact (top 20)
plt.figure(figsize=(15, 12))
plt.rcParams['font.family'] = 'serif'
plt.rcParams['font.size'] = 22
print(f"Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%")
sns.barplot(x='Percentage Change', y='Amenity', data=impact_df.head(20),
palette='RdPu', edgecolor='black', linewidth=0.5)
plt.title('Top 20 Amenities by Price Impact', fontsize=32)
plt.grid(axis='both', alpha=0.3, linestyle='--')
plt.tight_layout()
#plt.show()
plt.savefig('amenities_impact.png')
Percentage Change = (price with amenity - price without amenity) / (price without amenity) * 100%