-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathClean_data.sql
115 lines (98 loc) · 3.41 KB
/
Clean_data.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
##Step 1: #Data Cleaning
##Remove #Null Values: Start by ensuring there are no null values in critical fields, such as listing_id, date, and price.
#Listing Table (Cleaned)
CREATE OR REPLACE TABLE `your_project.your_dataset.listing_cleaned` AS
SELECT
id AS listing_id,
room_type,
host_response_time,
review_scores_value
FROM
`your_project.your_dataset.listing`
WHERE
id IS NOT NULL
AND room_type IS NOT NULL
AND host_response_time IS NOT NULL
AND review_scores_value IS NOT NULL;
-------------------------------
# Calendar Table (Cleaned)
CREATE OR REPLACE TABLE `your_project.your_dataset.calendar_cleaned` AS
SELECT
listing_id,
date,
available,
CAST(price AS FLOAT64) AS price
FROM
`your_project.your_dataset.calendar`
WHERE
listing_id IS NOT NULL
AND date IS NOT NULL
AND price IS NOT NULL;
-------------------------------
#Convert Data Types: Ensure that fields like price are in a numeric format (e.g., FLOAT) for calculations, and that date is in DATE format.
#Already included in the cleaning queries above with CAST(price AS FLOAT64) for price and using the date field as DATE.
---------------------------------
#Step 2: Creating Calculated Fields
#Occupied Room Indicator: Create a calculated field to indicate whether a room was occupied (1 for occupied, 0 for available).
#Occupied Room Indicator Calculation in the Combined Table
CASE WHEN available = FALSE THEN 1 ELSE 0 END AS occupied_room_indicator
Revenue Calculation: Calculate the revenue based on the price field, only when a room is occupied.
---------------------------------
#Revenue Calculation
price * (CASE WHEN available = FALSE THEN 1 ELSE 0 END) AS revenue
#Step 3: Joining the Tables
#Now that the data is cleaned and you have necessary calculated fields, you can join the listing_cleaned and calendar_cleaned tables to create a single table for analysis.
CREATE OR REPLACE TABLE `your_project.your_dataset.listing_calendar_combined` AS
SELECT
l.listing_id,
l.room_type,
l.host_response_time,
l.review_scores_value,
c.date,
c.available,
c.price,
-- Calculate Occupied Room Indicator
CASE WHEN c.available = FALSE THEN 1 ELSE 0 END AS occupied_room_indicator,
-- Calculate Revenue
c.price * (CASE WHEN c.available = FALSE THEN 1 ELSE 0 END) AS revenue
FROM
`your_project.your_dataset.listing_cleaned` AS l
JOIN
`your_project.your_dataset.calendar_cleaned` AS c
ON
l.listing_id = c.listing_id;
#Step 4: Additional Queries for Analysis (Optional)
#After joining the tables, you may want to run some additional queries to create aggregated data for analysis:
---------------------------
#Calculate Daily Occupancy Rate:
SELECT
date,
SUM(occupied_room_indicator) / COUNT(available) AS daily_occupancy_rate
FROM
`your_project.your_dataset.listing_calendar_combined`
GROUP BY
date
ORDER BY
date;
-----------------------------
#Calculate Average Price by Room Type:
SELECT
room_type,
AVG(price) AS average_price
FROM
`your_project.your_dataset.listing_calendar_combined`
GROUP BY
room_type
ORDER BY
average_price DESC;
-----------------------------
#Calculate Average Review Score by Host Response Time:
SELECT
host_response_time,
AVG(review_scores_value) AS average_review_score
FROM
`your_project.your_dataset.listing_calendar_combined`
GROUP BY
host_response_time
ORDER BY
average_review_score DESC;