-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cyclistic Trip Data.sql
141 lines (118 loc) · 4.34 KB
/
Cyclistic Trip Data.sql
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
--exploring and cleaning data to verify that ride ids are unique
SELECT DISTINCT COUNT(ride_id) AS total_rides, COUNT (ride_id) AS not_distinct
FROM [Cyclistic_Trip_Data].[dbo].[1_22]
-- check how many of each bike type are used
SELECT rideable_type AS bike_type, COUNT(rideable_type) AS total_by_type
FROM [Cyclistic_Trip_Data].[dbo].[1_22]
GROUP BY rideable_type
--investigate how many of each type of user there is
SELECT DISTINCT member_casual AS member_type, COUNT(member_casual) AS total_members
FROM [Cyclistic_Trip_Data].[dbo].[1_22]
GROUP BY member_casual
--union for whole year data
CREATE TABLE [Cyclistic_Trip_Data].[dbo].full_[year22]
(ride_id nvarchar(50)
,[rideable_type] nvarchar(50)
,[started_at] datetime2
,[ended_at] datetime2
,[start_station_name] nvarchar(100)
,[start_station_id] nvarchar(100)
,[end_station_name] nvarchar(100)
,[end_station_id] nvarchar(100)
,[start_lat] float
,[start_lng] float
,[end_lat] float
,[end_lng] float
,[member_casual] nvarchar(50))
INSERT INTO [Cyclistic_Trip_Data].[dbo].full_year22
SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[1_22]
UNION ALL
SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[2_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[3_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[4_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[5_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[6_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[7_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[8_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[9_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[10_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[11_22]
UNION ALL SELECT *
FROM [Cyclistic_Trip_Data].[dbo].[12_22]
SELECT *
FROM [Cyclistic_Trip_Data].[dbo].full_year22;
-- calculate duration
SELECT
ride_id
started_at,
ended_at,
DATEDIFF(minute, started_at, ended_at) AS duration_in_minutes
FROM [Cyclistic_Trip_Data].[dbo].full_year22
--query for no null stations, date and time or trip, and the duration of trip in minutes
SELECT start_station_name, end_station_name,
CAST(started_at AS date) AS start_date,
CAST(ended_at AS date) AS end_date,
DATEDIFF(minute, started_at, ended_at) AS duration_in_minutes
FROM [Cyclistic_Trip_Data].[dbo].[full_year22]
WHERE start_station_name IS NOT NULL AND end_station_name IS NOT NULL
--to exclude the less than 0 min rides
WITH clean_data AS
(
SELECT ride_id, member_casual, rideable_type, start_station_name, end_station_name,
CAST(started_at AS date) AS start_date,
CAST(ended_at AS date) AS end_date,
DATEDIFF(minute, started_at, ended_at) AS duration_in_minutes
FROM [Cyclistic_Trip_Data].[dbo].[full_year22]
WHERE start_station_name IS NOT NULL AND end_station_name IS NOT NULL
)
SELECT *
FROM clean_data
WHERE duration_in_minutes > 0
-- calculate distance traveled using Haversine formula
SELECT
[start_lat],
[start_lng],
[end_lat],
[end_lng],
2 * 6371 * ASIN(
SQRT(
POWER(
SIN(RADIANS([end_lat] - [start_lat]) / 2), 2) +
COS(RADIANS([start_lat])) *
COS(RADIANS([end_lat])) *
POWER(SIN(RADIANS([end_lng] - [start_lng]) / 2), 2)
)
) AS [distance_in_km]
FROM [Cyclistic_Trip_Data].[dbo].[full_year22]
SELECT AVG([distance_in_km]) AS [average_distance]
FROM [Cyclistic_Trip_Data].[dbo].[full_year22]
GROUP BY member_casual
--find average distance to see what differences exist between membership types
SELECT AVG([distance_in_km]) AS [average_distance]
FROM (
SELECT
2 * 6371 * ASIN(
SQRT(
POWER(
SIN(RADIANS([end_lat] - [start_lat]) / 2), 2) +
COS(RADIANS([start_lat])) *
COS(RADIANS([end_lat])) *
POWER(SIN(RADIANS([end_lng] - [start_lng]) / 2), 2)
)
) AS [distance_in_km]
FROM [Cyclistic_Trip_Data].[dbo].[full_year22]
) AS avg_distance_in_km
SELECT
member_casual,
AVG([distance_in_km]) AS [average_distance]