-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUber_Case_Study.R
212 lines (138 loc) · 8.62 KB
/
Uber_Case_Study.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
############################################ DATA PREPARATION ######################
#### 1) Make a grouped bar chart depicting the hour-wise trip request made at city and airport respectively. Aggregated the data for all 5 days on the same axis of 24 hours.
uber_request_data <- read.csv("Uber request data.csv", stringsAsFactors = F)
PickUp_Point <- uber_request_data$Pickup.point
Hours <- substr(as.factor(uber_request_data$Request.time), 1, 2)
ggplot(uber_request_data, aes(x = Hours, fill = PickUp_Point)) + geom_bar(position = "dodge")
#### 2) 2. In the bar chart above (in pt.1), see 5 major time blocks based on the frequency of requests made at the city and airport. We now divide the request-time into 5 time-slots described below. An additional column “Time_Slot” is added which takes these 5 categorical values depending on the request time
for(i in 1:nrow(uber_request_data)){
if(uber_request_data$Request.time[i] >= '00:00:00' & uber_request_data$Request.time[i] < '04:00:00'){
uber_request_data$Time.Slot[i] = "Pre_Morning"
}
if(uber_request_data$Request.time[i] >= '04:00:00' & uber_request_data$Request.time[i] < '10:00:00'){
uber_request_data$Time.Slot[i] = "Morning_Rush"
}
if(uber_request_data$Request.time[i] >= '10:00:00' & uber_request_data$Request.time[i] < '17:00:00'){
uber_request_data$Time.Slot[i] = "Day_Time"
}
if(uber_request_data$Request.time[i] >= '17:00:00' & uber_request_data$Request.time[i] < '22:00:00'){
uber_request_data$Time.Slot[i] = "Evening_Rush"
}
if(uber_request_data$Request.time[i] >= '22:00:00' ){
uber_request_data$Time.Slot[i] = "Late_Night"
}
}
ggplot(uber_request_data, aes(x = Time.Slot)) + geom_bar()
# plot shows high level of demand in the Evening and Morning rush hours
summary(factor(uber_request_data$Time.Slot))
######################################## PROBLEM IDENTIFICATION #################
#### 1) Stacked bar chart where each bar represents a time slot and the y-axis shows the frequency of requests. Different proportions of bars should represent the completed, cancelled and no cars available out of the total customer requests. (Hint: ggplot)
ggplot(uber_request_data, aes(x = Time.Slot, fill = Status)) + geom_bar()
#### 2) Visually identify the 2 most pressing problems for Uber, out of the 15 possible scenarios (5 slots * 3 trip status).
# Problem 1) "Cancelled cabs" in the Morning Rush time
# Problem 2) "No cars available" in Evening Rush time
########################### PROBLEM 1 #######################################
# "Cancelled cabs" in the Morning Rush time
#### For the time slot when problem 1 exists, plot a stacked bar chart to find out if the problem is more severe for pick-up requests made at the airport or the city.
MorningRush_data <- data.frame(subset(uber_request_data, uber_request_data$Time.Slot == "Morning_Rush"))
problem1_data <- data.frame(subset(uber_request_data, uber_request_data$Time.Slot == "Morning_Rush" & uber_request_data$Status == 'Cancelled'))
Hours_MorningRush <- substr(as.factor(problem1_data$Request.time), 1 ,2)
ggplot(problem1_data, aes(x = Hours_MorningRush, fill = Pickup.point)) + geom_bar()
# Plot shows that the problem is more severe in City pickups
length(which(uber_request_data$Time.Slot=='Morning_Rush'))
length(problem1_data$Status)
length(which(problem1_data$Pickup.point=='Airport'))
length(which(problem1_data$Pickup.point=='City'))
# In Morning Rush time slot, this issue exists 935 times out of a total of 2394 requests made. That's about 39% requests in the morning rush resulting in Cancelled cabs.
# Out of these 935 cancelled requests, 27 were made at airport and 908 in the city
#### Also find the percentage breakup for the total number of issues in this time slot based on the pick-up point?
airport_morningRush <- subset(MorningRush_data, MorningRush_data$Pickup.point == 'Airport')
city_morningRush <- subset(MorningRush_data, MorningRush_data$Pickup.point == 'City')
f1 <- function(x) {
(length(x)/nrow(airport_morningRush))*100
}
airport_aggregate_p1 <- aggregate(airport_morningRush$Pickup.point, by = list(airport_morningRush$Status), FUN = f1)
names(airport_aggregate_p1) <- c("Status", "Airport")
f2 <- function(x) {
(length(x)/nrow(city_morningRush))*100
}
city_aggregate_p1 <- aggregate(city_morningRush$Pickup.point, by = list(city_morningRush$Status), FUN = f2)
names(city_aggregate_p1) <- c("Status", "City")
# Percentage breakup of Status in Morning Rush based on pick-up points:
# Status Airport City
# Cancelled: 5.314961 % 48.14422 %
# No Cars Available: 9.448819 % 23.48887 %
# Total 14.762 % 71.632 %
#### SUPPLY-DEMAND ANALYSIS
# For Morning Rush data
nrow(MorningRush_data)
# Demand = total requests made (including completed, cancelled and no cars available cases) = 2394
length(which(MorningRush_data$Status == 'Trip Completed'))
# Supply = completed requests = 968
# For airport
length(which(MorningRush_data$Pickup.point == 'Airport'))
# demand = 508
length(which(MorningRush_data$Status == 'Trip Completed' & MorningRush_data$Pickup.point == 'Airport'))
# supply = 433
# For City
length(which(MorningRush_data$Pickup.point == 'City'))
# demand = 1886
length(which(MorningRush_data$Status == 'Trip Completed' & MorningRush_data$Pickup.point == 'City'))
# supply = 535
########################### PROBLEM 2 #######################################
# "No cars available" in Evening Rush time
#### For the time slot when problem 2 exists (Evening Rush), plotted a stacked bar chart to find out if the problem is more severe for pick-up requests made at the airport or the city.
EveningRush_data <- data.frame(subset(uber_request_data, uber_request_data$Time.Slot == "Evening_Rush"))
problem2_data <- data.frame(subset(uber_request_data, uber_request_data$Time.Slot == "Evening_Rush" & uber_request_data$Status == 'No Cars Available'))
Hours_EveningRush <- substr(as.factor(problem2_data$Request.time), 1 ,2)
ggplot(problem2_data, aes(x = Hours_EveningRush, fill = Pickup.point)) + geom_bar()
# Plot shows that the problem is more severe in Airport pickups
length(which(uber_request_data$Time.Slot=='Evening_Rush'))
length(problem2_data$Status)
length(which(problem2_data$Pickup.point=='Airport'))
length(which(problem2_data$Pickup.point=='City'))
# In Evening Rush time slot, this issue exists in 1438 cases out of a total of 2416 requests made. That's about 59.5% requests in the evening rush resulting in No cars available.
# Out of these 1438 cases, 1372 were made at airport and 66 in the city
#### Also find the percentage breakup for the total number of issues in this time slot based on the pick-up point?
airport_eveningRush <- subset(EveningRush_data, EveningRush_data$Pickup.point == 'Airport')
city_eveningRush <- subset(EveningRush_data, EveningRush_data$Pickup.point == 'City')
f1 <- function(x) {
(length(x)/nrow(airport_eveningRush))*100
}
airport_aggregate_p2 <- aggregate(airport_eveningRush$Pickup.point, by = list(airport_eveningRush$Status), FUN = f1)
names(airport_aggregate_p2) <- c("Status", "Airport")
f2 <- function(x) {
(length(x)/nrow(city_eveningRush))*100
}
city_aggregate_p2 <- aggregate(city_eveningRush$Pickup.point, by = list(city_eveningRush$Status), FUN = f2)
names(city_aggregate_p2) <- c("Status", "City")
# Percentage breakup of issues in Evening Rush based on pick-up points:
# Status Airport City
# Cancelled: 5.734191 % 10.90909 %
# No Cars Available: 73.526259 % 12.00000 %
# Total: 79.26 % 22.90 %
#### SUPPLY-DEMAND ANALYSIS
# For Evening Rush data
nrow(EveningRush_data)
# Demand = total requests made (including completed, cancelled and no cars available cases) 2416
length(which(EveningRush_data$Status == 'Trip Completed'))
# Supply = completed requests 811
# For airport
length(which(EveningRush_data$Pickup.point == 'Airport'))
# demand = 1866
length(which(EveningRush_data$Status == 'Trip Completed' & EveningRush_data$Pickup.point == 'Airport'))
# supply = 387
# For City
length(which(EveningRush_data$Pickup.point == 'City'))
# demand = 550
length(which(EveningRush_data$Status == 'Trip Completed' & EveningRush_data$Pickup.point == 'City'))
# supply = 424
# Uber data (Overall)
# Demand = total requests made (including completed, cancelled and no cars available cases) 6766
# Supply = completed requests 2852
# For airport (Overall)
# demand = 3251
# supply = 1340
# For City (Overall)
# demand = 3515
# supply = 1512