-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathapp.py
366 lines (300 loc) · 17.8 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# Import Library
from google.cloud import aiplatform
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import requests
import uuid
import os
import folium
from streamlit_folium import folium_static
from dotenv import load_dotenv
from datetime import datetime
import warnings
# Suppress every Python warning raised after this point.
warnings.filterwarnings('ignore')
# Load environment variables from .env file
load_dotenv()
# Retrieve the API key from the environment variable.
# os.getenv returns None when API_KEY is unset; the value is later sent as the
# key for the Google Places and Distance Matrix requests.
api_key = os.getenv('API_KEY')
# Set the path to the JSON file relative to the current working directory
# (so the app must be launched from the directory that holds credentials.json).
current_directory = os.getcwd()
credentials_path = os.path.join(current_directory, 'credentials.json')
# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable
# NOTE(review): presumably read by the google-cloud client when the Endpoint
# below is created — confirm against the library's auth documentation.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
# Vertex AI identifiers used to build the prediction endpoint.
project_id = "ctelkom"
location = "us-west1"
# model_id is not referenced anywhere else in the visible code.
model_id = "6405552433283465216"
endpoint_id = "3363546806855139328"
# endpoint
# Module-level handle shared by the prediction page; created once at import.
endpoint = aiplatform.Endpoint(endpoint_name=endpoint_id,
project=project_id,
location=location)
def main():
    """Render the sidebar title and the "About" section of the app.

    The Markdown links are written as ``[text](url)``.  The previous version
    wrapped every URL in single quotes, which made the quotes part of the
    link target and produced broken links.
    """
    # NOTE(review): indentation was lost in SOURCE; the definitions that
    # follow this function may originally have been nested inside it.
    st.sidebar.title('Taxi Fare Prediction App Demo')
    st.sidebar.markdown('''
## About
This tool, developed with [Vertex AI AutoML](https://cloud.google.com/vertex-ai/docs),
enables the estimation of taxi fares in the Chicago, Illinois, USA region.
Geolocation is acquired through the [Google Places API](https://developers.google.com/maps/documentation/places/web-service),
while the [Google Distance Matrix API](https://developers.google.com/maps/documentation/distance-matrix) is utilized to gather information on distance and trip duration.
''')
def app():
    """Render the fare-prediction page.

    Flow: collect the trip start date/time and the pickup/dropoff place
    names, geocode both names with the Google Places text search, fetch the
    driving distance and duration from the Google Distance Matrix API, show
    the trip on a folium map with an order summary, and finally request a
    fare estimate from the module-level Vertex AI ``endpoint``.

    Uses the module-level ``api_key`` and ``endpoint``.  Every failure
    (missing input, place not found, HTTP error, unusable distance response)
    is reported through Streamlit and ends the page early, so later steps
    never run with undefined values.
    """
    import html  # local import: only needed to escape user text in map popups

    st.header('Chicago Taxi Fare Prediction App 🚕')
    # uuid4().hex is the 32-character, dash-free form of a random UUID.
    taxi_id = uuid.uuid4().hex

    # Datetime input
    date_col, time_col = st.columns(2)
    with date_col:
        date_input = st.date_input('Select a date: ')
    with time_col:
        time_input = st.time_input('Select time: ')
    combined_datetime = datetime.combine(date_input, time_input)
    st.write('---')

    # Without a timeout a stalled HTTP call would block the page indefinitely.
    request_timeout = 10  # seconds
    suffix = ",+Chicago,+IL,+USA"
    area_help = '[Community areas in Chicago](https://en.wikipedia.org/wiki/Community_areas_in_Chicago)'

    def places(api_key, place):
        """Geocode ``place`` with the Google Places text search.

        Returns a DataFrame built from the ``places`` list of the response
        (at most one row is requested).  The frame is empty when the
        response carries no ``places`` entry.  Raises
        ``requests.exceptions.RequestException`` on transport or HTTP errors.
        """
        headers = {
            'Content-Type': 'application/json',
            'X-Goog-Api-Key': api_key,
            'X-Goog-FieldMask': 'places.location',
        }
        payload = {
            'textQuery': place,
            'maxResultCount': 1,
        }
        url = 'https://places.googleapis.com/v1/places:searchText'
        response = requests.post(url, data=json.dumps(payload),
                                 headers=headers, timeout=request_timeout)
        response.raise_for_status()
        result = response.json()
        # A search without matches has no 'places' key; normalise to no rows.
        return pd.json_normalize(result.get('places', []))

    def get_distance_matrix(Origin, Destination, api_key):
        """Query the Distance Matrix API for one origin/destination pair.

        Returns the decoded JSON response, or ``None`` after writing the
        error to the page when the request fails.
        """
        base_url = "https://maps.googleapis.com/maps/api/distancematrix/json"
        # Use the arguments themselves; the previous code read same-named
        # variables from the enclosing scope and ignored its parameters.
        params = {
            'origins': Origin,
            'destinations': Destination,
            'key': api_key
        }
        try:
            response = requests.get(base_url, params=params,
                                    timeout=request_timeout)
            response.raise_for_status()  # Raise an HTTPError for bad responses
            return response.json()
        except requests.exceptions.HTTPError as errh:
            st.write(f"HTTP Error: {errh}")
        except requests.exceptions.ConnectionError as errc:
            st.write(f"Error Connecting: {errc}")
        except requests.exceptions.Timeout as errt:
            st.write(f"Timeout Error: {errt}")
        except requests.exceptions.RequestException as err:
            st.write(f"Request Error: {err}")
        return None

    def ask_location(column, label, empty_warning):
        """Show a text input in ``column`` and geocode what the user typed.

        Returns ``(name, query, latitude, longitude)`` or ``None`` when the
        field is empty or the place could not be resolved (a message is
        shown in the column in both cases).
        """
        with column:
            name = st.text_input(label, help=area_help)
            if name == '':
                st.warning(empty_warning)
                return None
            try:
                df_place = places(api_key, name)
            except requests.exceptions.RequestException as err:
                st.error(f"Could not look up '{name}': {err}")
                return None
            needed = {'location.latitude', 'location.longitude'}
            if df_place.empty or not needed <= set(df_place.columns):
                st.error(f"No location found for '{name}'")
                return None
            return (
                name,
                name.replace(' ', '+') + suffix,
                df_place['location.latitude'].iloc[0],
                df_place['location.longitude'].iloc[0],
            )

    # Origin and Destination: render both inputs before deciding to stop so
    # the user always sees both fields.
    pickup_col, dropoff_col = st.columns(2)
    pickup = ask_location(pickup_col, "Pickup Location: ",
                          "Please input your pickup location")
    dropoff = ask_location(dropoff_col, "Dropoff Location: ",
                           "Please input your dropoff location")
    if pickup is None or dropoff is None:
        return
    Origin, origins, pickup_latitude, pickup_longitude = pickup
    Destination, destinations, dropoff_latitude, dropoff_longitude = dropoff

    # Coverage area = bounding box of the pickup coordinates in the dataset.
    df = pd.read_csv('taxi-fare.csv')
    min_latitude = df['pickup_latitude'].min()
    max_latitude = df['pickup_latitude'].max()
    min_longitude = df['pickup_longitude'].min()
    max_longitude = df['pickup_longitude'].max()
    initial_location = [
        (min_latitude + max_latitude) / 2,
        (min_longitude + max_longitude) / 2,
    ]

    # Trip distance and duration
    result = get_distance_matrix(origins, destinations, api_key)
    element = None
    if result and result.get('status') == 'OK':
        try:
            candidate = result['rows'][0]['elements'][0]
        except (KeyError, IndexError):
            candidate = None
        # The request can succeed overall while the single element carries
        # no route data; only accept it when both fields are present.
        if candidate and 'distance' in candidate and 'duration' in candidate:
            element = candidate
    if element is None:
        st.write("Failed to retrieve distance matrix.")
        return

    distance_text = element['distance']['text']
    distance_value = element['distance']['value']
    duration_text = element['duration']['text']
    duration_value = element['duration']['value']
    st.write(f"Distance: {distance_text} ({distance_value} meters) & Duration: {duration_text} ({duration_value} seconds)")

    # trip in seconds
    trip_seconds = duration_value
    # convert from meters to miles
    trip_miles = distance_value / 1609.344

    # Check if the pickup point is outside the coverage area
    in_coverage = (
        min_latitude <= pickup_latitude <= max_latitude
        and min_longitude <= pickup_longitude <= max_longitude
    )
    if in_coverage:
        st.write("Pickup location is inside our coverage area")
        st.write('---')
        st.write("# Maps 🗺️")
        mymap = folium.Map(location=initial_location, zoom_start=10, control_scale=True)
        # The IFrame body is HTML, so the user-typed names are escaped.
        iframe_origin = folium.IFrame(f'Pickup Location: {html.escape(Origin)}', width=250, height=30)
        popup_origin = folium.Popup(iframe_origin, max_width=250)
        folium.Marker(
            location=[pickup_latitude, pickup_longitude],
            popup=popup_origin,
            icon=folium.Icon(color='orange', icon='home'),
        ).add_to(mymap)
        iframe_destination = folium.IFrame(f'Dropoff Location: {html.escape(Destination)}', width=250, height=30)
        popup_destination = folium.Popup(iframe_destination, max_width=250)
        folium.Marker(
            location=[dropoff_latitude, dropoff_longitude],
            popup=popup_destination,
            icon=folium.Icon(color='green', icon='flag'),
        ).add_to(mymap)
        folium_static(mymap)
        st.write('---')
        st.write(" ## Order Summary:")
        st.write(f" Taxi ID: {taxi_id}")
        st.write(f" Trip start time: {combined_datetime}")
        st.write(f" Pickup Location: {Origin}")
        st.write(f" Dropoff Location: {Destination}")
        st.write(f" Trip distance: {distance_text}")
        st.write(f" Trip duration: {duration_text}")
    else:
        st.write("Pickup location is outside our coverage area")

    # Predict using Endpoint
    # NOTE(review): SOURCE indentation was lost; the prediction is assumed to
    # run for out-of-coverage pickups as well — confirm the intended nesting.
    df_prediction = pd.DataFrame({
        'pickup_latitude': [pickup_latitude],
        'pickup_longitude': [pickup_longitude],
        'dropoff_latitude': [dropoff_latitude],
        'dropoff_longitude': [dropoff_longitude],
        'trip_seconds': [trip_seconds],
        'trip_miles': [trip_miles],
    })
    # trip_seconds is sent as a string, exactly as the original request did.
    df_prediction['trip_seconds'] = df_prediction['trip_seconds'].astype(str)
    # Prepare instances
    instances = df_prediction.to_dict(orient='records')
    # Send prediction request
    response = endpoint.predict(instances=instances)
    # Create a new DataFrame with predictions
    predictions_df = pd.DataFrame(response.predictions)
    st.success(f"## Total cost of the trip: $ {round(predictions_df['value'].iloc[0],2)}")
def eda():
    """Render the exploratory-data-analysis page for ``taxi-fare.csv``.

    Loads the dataset, derives ``trip_hours`` and ``trip_speed``, and shows
    one of four views chosen from a select box: the raw dataset with a
    description of its fields, per-column histograms/boxplots with IQR
    outlier counts, boxplots of trip duration and speed, or a pair-plot of
    the main numeric variables.

    Every matplotlib figure created here is closed after it is rendered so
    that figures do not accumulate across Streamlit reruns.
    """
    st.header('Exploratory Data Analysis')
    df = pd.read_csv('taxi-fare.csv')
    # Drop the saved index column when present; tolerate files without it.
    df = df.drop(columns='Unnamed: 0', errors='ignore')
    df["trip_hours"] = (df["trip_seconds"] / 3600).round(2)
    df["trip_speed"] = (df["trip_miles"] / df["trip_hours"]).round(2)

    selection = st.selectbox('Select Here: ',['Dataset',"Histograms and Boxplots","Trip Duration & Trip Speed","Relationship Between Variable"])

    if selection == 'Dataset':
        st.write(" ## Dataset")
        st.write(' Dataset obtained from Google Cloud Platform - BigQuery database : `chicago_taxi_trips`, table: `taxi_trips`.')
        st.dataframe(df)
        st.write(''' The chosen dataset consists of the following fields:
- `taxi_id` : A unique identifier for the taxi.
- `trip_start_timestamp`: When the trip started, rounded to the nearest 15 minutes.
- `trip_seconds`: Time of the trip in seconds.
- `trip_miles`: Distance of the trip in miles.
- `pickup_latitude`: The latitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy.
- `pickup_longitude`: The longitude of the center of the pickup census tract or the community area if the census tract has been hidden for privacy.
- `dropoff_latitude`: The latitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy.
- `dropoff_longitude`: The longitude of the center of the dropoff census tract or the community area if the census tract has been hidden for privacy.
- `trip_total`: Total cost of the trip, the total of the fare, tips, tolls, and extras.
''')
        st.write('')
        st.write('Numerical Distributions of the fields')
        # Written explicitly instead of relying on a bare expression being
        # picked up by Streamlit's "magic" rendering.
        st.write(df.describe().T)

    elif selection == "Histograms and Boxplots":
        st.write("# Histograms and Boxplots")
        target = "trip_total"
        num_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude', "trip_seconds", "trip_miles"]
        for column in num_cols + [target]:
            values = df[column]
            fig, axes = plt.subplots(1, 2, figsize=(12, 4))
            values.plot(kind="hist", bins=100, ax=axes[0])
            axes[0].set_title(str(column) + " -Histogram")
            values.plot(kind="box", ax=axes[1])
            axes[1].set_title(str(column) + " -Boxplot")
            st.pyplot(fig)
            plt.close(fig)  # release the figure once it has been rendered

            # Calculate IQR
            Q1 = values.quantile(0.25)
            Q3 = values.quantile(0.75)
            IQR = Q3 - Q1
            # Count values outside the 1.5 * IQR fences
            lower_bound = Q1 - 1.5 * IQR
            upper_bound = Q3 + 1.5 * IQR
            num_outliers = int(((values < lower_bound) | (values > upper_bound)).sum())
            total_values = len(values)
            # Guard the ratio so an empty dataset cannot divide by zero.
            percentage_outliers = (num_outliers / total_values) * 100 if total_values else 0.0
            st.write("Q1: ",Q1)
            st.write("Q3: ",Q3)
            st.write(f"Number of outliers in {column}: {num_outliers}")
            st.write(f"Percentage of outliers in {column}: {percentage_outliers:.2f}%")
            st.write("---")

    elif selection == "Trip Duration & Trip Speed":
        st.write("# Trip Duration and Trip Speed")
        st.write("The field `trip_seconds` describes the time taken for the trip in seconds. For ease of our analysis, let us convert it into hours.")
        fig, ax = plt.subplots()
        df["trip_hours"].plot(kind="box", ax=ax)
        ax.set_title("Trip Hours Boxplot")
        st.pyplot(fig)
        plt.close(fig)
        st.write('---')
        st.write("trip_speed can be added by dividing `trip_miles` and `trip_hours` to understand the speed of the trip in miles/hour")
        fig, ax = plt.subplots()
        df["trip_speed"].plot(kind="box", ax=ax)
        ax.set_title("Trip Speed Boxplot")
        st.pyplot(fig)
        plt.close(fig)
        st.write("From the box plots and the histograms visualized so far, it is evident that there are some outliers causing skewness in the data. The presence of outliers can have an impact on the model's performance and accuracy.")

    else:
        st.title('Relationship between variable')
        st.write('To better understand the relationship between the variables, a pair-plot can be plotted.')
        try:
            # Pair-plot a sample of up to 10,000 rows; sampling more rows
            # than the dataset holds would raise, so the size is clamped.
            pair_columns = ["trip_seconds", "trip_miles", "trip_total", "trip_speed"]
            sample_size = min(10000, len(df))
            pairplot = sns.pairplot(data=df[pair_columns].sample(sample_size))
            # Display the pairplot in Streamlit
            st.pyplot(pairplot)
        except Exception as e:
            # Page-level boundary: show the problem instead of crashing the app.
            st.error(f"An error occurred: {e}")
        st.write('You can see some linear relationships between the independent variables considered in the pair-plot. For example, `trip_miles` and the dependant variable `trip_total`')
def model():
    """Render the model-evaluation page.

    Shows the evaluation-metrics image with an explanation of each metric,
    followed by the feature-importance image and a short description of
    feature attribution.
    """
    zoom_hint = 'To zoom in click view full screen buton'
    metric_notes = '''
- `MAE` = 2.709, Mean absolute error (MAE) is the average of absolute differences between observed and predicted values. A low value indicates a higher-quality model, where 0 means the model made no errors. Interpreting MAE depends on the range of values in the series. MAE has the same unit as the target column.
- `MAPE` = 14.598%, The mean absolute percentage error (MAPE) is the average of absolute percentage errors. MAPE ranges from 0% to 100%, where a lower value indicates a higher quality model. MAPE becomes infinity if 0 values are present in the ground truth data.
- `RMSE` = 5.542, Root mean square error (RMSE) is the root of squared differences between observed and predicted values. A lower value indicates a higher quality model, where 0 means the model made no errors. Interpreting RMSE depends on the range of values in the series. RMSE is more responsive to large errors than MAE.
- `RMSLE` = 0.198, Root mean squared log error (RMSLE) is the root of squared averages of log differences between observed and predicted values. Interpreting RMSLE depends on the range of values in the series. RMSLE is less responsive to outliers than RMSE, and it tends to penalise underestimations slightly more than overestimations.
- `r^2` = 90.6%, R squared (R^2) is the square of the Pearson correlation coefficient between the observed and predicted values. This ranges from 0 to 1, where a higher value indicates a higher-quality model.
Based on the evaluation metrics above, it can be said that this model is good enough to predict taxi fares.
'''
    attribution_notes = 'Model feature attribution tells you how important each feature is when making a prediction. Attribution values are expressed as a percentage; the higher the percentage, the more strongly that feature impacts a prediction on average. Model feature attribution is expressed using the sampled Shapley method.'

    # Metrics section
    st.title("Model Evaluation")
    st.write("## Metric")
    st.image('model_evaluation.png', caption=zoom_hint)
    st.write(metric_notes)
    st.write('---')

    # Feature-importance section
    st.write("## Feature Importance")
    st.image('feature_importance.png', caption=zoom_hint)
    st.write(attribution_notes)
# Sidebar page selector: look the chosen label up in a table of page
# renderers; any label without an entry falls through to the model page.
# NOTE(review): indentation was lost in SOURCE; these statements presumably
# run inside main() after the sidebar header — confirm when re-indenting.
selected_option = st.sidebar.radio(
    "Option: ",
    ["Application 🚕", "EDA 📊", "Model Evaluation"],
)
page_renderers = {
    "Application 🚕": app,
    'EDA 📊': eda,
}
page_renderers.get(selected_option, model)()
# Author credit shown below the page selector.
st.sidebar.markdown('''
## Created by:
Ahmad Luay Adnani - [GitHub](https://github.com/ahmadluay9)
''')
# Script entry point: start the app only when this file is run directly.
if __name__ == "__main__":
    main()