Skip to content

Commit

Permalink
Add routes.py with algorithms to estimate journeys
Browse files Browse the repository at this point in the history
  • Loading branch information
sva-filonchik committed Dec 1, 2023
1 parent e4f4e06 commit 37c5ff3
Showing 1 changed file with 270 additions and 0 deletions.
270 changes: 270 additions & 0 deletions mining/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
import pandas as pd
import random

class RoutesMining:
    """Estimate individual passenger journeys from per-station passenger counts.

    The input frame is expected to hold one row per station visit of a train
    run with the columns ``TrainID``, ``Station``, ``Arrival``, ``sBahnID``,
    ``Boardings`` and ``Alightings``. Rows of one ``TrainID`` must already be
    in travel order.
    """

    # Column layout shared by both journey-estimation methods.
    _JOURNEY_COLUMNS = [
        'PassengerID', 'TrainID', 'BoardingStation', 'BoardingTime',
        'AlightingStation', 'AlightingTime', 'sBahnID',
    ]
    # Column layout of the generated event log.
    _EVENT_COLUMNS = ['CaseID', 'Activity', 'Timestamp', 'sBahnID']
    # Canonical text form used to compare timestamps from different sources.
    _TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self, df):
        """Keep a reference to the raw station-visit DataFrame."""
        self.df = df

    def read_eventlog(self, fileName):
        """Read a ``;``-separated UTF-8 CSV file and return it as a DataFrame.

        No date parsing is done here; ``Arrival`` comes back as text.
        """
        return pd.read_csv(fileName, delimiter=';', encoding='utf-8')

    @staticmethod
    def _to_count(value):
        """Coerce a boardings/alightings cell to a non-negative int (missing -> 0)."""
        if pd.isna(value):
            return 0
        return max(int(value), 0)

    @staticmethod
    def _journey(pid, train_id, boarding_station, boarding_time,
                 alighting_station, alighting_time, sbahn_id):
        """Build one journey record with the shared column layout."""
        return {
            'PassengerID': pid,
            'TrainID': train_id,
            'BoardingStation': boarding_station,
            'BoardingTime': boarding_time,
            'AlightingStation': alighting_station,
            'AlightingTime': alighting_time,
            'sBahnID': sbahn_id,
        }

    @staticmethod
    def _write_csv(frame, output_path, index):
        """Write ``frame`` as ``;``-separated UTF-8 CSV unless ``output_path`` is None."""
        if output_path is not None:
            frame.to_csv(output_path, index=index, sep=';', encoding='utf-8')

    def get_next_stations(self, station_order_df, current_station, train_id):
        """Return the stations that follow ``current_station`` on ``train_id``.

        The first occurrence of ``current_station`` within the train's rows is
        used. Slicing is positional, so the result does not depend on the
        DataFrame's index labels being sorted or gap-free.

        Raises:
            IndexError: if the train never calls at ``current_station``.
        """
        on_this_train = station_order_df['TrainID'] == train_id
        stations = station_order_df.loc[on_this_train, 'Station'].tolist()
        try:
            position = stations.index(current_station)
        except ValueError:
            raise IndexError(
                f"station {current_station!r} not found for TrainID {train_id!r}"
            ) from None
        return stations[position + 1:]

    def route_mining_simple(self, df, output_path='data/passenger_paths_new3.csv'):
        """Estimate journeys by letting uniformly random passengers alight.

        At every station the boarding passengers get fresh, globally unique
        PassengerIDs and are put on the train; then as many passengers as the
        ``Alightings`` count says are picked at random from everyone on board
        (including those who just boarded). Whoever is still on board after
        the last station of a train is recorded as alighting there.

        Args:
            df: station-visit DataFrame (see class docstring).
            output_path: CSV file to write the result to (with index column);
                ``None`` skips writing.

        Returns:
            DataFrame with one row per passenger journey.
        """
        passenger_paths = []
        passenger_id = 1

        for train_id, train_group in df.groupby('TrainID'):
            # PassengerID -> (boarding station, boarding time)
            passengers_on_train = {}
            station = timestamp = sbahn_id = None

            for _, row in train_group.iterrows():
                station = row['Station']
                timestamp = row['Arrival']
                sbahn_id = row['sBahnID']

                for _ in range(self._to_count(row['Boardings'])):
                    passengers_on_train[passenger_id] = (station, timestamp)
                    passenger_id += 1

                # Never try to alight more people than are on board: the
                # counts can be inconsistent, and that must not abort the
                # remaining stations of this train.
                leaving = min(self._to_count(row['Alightings']),
                              len(passengers_on_train))
                for _ in range(leaving):
                    pid = random.choice(list(passengers_on_train))
                    boarding_station, boarding_time = passengers_on_train.pop(pid)
                    passenger_paths.append(self._journey(
                        pid, train_id, boarding_station, boarding_time,
                        station, timestamp, sbahn_id))

            # End of the line: everybody still on board gets off at the last
            # station that was processed for this train.
            for pid, (boarding_station, boarding_time) in passengers_on_train.items():
                passenger_paths.append(self._journey(
                    pid, train_id, boarding_station, boarding_time,
                    station, timestamp, sbahn_id))

        # Explicit columns keep the schema stable even with zero journeys.
        passenger_paths_df = pd.DataFrame(passenger_paths,
                                          columns=self._JOURNEY_COLUMNS)
        self._write_csv(passenger_paths_df, output_path, index=True)
        return passenger_paths_df

    def route_mining(self, df, output_path='data/passenger_paths_new.csv'):
        """Estimate journeys by drawing alighting passengers per boarding group.

        Passengers that boarded during the same station visit form a group.
        For each alighting, a group is drawn with probability proportional to
        the number of its passengers still on board, and the group's
        earliest-boarded remaining passenger alights. Weights are refreshed
        after every draw, so a group is never drawn more often than it has
        passengers. Passengers still on board after the last station are NOT
        recorded by this method.

        Args:
            df: station-visit DataFrame (see class docstring).
            output_path: CSV file to write the result to (no index column);
                ``None`` skips writing.

        Returns:
            DataFrame with one row per passenger journey.
        """
        passenger_paths = []
        passenger_id = 1

        for train_id, train_group in df.groupby('TrainID'):
            # One entry per station visit with boardings. IDs inside a group
            # are consecutive, so 'next_id' + 'remaining' is a FIFO queue.
            boarding_groups = []

            for _, row in train_group.iterrows():
                station = row['Station']
                timestamp = row['Arrival']
                sbahn_id = row['sBahnID']

                boardings = self._to_count(row['Boardings'])
                if boardings:
                    boarding_groups.append({
                        'station': station,
                        'time': timestamp,
                        'next_id': passenger_id,
                        'remaining': boardings,
                    })
                    passenger_id += boardings

                on_board = sum(group['remaining'] for group in boarding_groups)
                leaving = min(self._to_count(row['Alightings']), on_board)
                for _ in range(leaving):
                    candidates = [g for g in boarding_groups if g['remaining']]
                    weights = [g['remaining'] for g in candidates]
                    group = random.choices(candidates, weights=weights, k=1)[0]

                    pid = group['next_id']
                    group['next_id'] += 1
                    group['remaining'] -= 1
                    passenger_paths.append(self._journey(
                        pid, train_id, group['station'], group['time'],
                        station, timestamp, sbahn_id))

                # Drop exhausted groups so later stations do not rescan them.
                boarding_groups = [g for g in boarding_groups if g['remaining']]

        passenger_paths_df = pd.DataFrame(passenger_paths,
                                          columns=self._JOURNEY_COLUMNS)
        self._write_csv(passenger_paths_df, output_path, index=False)
        return passenger_paths_df

    def preprocess_data(self, df):
        """Build per-train lookup lists of stations and arrival timestamps.

        ``Arrival`` may be a datetime column or text (as returned by
        ``read_eventlog``); it is converted with ``pd.to_datetime`` first.

        Returns:
            ``(station_map, timestamp_map)``: two Series indexed by TrainID
            whose values are lists in row order; timestamps are strings in
            ``%Y-%m-%d %H:%M:%S`` format.
        """
        station_map = df.groupby('TrainID')['Station'].apply(list)
        arrival_text = pd.to_datetime(df['Arrival']).dt.strftime(self._TIMESTAMP_FORMAT)
        timestamp_map = arrival_text.groupby(df['TrainID']).apply(list)
        return station_map, timestamp_map

    def _expand_journeys(self, df, routes):
        """Turn each journey in ``routes`` into one event per station passed."""
        station_map, timestamp_map = self.preprocess_data(df)

        journeys = routes[['PassengerID', 'TrainID', 'BoardingStation',
                           'AlightingStation', 'BoardingTime', 'AlightingTime',
                           'sBahnID']].drop_duplicates()
        # Bring journey times into the same text form as the lookup lists so
        # Timestamp objects and strings compare equal.
        boarding_text = pd.to_datetime(journeys['BoardingTime']).dt.strftime(self._TIMESTAMP_FORMAT)
        alighting_text = pd.to_datetime(journeys['AlightingTime']).dt.strftime(self._TIMESTAMP_FORMAT)

        events = []
        legs = {}  # cache: identical trips share one station/timestamp slice
        for pid, train_id, boarding_station, alighting_station, boarding_time, alighting_time, sbahnid in zip(
                journeys['PassengerID'], journeys['TrainID'],
                journeys['BoardingStation'], journeys['AlightingStation'],
                boarding_text, alighting_text, journeys['sBahnID']):
            key = (train_id, boarding_station, alighting_station,
                   boarding_time, alighting_time)
            if key not in legs:
                # Match station and time together so stations and timestamps
                # can never be sliced at different offsets.
                stops = list(zip(station_map[train_id], timestamp_map[train_id]))
                start = stops.index((boarding_station, boarding_time))
                end = stops.index((alighting_station, alighting_time), start)
                legs[key] = stops[start:end + 1]

            for station, timestamp in legs[key]:
                events.append({'CaseID': pid, 'Activity': station,
                               'Timestamp': timestamp, 'sBahnID': sbahnid})
        return events

    def create_eventlog(self, df, routes, output_path='data/event_log.csv'):
        """Expand passenger journeys into a process-mining style event log.

        Every journey yields one event per station from boarding to alighting
        (both inclusive): ``CaseID`` is the PassengerID, ``Activity`` the
        station and ``Timestamp`` the train's arrival time there.

        Args:
            df: station-visit DataFrame the journeys were derived from.
            routes: journeys as returned by ``route_mining`` or
                ``route_mining_simple``.
            output_path: CSV file to write the log to (with index column);
                ``None`` skips writing.

        Returns:
            DataFrame with the columns CaseID, Activity, Timestamp, sBahnID.

        Raises:
            KeyError: if a journey refers to a TrainID missing from ``df``.
            ValueError: if a boarding/alighting stop is not on the train's run.
        """
        events = [] if routes.empty else self._expand_journeys(df, routes)

        event_log = pd.DataFrame(events, columns=self._EVENT_COLUMNS)
        event_log['Timestamp'] = pd.to_datetime(event_log['Timestamp'])
        self._write_csv(event_log, output_path, index=True)

        # Print the first few rows for verification
        print(event_log.head())

        return event_log


0 comments on commit 37c5ff3

Please sign in to comment.