Skip to content

Commit 942e3e6

Browse files
fix(MergeLineContext): Partially remove unused service ids
1 parent 6ea5968 commit 942e3e6

File tree

12 files changed

+195
-49
lines changed

12 files changed

+195
-49
lines changed

src/main/java/com/conveyal/datatools/manager/jobs/MergeFeedsJob.java

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import com.fasterxml.jackson.annotation.JsonIgnore;
2424
import com.fasterxml.jackson.databind.ObjectMapper;
2525
import com.google.common.collect.Lists;
26+
import com.google.common.collect.Sets;
2627
import org.bson.codecs.pojo.annotations.BsonIgnore;
2728
import org.slf4j.Logger;
2829
import org.slf4j.LoggerFactory;
@@ -87,6 +88,10 @@ public class MergeFeedsJob extends FeedSourceJob {
8788
public Set<String> serviceIdsToCloneRenameAndExtend = new HashSet<>();
8889
@JsonIgnore @BsonIgnore
8990
public Set<String> serviceIdsFromActiveFeedToTerminateEarly = new HashSet<>();
91+
@JsonIgnore @BsonIgnore
92+
public Set<String> serviceIdsFromActiveFeedToRemove = new HashSet<>();
93+
@JsonIgnore @BsonIgnore
94+
public Set<String> serviceIdsFromFutureFeedToRemove = new HashSet<>();
9095

9196
private List<TripAndCalendars> sharedConsistentTripAndCalendarIds = new ArrayList<>();
9297

@@ -441,17 +446,40 @@ private void determineMergeStrategy() {
441446
getActiveServiceIds(feedMergeContext.getActiveTripIdsNotInFutureFeed())
442447
);
443448

449+
// Build the set of calendars ids from the future feed to be removed
450+
// because they become no longer used after shared trips are remapped to another service id.
451+
serviceIdsFromFutureFeedToRemove = Sets.difference(
452+
feedMergeContext.future.feedToMerge.serviceIds,
453+
getFutureServiceIds(feedMergeContext.getFutureTripIdsNotInActiveFeed())
454+
);
455+
456+
// Build the set of calendars ids from the active feed to be removed
457+
// because they become no longer used after shared trips are remapped to another service id.
458+
serviceIdsFromActiveFeedToRemove = Sets.difference(
459+
feedMergeContext.active.feedToMerge.serviceIds,
460+
getActiveServiceIds(feedMergeContext.getActiveTripIdsNotInFutureFeed())
461+
);
462+
444463
mergeFeedsResult.mergeStrategy = CHECK_STOP_TIMES;
445464
}
446465
}
447466

448467
/**
449468
* Obtains the service ids corresponding to the provided trip ids.
450469
*/
451-
private List<String> getActiveServiceIds(Set<String> tripIds) {
470+
private Set<String> getActiveServiceIds(Set<String> tripIds) {
452471
return tripIds.stream()
453472
.map(tripId -> feedMergeContext.active.feed.trips.get(tripId).service_id)
454-
.collect(Collectors.toList());
473+
.collect(Collectors.toSet());
474+
}
475+
476+
/**
477+
* Obtains the service ids corresponding to the provided trip ids.
478+
*/
479+
private Set<String> getFutureServiceIds(Set<String> tripIds) {
480+
return tripIds.stream()
481+
.map(tripId -> feedMergeContext.future.feed.trips.get(tripId).service_id)
482+
.collect(Collectors.toSet());
455483
}
456484

457485
/**

src/main/java/com/conveyal/datatools/manager/jobs/feedmerge/CalendarMergeLineContext.java

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,27 @@ private boolean checkCalendarIds(Set<NewGTFSError> idErrors, FieldContext fieldC
8686
.format(GTFS_DATE_FORMATTER));
8787
}
8888
}
89-
/* } else {
89+
90+
// Remove calendar entries that are no longer used.
91+
if (job.serviceIdsFromActiveFeedToRemove.contains(keyValue)) {
92+
LOG.warn(
93+
"Skipping active calendar entry {} because it will become unused in the merged feed.",
94+
keyValue);
95+
mergeFeedsResult.skippedIds.add(key);
96+
shouldSkipRecord = true;
97+
}
98+
} else {
9099
// If handling the future feed, the MTC revised feed merge logic is as follows:
91100
// - Calendar entries from the future feed will be inserted as is in the merged feed.
92-
// so no additional processing needed here, unless the calendar entry is no longer used.
101+
// so no additional processing needed here, unless the calendar entry is no longer used,
102+
// in that case we drop the calendar entry.
93103
if (job.serviceIdsFromFutureFeedToRemove.contains(keyValue)) {
94104
LOG.warn(
95105
"Skipping future calendar entry {} because it will become unused in the merged feed.",
96106
keyValue);
97107
mergeFeedsResult.skippedIds.add(key);
98108
shouldSkipRecord = true;
99109
}
100-
101-
*/
102110
}
103111

104112

@@ -137,7 +145,7 @@ public void addClonedServiceId() throws IOException {
137145
String originalServiceId = keyValue;
138146
if (job.serviceIdsToCloneRenameAndExtend.contains(originalServiceId)) {
139147
// FIXME: Do we need to worry about calendar_dates?
140-
String[] clonedValues = getRowValues().clone();
148+
String[] clonedValues = getOriginalRowValues().clone();
141149
String newServiceId = clonedValues[keyFieldIndex] = String.join(":", getIdScope(), originalServiceId);
142150
// Modify start date only (preserve the end date from the future calendar entry).
143151
int startDateIndex = Table.CALENDAR.getFieldIndex("start_date");

src/main/java/com/conveyal/datatools/manager/jobs/feedmerge/MergeLineContext.java

Lines changed: 51 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class MergeLineContext {
5252
private CsvReader csvReader;
5353
private boolean skipRecord;
5454
protected boolean keyFieldMissing;
55+
private String[] originalRowValues;
5556
private String[] rowValues;
5657
private int lineNumber = 0;
5758
protected final Table table;
@@ -198,6 +199,7 @@ public boolean iterateOverRows() throws IOException {
198199
if (!constructRowValues()) {
199200
return false;
200201
}
202+
201203
finishRowAndWriteToZip();
202204
}
203205
return true;
@@ -491,6 +493,7 @@ public void initializeRowValues() {
491493
skipRecord = false;
492494
// Reset the row values (this must happen after the first line is checked).
493495
rowValues = new String[sharedSpecFields.size()];
496+
originalRowValues = new String[sharedSpecFields.size()];
494497
}
495498

496499
public void writeValuesToTable(String[] values, boolean incrementLineNumbers) throws IOException {
@@ -522,6 +525,7 @@ public void writeHeaders() throws IOException {
522525
* @return false, if a failing condition was encountered. true, if everything was ok.
523526
*/
524527
public boolean constructRowValues() throws IOException {
528+
boolean result = true;
525529
// Piece together the row to write, which should look practically identical to the original
526530
// row except for the identifiers receiving a prefix to avoid ID conflicts.
527531
for (int specFieldIndex = 0; specFieldIndex < sharedSpecFields.size(); specFieldIndex++) {
@@ -533,65 +537,73 @@ public boolean constructRowValues() throws IOException {
533537
field,
534538
csvReader.get(fieldsFoundList.indexOf(field))
535539
);
536-
// Handle filling in agency_id if missing when merging regional feeds. If false is returned,
537-
// the job has encountered a failing condition (the method handles failing the job itself).
538-
if (!updateAgencyIdIfNeeded(fieldContext)) {
539-
return false;
540-
}
541-
// Determine if field is a GTFS identifier (and scope if needed).
542-
scopeValueIfNeeded(fieldContext);
543-
// Only need to check for merge conflicts if using MTC merge type because
544-
// the regional merge type scopes all identifiers by default. Also, the
545-
// reference tracker will get far too large if we attempt to use it to
546-
// track references for a large number of feeds (e.g., every feed in New
547-
// York State).
548-
if (job.mergeType.equals(SERVICE_PERIOD)) {
549-
// Remap service id from active feed to distinguish them
550-
// from entries with the same id in the future feed.
551-
// See https://github.com/ibi-group/datatools-server/issues/244
552-
if (handlingActiveFeed && fieldContext.nameEquals(SERVICE_ID)) {
553-
updateAndRemapOutput(fieldContext);
540+
originalRowValues[specFieldIndex] = fieldContext.getValueToWrite();
541+
if (!skipRecord) {
542+
// Handle filling in agency_id if missing when merging regional feeds. If false is returned,
543+
// the job has encountered a failing condition (the method handles failing the job itself).
544+
if (!updateAgencyIdIfNeeded(fieldContext)) {
545+
result = false;
554546
}
547+
// Determine if field is a GTFS identifier (and scope if needed).
548+
scopeValueIfNeeded(fieldContext);
549+
// Only need to check for merge conflicts if using MTC merge type because
550+
// the regional merge type scopes all identifiers by default. Also, the
551+
// reference tracker will get far too large if we attempt to use it to
552+
// track references for a large number of feeds (e.g., every feed in New
553+
// York State).
554+
if (job.mergeType.equals(SERVICE_PERIOD)) {
555+
// Remap service id from active feed to distinguish them
556+
// from entries with the same id in the future feed.
557+
// See https://github.com/ibi-group/datatools-server/issues/244
558+
if (handlingActiveFeed && fieldContext.nameEquals(SERVICE_ID)) {
559+
updateAndRemapOutput(fieldContext);
560+
}
555561

556-
updateServiceIdsIfNeeded(fieldContext);
562+
updateServiceIdsIfNeeded(fieldContext);
557563

558-
// Store values for key fields that have been encountered and update any key values that need modification due
559-
// to conflicts.
560-
if (!checkFieldsForMergeConflicts(getIdErrors(fieldContext), fieldContext)) {
564+
// Store values for key fields that have been encountered and update any key values that need modification due
565+
// to conflicts.
566+
if (!checkFieldsForMergeConflicts(getIdErrors(fieldContext), fieldContext)) {
567+
skipRecord = true;
568+
continue;
569+
}
570+
}
571+
// If the current field is a foreign reference, check if the reference has been removed in the
572+
// merged result. If this is the case (or other conditions are met), we will need to skip this
573+
// record. Likewise, if the reference has been modified, ensure that the value written to the
574+
// merged result is correctly updated.
575+
if (!checkForeignReferences(fieldContext)) {
561576
skipRecord = true;
562-
break;
577+
continue;
563578
}
579+
rowValues[specFieldIndex] = fieldContext.getValueToWrite();
564580
}
565-
// If the current field is a foreign reference, check if the reference has been removed in the
566-
// merged result. If this is the case (or other conditions are met), we will need to skip this
567-
// record. Likewise, if the reference has been modified, ensure that the value written to the
568-
// merged result is correctly updated.
569-
if (!checkForeignReferences(fieldContext)) {
570-
skipRecord = true;
571-
break;
572-
}
573-
rowValues[specFieldIndex] = fieldContext.getValueToWrite();
574581
}
575-
return true;
582+
return result;
576583
}
577584

578-
public void finishRowAndWriteToZip() throws IOException {
585+
private void finishRowAndWriteToZip() throws IOException {
586+
boolean shouldWriteCurrentRow = true;
579587
// Do not write rows that are designated to be skipped.
580588
if (skipRecord && job.mergeType.equals(SERVICE_PERIOD)) {
581589
mergeFeedsResult.recordsSkipCount++;
582-
return;
590+
shouldWriteCurrentRow = false;
583591
}
584592
// Store row and stop values. If the return value is true, the record has been skipped and we
585593
// should skip writing the row to the merged table.
586594
if (storeRowAndStopValues()) {
587-
return;
595+
shouldWriteCurrentRow = false;
588596
}
597+
589598
// Finally, handle writing lines to zip entry.
590599
if (mergedLineNumber == 0) {
591600
writeHeaders();
592601
}
593-
// Write line to table.
594-
writeValuesToTable(rowValues, true);
602+
603+
if (shouldWriteCurrentRow) {
604+
// Write line to table.
605+
writeValuesToTable(rowValues, true);
606+
}
595607

596608
// Optional table-specific additional processing.
597609
afterRowWrite();
@@ -631,7 +643,7 @@ protected int getLineNumber() {
631643
return lineNumber;
632644
}
633645

634-
protected String[] getRowValues() { return rowValues; }
646+
protected String[] getOriginalRowValues() { return originalRowValues; }
635647

636648
/**
637649
* Retrieves the value for the specified CSV field.

src/test/java/com/conveyal/datatools/manager/jobs/MergeFeedsJobTest.java

Lines changed: 70 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,11 @@ public class MergeFeedsJobTest extends UnitTest {
6969
* and some added trips, and a trip from the base feed removed.
7070
*/
7171
private static FeedVersion fakeTransitSameSignatureTrips;
72+
/**
73+
* The base feed (transposed to the future dates), with some trip_ids from the base feed with the same signature,
74+
* and a trip from the base feed removed.
75+
*/
76+
private static FeedVersion fakeTransitSameSignatureTrips2;
7277
private static FeedSource bart;
7378
private static FeedVersion noAgencyVersion1;
7479
private static FeedVersion noAgencyVersion2;
@@ -137,6 +142,7 @@ public static void setUp() throws IOException {
137142
fakeTransitModService = createFeedVersion(fakeTransit, zipFolderFiles("merge-data-mod-services"));
138143
fakeTransitNewSignatureTrips = createFeedVersion(fakeTransit, zipFolderFiles("merge-data-mod-trips"));
139144
fakeTransitSameSignatureTrips = createFeedVersion(fakeTransit, zipFolderFiles("merge-data-added-trips"));
145+
fakeTransitSameSignatureTrips2 = createFeedVersion(fakeTransit, zipFolderFiles("merge-data-added-trips-2"));
140146

141147
// Feeds with no agency id
142148
FeedSource noAgencyIds = new FeedSource("no-agency-ids", project.id, MANUALLY_UPLOADED);
@@ -345,17 +351,24 @@ void mergeMTCShouldHandleExtendFutureStrategy() throws SQLException {
345351
// assert service_ids start_dates have been extended to the start_date of the base feed.
346352
String mergedNamespace = mergeFeedsJob.mergedVersion.namespace;
347353

354+
// There should be no unused service ids.
355+
assertThatSqlCountQueryYieldsExpectedCount(
356+
String.format("SELECT count(*) FROM %s.errors where error_type = 'SERVICE_UNUSED'", mergedNamespace),
357+
0
358+
);
359+
348360
// - calendar table
349-
// expect a total of 5 records in calendar table
361+
// expect a total of 1 record in calendar table that
362+
// corresponds to the trip ids present in both active and future feed.
350363
assertThatSqlCountQueryYieldsExpectedCount(
351364
String.format("SELECT count(*) FROM %s.calendar", mergedNamespace),
352-
5
365+
1
353366
);
354367
// expect that two records in calendar table have the correct start_date
355368
// (one for the original calendar entry, one for the extended service id for trips with same signature)
356369
assertThatSqlCountQueryYieldsExpectedCount(
357370
String.format("SELECT count(*) FROM %s.calendar where start_date = '20170918' and monday = 1", mergedNamespace),
358-
2
371+
1
359372
);
360373
}
361374

@@ -432,6 +445,60 @@ void mergeMTCShouldHandleMatchingTripIdsWithSameSignature() throws SQLException
432445
);
433446
}
434447

448+
/**
449+
* Ensures that an MTC merge of feeds with exact matches of service_ids and trip_ids,
450+
* trip ids having the same signature (same stop times) will utilize the
451+
* {@link MergeStrategy#CHECK_STOP_TIMES} strategy correctly and drop unused future service ids.
452+
*/
453+
@Test
454+
void mergeMTCShouldHandleMatchingTripIdsAndDropUnusedFutureCalendar() throws SQLException {
455+
Set<FeedVersion> versions = new HashSet<>();
456+
versions.add(fakeTransitBase);
457+
versions.add(fakeTransitSameSignatureTrips2);
458+
MergeFeedsJob mergeFeedsJob = new MergeFeedsJob(user, versions, "merged_output", MergeFeedsType.SERVICE_PERIOD);
459+
// Run the job in this thread (we're not concerned about concurrency here).
460+
mergeFeedsJob.run();
461+
// Check that correct strategy was used.
462+
assertEquals(
463+
MergeStrategy.CHECK_STOP_TIMES,
464+
mergeFeedsJob.mergeFeedsResult.mergeStrategy
465+
);
466+
// Result should succeed.
467+
assertFalse(
468+
mergeFeedsJob.mergeFeedsResult.failed,
469+
"Merge feeds job should succeed with CHECK_STOP_TIMES strategy."
470+
);
471+
472+
String mergedNamespace = mergeFeedsJob.mergedVersion.namespace;
473+
474+
// - calendar table
475+
// expect a total of 4 records in calendar table:
476+
// - common_id from the active feed (but start date is changed to one day before first start_date in future feed),
477+
// - common_id from the future feed (because of one future trip not in the active feed),
478+
// - common_id cloned and extended for the matching trip id present in both active and future feeds
479+
// (from MergeFeedsJob#serviceIdsToCloneAndRename),
480+
// - only_calendar_id used in the future feed.
481+
assertThatSqlCountQueryYieldsExpectedCount(
482+
String.format("SELECT count(*) FROM %s.calendar", mergedNamespace),
483+
3
484+
);
485+
486+
// Out of all trips from the input datasets, expect 4 trips in merged output.
487+
// 1 trip from active feed that is not in the future feed,
488+
// 1 trip in both the active and future feeds, with the same signature (same stop times),
489+
// 2 trips from the future feed not in the active feed.
490+
assertThatSqlCountQueryYieldsExpectedCount(
491+
String.format("SELECT count(*) FROM %s.trips", mergedNamespace),
492+
3
493+
);
494+
495+
// There should be no unused service ids.
496+
assertThatSqlCountQueryYieldsExpectedCount(
497+
String.format("SELECT count(*) FROM %s.errors where error_type = 'SERVICE_UNUSED'", mergedNamespace),
498+
0
499+
);
500+
}
501+
435502
/**
436503
* Ensures that an MTC merge of feeds with trip_ids matching in the active and future feed,
437504
* but with different signatures (e.g. different stop times) fails.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
agency_id,agency_name,agency_url,agency_lang,agency_phone,agency_email,agency_timezone,agency_fare_url,agency_branding_url
2+
1,Fake Transit,,,,,America/Los_Angeles,,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
service_id,monday,tuesday,wednesday,thursday,friday,saturday,sunday,start_date,end_date
2+
common_id,1,1,1,1,1,1,1,20170923,20170925
3+
only_calendar_id,1,1,1,1,1,1,1,20170920,20170927
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
feed_id,feed_publisher_name,feed_publisher_url,feed_lang,feed_version
2+
fake_transit,Conveyal,http://www.conveyal.com,en,1.0
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
agency_id,route_id,route_short_name,route_long_name,route_desc,route_type,route_url,route_color,route_text_color,route_branding_url
2+
1,1,1,Route 1,,3,,7CE6E7,FFFFFF,
3+
1,2,2,Route 2,,3,,7CE6E7,FFFFFF,
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
stop_id,accessibility_id,cardinal_direction,relative_position,stop_city
2+
4u6g,0,SE,FS,Scotts Valley
3+
johv,0,SE,FS,Scotts Valley
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
trip_id,arrival_time,departure_time,stop_id,stop_sequence,stop_headsign,pickup_type,drop_off_type,shape_dist_traveled,timepoint
2+
trip3,07:00:00,07:00:00,4u6g,1,,0,0,0.0000000,
3+
trip3,07:01:00,07:01:00,johv,2,,0,0,341.4491961,
4+
only-calendar-trip1,07:00:00,07:00:00,4u6g,1,,0,0,0.0000000,
5+
only-calendar-trip1,07:01:00,07:01:00,johv,2,,0,0,341.4491961,
6+
only-calendar-trip2,07:00:00,07:00:00,johv,1,,0,0,0.0000000,
7+
only-calendar-trip2,07:01:00,07:01:00,4u6g,2,,0,0,341.4491961,
8+
only-calendar-trip999,07:00:00,07:00:00,johv,1,,0,0,0.0000000,
9+
only-calendar-trip999,07:01:00,07:01:00,4u6g,2,,0,0,341.4491961,

0 commit comments

Comments
 (0)