-
Notifications
You must be signed in to change notification settings - Fork 13
/
_mart_gtfs_fcts.yml
2513 lines (2447 loc) · 106 KB
/
_mart_gtfs_fcts.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
version: 2
x-common-fields:
# Roughly alphabetical + category (feed_*, rt_* original defs, schedule to RT)
# datetime column wrangling (min/max/local tz, pacific),
# summary stats (num_distinct_*)
- &base64_url
name: base64_url
description: '{{ doc("column_base64_url") }}'
- &feed_key
name: feed_key
description: '{{ doc("gtfs_schedule_feed_key") }}'
tests:
- not_null
- relationships:
to: ref('dim_schedule_feeds')
field: key
- &feed_timezone_no_tests
name: feed_timezone
description: '{{ doc("gtfs_schedule_feed_timezone") }}'
- &feed_type
name: feed_type
description: '{{ doc("gtfs_rt_feed_type") }}'
- >fs_dataset_key_errorif
name: gtfs_dataset_key
description: >fs_dataset_key_desc |
'{{ doc("gtfs_schedule_gtfs_dataset_key") }}'
tests:
- not_null:
config:
error_if: ">10000"
- relationships:
to: ref('dim_gtfs_datasets')
field: key
config:
# there are a dozen rows of SMART transit which was deleted from
# Airtable this will work without exception once the Airtable dim is
# historical; this threshold may need to increase if the backfill
# occurs prior to that
error_if: ">20"
- >fs_rt_dataset_key
name: gtfs_dataset_key
description: |
Foreign key to the associated GTFS dataset record.
tests:
- not_null:
config:
where: '__rt_sampled__'
- relationships:
to: ref('dim_gtfs_datasets')
field: key
config:
# there are a dozen rows of SMART transit which was deleted from
# Airtable this will work without exception once the Airtable dim is
# historical; this threshold may need to increase if the backfill
# occurs prior to that
error_if: ">20"
where: '__rt_sampled__'
- >fs_rt_name
name: gtfs_dataset_name
description: |
Name from the associated GTFS dataset record.
- >fs_dataset_key
name: gtfs_dataset_key
description: *gtfs_dataset_key_desc
tests:
- dbt_utils.not_null_proportion:
# TODO: raise back to .999 after some time getting new data -
# was .998 because old data pipeline produced more nulls
at_least: 0.998
- relationships:
to: ref('dim_gtfs_datasets')
field: key
config:
# there are a dozen rows of SMART transit which was deleted from
# Airtable this will work without exception once the Airtable dim is
# historical; this threshold may need to increase if the backfill
# occurs prior to that
error_if: ">20"
- >fs_rt_schedule_dataset_key
name: schedule_gtfs_dataset_key
description: '{{ doc("column_rt_schedule_dataset_key") }}'
tests:
- relationships:
to: ref('dim_gtfs_datasets')
field: key
config:
where: '__rt_sampled__'
- >fs_rt_schedule_dataset_name
name: schedule_name
description: '{{ doc("column_rt_schedule_dataset_name") }}'
- >fs_rt_schedule_feed_key
name: schedule_feed_key
description: '{{ doc("column_rt_schedule_feed_key") }}'
tests:
- relationships:
to: ref('dim_schedule_feeds')
field: key
config:
where: '__rt_sampled__'
- >fs_rt_schedule_feed_timezone
name: schedule_feed_timezone
description: '{{ doc("column_rt_schedule_feed_timezone") }}'
- &rt_schedule_base64_url
name: schedule_base64_url
description: '{{ doc("column_rt_schedule_base64_url") }}'
- &trip_instance_key
name: trip_instance_key
description: |
A trip level identifier that uniquely identifies an individual trip
across feed types (schedule, trip updates, vehicle positions, service
alerts).
Composite of:
- base64 URL of associated schedule feed
- service date
- trip_id
- iteration number, which attempts to normalize trip start times
tests:
- not_null
- unique_proportion:
at_least: 0.9999
- &rt_trip_summary_key
name: key
description: |
Composite of calculated service date, URL, trip ID, and trip start time.
tests:
- not_null
- unique_proportion:
at_least: 0.9999
- &rt_iteration_num
name: iteration_num
description: |
0-based ordered count of `trip_start_time` values for this `trip_id`
on this `service_date`. Ex.: If a given `trip_id` appears with
`trip_start_time` values of "12:05:10" and "14:40:00", the first will
have `iteration_num` of 0 and the second will have `iteration_num`
of 1.
This helps normalize the following cases:
- For frequency-based trips: `trip_start_time` in RT data may not
align exactly with the start time of any individual iteration in the
schedule data (especially if `frequencies.exact_times` = 1 in
schedule), so this field normalizes
- Across RT feeds: We have seen cases where, for example, the trip
updates feed provides `trip_start_time` (even for non-frequency-based
trips) but the vehicle positions feed does not (the field is null).
We can calculate a `iteration_num` of 0 for both feeds to account for
this difference, because it is likely a byproduct of technical
implementation rather than a true indication that the two represent
different trips.
- >fs_rt_dt
name: dt
description: |
Date (UTC) on which this data was scraped. A value for this field
must be provided when querying this model due to data size.
- >fs_rt_hour
name: hour
description: |
The starting timestamp for the hour (UTC) in which this data was scraped.
Example value: '2023-06-14T15:00:00+00:00'. Adding a filter on this can
further improve performance.
- >fs_rt_extract_ts
name: _extract_ts
description: '{{ doc("column_extract_ts") }}'
- >fs_rt_config_extract_ts
name: _config_extract_ts
description: '{{ doc("column_config_extract_ts") }}'
- &rt_service_date
name: service_date
description: |
Attempt to identify the `service_date` (corresponding to the related
schedule feed) for trip activity referenced in a GTFS RT feed. It uses
the following fallback logic:
* If `trip_start_date` is populated, use that. This is assumed to be
provided with respect to `schedule_feed_timezone`.
* Otherwise, for trip updates and vehicle positions, if
`trip_update_timestamp` or `vehicle_timestamp` (respectively) are
populated, convert that to the `schedule_feed_timezone` and extract the
date from that.
* Otherwise, use `header_timestamp` converted to `schedule_feed_timezone`
and extract the date.
* Finally (and this generally should not happen, since `header_timestamp`
should be populated), fall back to `_extract_ts` converted to
`schedule_feed_timezone` and extract the date.
- &rt_trip_start_time_interval
name: trip_start_time_interval
description: &rt_trip_start_time_interval_desc |
`trip_start_time` converted to a BigQuery INTERVAL type to allow handling
for times after midnight.
See https://gtfs.org/schedule/reference/#field-types for how time strings
are defined in GTFS.
*Note: If the interval is longer than 24 hours and `trip_start_date` is
not populated, the interpretation for this field becomes unclear.*
- &rt_header_timestamp
name: header_timestamp
description: '{{ doc("gtfs_feed_header__timestamp") }}'
- &rt_header_incrementality
name: header_incrementality
description: '{{ doc("gtfs_feed_header__incrementality") }}'
- &rt_header_version
name: header_version
description: '{{ doc("gtfs_feed_header__version") }}'
- &rt_id
name: id
description: '{{ doc("gtfs_feed_entity__id") }}'
- &rt_vehicle_id
name: vehicle_id
description: '{{ doc("gtfs_vehicle_descriptor__id") }}'
- &rt_vehicle_label
name: vehicle_label
description: '{{ doc("gtfs_vehicle_descriptor__label") }}'
- &rt_vehicle_license_plate
name: vehicle_license_plate
description: '{{ doc("gtfs_vehicle_descriptor__license_plate") }}'
- &rt_vehicle_wheelchair_accessible
name: vehicle_wheelchair_accessible
description: '{{ doc("gtfs_vehicle_descriptor__wheelchair_accessible") }}'
- &rt_trip_id
name: trip_id
description: '{{ doc("gtfs_trip_descriptor__trip_id") }}'
- &rt_trip_route_id
name: trip_route_id
description: '{{ doc("gtfs_trip_descriptor__route_id") }}'
- &rt_trip_direction_id
name: trip_direction_id
description: '{{ doc("gtfs_trip_descriptor__direction_id") }}'
- &rt_trip_start_time
name: trip_start_time
description: &rt_trip_start_time_desc |
'{{ doc("gtfs_trip_descriptor__start_time") }}'
- &rt_trip_start_date
name: trip_start_date
description: &rt_trip_start_date_desc |
'{{ doc("gtfs_trip_descriptor__start_date") }}'
- &rt_trip_schedule_relationship
name: trip_schedule_relationship
description: '{{ doc("gtfs_trip_descriptor__schedule_relationship") }}'
- &rt_agency_id
name: agency_id
description: '{{ doc("gtfs_entity_selector__agency_id") }}'
- &rt_route_id
name: route_id
description: '{{ doc("gtfs_entity_selector__route_id") }}'
- &rt_route_type
name: route_type
description: '{{ doc("gtfs_entity_selector__route_type") }}'
- &rt_direction_id
name: direction_id
description: '{{ doc("gtfs_entity_selector__direction_id") }}'
- &rt_stop_id
name: stop_id
description: '{{ doc("gtfs_entity_selector__stop_id") }}'
- &rt_trip_update_timestamp
name: trip_update_timestamp
description: '{{ doc("gtfs_trip_update__timestamp") }}'
- &rt_trip_update_delay
name: trip_update_delay
description: '{{ doc("gtfs_trip_update__delay") }}'
- &rt_active_period_start
name: active_period_start
description: '{{ doc("gtfs_time_range__start") }}'
- &rt_active_period_end
name: active_period_end
description: '{{ doc("gtfs_time_range__end") }}'
- &rt_vp_current_stop_sequence
name: current_stop_sequence
description: '{{ doc("gtfs_vehicle_position__current_stop_sequence") }}'
- &rt_vp_stop_id # distinguish from stop_time_updates__stop_id
name: stop_id
description: '{{ doc("gtfs_vehicle_position__stop_id") }}'
- &rt_current_status
name: current_status
description: '{{ doc("gtfs_vehicle_position__current_status") }}'
- &rt_vehicle_timestamp
name: vehicle_timestamp
description: '{{ doc("gtfs_vehicle_position__timestamp") }}'
- &rt_congestion_level
name: congestion_level
description: '{{ doc("gtfs_vehicle_position__congestion_level") }}'
- &rt_occupancy_status
name: occupancy_status
description: '{{ doc("gtfs_vehicle_position__occupancy_status") }}'
- &rt_occupancy_percentage
name: occupancy_percentage
description: |
'{{ doc("gtfs_vehicle_position__occupancy_percentage") }}'
- &rt_stu_stop_sequence
name: stop_sequence
description: '{{ doc("gtfs_stop_time_update__stop_sequence") }}'
- &rt_stu_stop_id
name: stop_id
description: '{{ doc("gtfs_stop_time_update__stop_id") }}'
- &rt_stu_schedule_relationship
name: schedule_relationship
description: '{{ doc("gtfs_stop_time_update__schedule_relationship") }}'
- &rt_active_period_start_ts
name: active_period_start_ts
description: |
`active_period_start` converted to a TIMESTAMP data type.
If `active_period_start` is null, will be midnight on January 1, 1900.
The spec (https://gtfs.org/realtime/reference/#message-alert) outlines
two cases:
1. If one of active period start/end is populated but the other is
not, the other should be treated as +/- infinity
2. If neither is populated, assume that the message is active while
it is present in the feed
Imputing these timestamp versions of the columns to be far future/past
covers both cases for our use cases.TIMESTAMP
- &rt_active_period_end_ts
name: active_period_end_ts
description: |
`active_period_end` converted to a data type.
If `active_period_end` is null, will be midnight on January 1, 2099.
The spec (https://gtfs.org/realtime/reference/#message-alert) outlines
two cases:
1. If one of active period start/end is populated but the other is
not, the other should be treated as +/- infinity
2. If neither is populated, assume that the message is active while
it is present in the feed
Imputing these timestamp versions of the columns to be far future/past
covers both cases for our use cases.
- &min_extract_ts
name: min_extract_ts
description: |
Timestamp of the first extract (i.e. the time at which we downloaded
this feed extract) of the entity being summarized.
For RT, our extracts are pinned to 20 second intervals,
i.e. :00, :20, :40.
- &max_extract_ts
name: max_extract_ts
description: |
Timestamp of the last extract (i.e. the time at which we downloaded
this feed extract) of the entity being summarized.
For RT, our extracts are pinned to 20 second intervals,
i.e. :00, :20, :40.
- &extract_duration_minutes
name: extract_duration_minutes
description: |
The difference between `max_extract_ts` and `min_extract_ts` in
minutes, representing the duration of time for which this entity was
present in message responses we received from the agency.
Note that the entity may not have been present in every message
between the minimum and maximum timestamps, and therefore may not have
been present continuously for the duration listed here.
- &min_extract_ts_local_tz
name: min_extract_datetime_local_tz
description: |
`min_extract_ts` converted to a DATETIME type in
`schedule_feed_timezone`.
- &max_extract_ts_local_tz
name: max_extract_datetime_local_tz
description: |
`max_extract_ts` converted to a DATETIME type in
`schedule_feed_timezone`.
- &min_extract_ts_pacific
name: min_extract_datetime_pacific
description: |
`min_extract_ts` converted to a DATETIME type in the
"America/Los_Angeles" time zone.
- &max_extract_ts_pacific
name: max_extract_datetime_pacific
description: |
`max_extract_ts` converted to a DATETIME type in the
"America/Los_Angeles" time zone.
- &_header_message_age
name: _header_message_age
description: '{{ doc("column_header_message_age") }}'
- &min_header_timestamp
name: min_header_timestamp
description: |
Earliest header timestamp of a message referencing this entity.
- &max_header_timestamp
name: max_header_timestamp
description: |
Latest header timestamp of a message referencing this entity.
- &header_duration_minutes
name: header_duration_minutes
description: |
The difference between `max_header_timestamp` and
`min_header_timestamp` in minutes, representing the duration of time
for which this entity was present in the messages produced by the
agency.
Note that the entity may not have been present in every message
between the minimum and maximum timestamps, and therefore may not have
been present continuously for the duration listed here.
- &min_header_local_tz
name: min_header_datetime_local_tz
description: |
`min_header_timestamp` converted to a DATETIME type in
`schedule_feed_timezone`.
- &max_header_local_tz
name: max_header_datetime_local_tz
description: |
`max_header_timestamp` converted to a DATETIME type in
`schedule_feed_timezone`.
- &min_header_pacific
name: min_header_datetime_pacific
description: |
`min_header_timestamp` converted to a DATETIME type in the
"America/Los_Angeles" time zone.
- &max_header_pacific
name: max_header_datetime_pacific
description: |
`max_header_timestamp` converted to a DATETIME type in the
"America/Los_Angeles" time zone.
- &_trip_update_message_age
name: _trip_update_message_age
description: '{{ doc("column_trip_update_message_age") }}'
- &_trip_update_message_age_vs_header
name: _trip_update_message_age_vs_header
description: '{{ doc("column_trip_update_message_age_vs_header") }}'
- &num_distinct_extract_ts
name: num_distinct_extract_ts
description: |
Distinct count of `_extract_ts` values for messages being summarized
here (`_extract_ts` represents the instant at which we attempted to
scrape data for this feed.)
This is the count of distinct messages in which this entity appeared.
If you divide this value by `extract_duration_minutes`, you would
get a count of how many messages per minute this entity was present for,
which can help give a sense for how continuously this entity was updated.
This number should be equal to `num_distinct_message_keys`.
- &_vehicle_message_age
name: _vehicle_message_age
description: '{{ doc("column_vehicle_message_age") }}'
- &_vehicle_message_age_vs_header
name: _vehicle_message_age_vs_header
description: |
'{{ doc("column_vehicle_message_age_vs_header") }}'
- &num_distinct_header_timestamps
name: num_distinct_header_timestamps
description: |
Distinct count of header timestamps for messages being summarized
(https://gtfs.org/realtime/reference/#message-feedheader).
- &num_distinct_message_keys
name: num_distinct_message_keys
description: |
Distinct count of message keys being summarized here (message keys are
the synthetic keys we generate for an individual RT message response.)
This is the count of distinct messages in which this entity appeared.
If you divide this value by `extract_duration_minutes`, you would get
a count of how many messages per minute this entity was present for,
which can help give a sense for how continuously this trip was updated.
This number should be equal to `num_distinct_extract_ts`.
- &service_date_for_summaries
name: service_date
description: |
Agency-defined service date on which this service level was present.
This respects the agency's definition of a service day, which may not
correspond to a calendar date or even a 24-hour period.
See https://gtfs.org/schedule/reference/#field-types for the GTFS
Schedule definition of a "date" data type or `fct_scheduled_trips` for
more dates that can be associated with trip activity.
This date is used so that this table's service summaries correspond to the
service levels that an agency might describe on a given (service) date,
even if that does not correspond to the actual calendar date on which the
given trip activity occurred.
models:
- name: fct_daily_schedule_feeds
description: |
Each row is a unique date / feed pair, where the feed is the feed version
that was in effect at noon Pacific Time on that date. This is intended to
make it clear what feed version to use to analyze a given date.
columns:
- name: key
description: |
Synthetic primary key constructed from `date` and `feed_key`.
tests: &primary_key_tests
- unique
- not_null
- name: date
description: |
Date on which the given feed was active at noon Pacific Time.
- *feed_key
- *gtfs_dataset_key_errorif
- *base64_url
- *feed_timezone_no_tests
- name: fct_schedule_feed_downloads
description: |
Each row is an instance of a download attempt, uniquely identified by
base64_url and timestamp.
columns:
- name: key
description: |
Synthetic primary key constructed from `base64_url` and `ts`.
tests: *primary_key_tests
- <<: *feed_key
tests:
- not_null:
config:
where: download_success AND unzip_success
- relationships:
to: ref('dim_schedule_feeds')
field: key
- *gtfs_dataset_key_errorif
- name: ts
description: |
Timestamp at which this feed download attempt occurred.
- *base64_url
- name: download_success
description: '{{ doc("column_download_success") }}'
- name: download_exception
description: '{{ doc("column_download_exception") }}'
- name: last_modified_timestamp
description: Feed last modified string converted to TIMESTAMP data type.
- name: unzip_success
description: '{{ doc("column_unzip_success") }}'
- name: unzip_exception
description: '{{ doc("column_unzip_exception") }}'
- name: zipfile_extract_md5hash
description: '{{ doc("column_zipfile_md5_hash") }}'
- name: zipfile_files
description: '{{ doc("column_zipfile_files") }}'
- name: zipfile_dirs
description: '{{ doc("column_zipfile_dirs") }}'
- name: pct_files_successfully_parsed
description: '{{ doc("column_pct_sucesss") }}'
- name: fct_trip_updates_messages
description: |
Each row is a message received from a trip updates GTFS RT feed.
See https://gtfs.org/realtime/reference/#message-tripupdate for
information about message structure.
Due to data size, this table **must** be queried with a date filter
(like `WHERE dt = 'YYYY-MM-DD'`).
Hour filters will also further improve performance.
columns:
- name: key
description: |
Synthetic primary key constructed from `base64_url`, `extract_ts`,
entity `id`, `vehicle_id`, and `trip_id`.
tests: &almost_unique_rt_key_tests
- unique_proportion:
at_least: 0.999
where: '__rt_sampled__'
- not_null:
where: '__rt_sampled__'
- *gtfs_rt_dataset_key
- *gtfs_rt_dt
- *gtfs_rt_hour
- *base64_url
- *gtfs_rt_extract_ts
- *gtfs_rt_config_extract_ts
- *gtfs_rt_name
- *gtfs_rt_schedule_dataset_key
- *rt_service_date
- *rt_schedule_base64_url
- *gtfs_rt_schedule_dataset_name
- *gtfs_rt_schedule_feed_key
- *gtfs_rt_schedule_feed_timezone
- *rt_trip_start_time_interval
- *rt_header_timestamp
- *rt_header_incrementality
- *rt_header_version
- *rt_id
- *rt_trip_update_timestamp
- *rt_trip_update_delay
- *rt_vehicle_id
- *rt_vehicle_label
- *rt_vehicle_license_plate
- *rt_vehicle_wheelchair_accessible
- *rt_trip_id
- *rt_trip_route_id
- *rt_trip_direction_id
- *rt_trip_start_time
- *rt_trip_start_date
- *rt_trip_schedule_relationship
- name: stop_time_updates
description: |
See https://gtfs.org/realtime/reference/#message-stoptimeupdate.
- *_header_message_age
- *_trip_update_message_age
- *_trip_update_message_age_vs_header
- name: fct_stop_time_updates
description: |
Unnested and de-duped stop time updates.
columns:
- name: key
tests: *almost_unique_rt_key_tests
- *gtfs_rt_dataset_key
- *gtfs_rt_dt
- *gtfs_rt_hour
- *base64_url
- *gtfs_rt_extract_ts
- *gtfs_rt_config_extract_ts
- *gtfs_rt_name
- *gtfs_rt_schedule_dataset_key
- *rt_schedule_base64_url
- *gtfs_rt_schedule_dataset_name
- *gtfs_rt_schedule_feed_key
- *gtfs_rt_schedule_feed_timezone
- *rt_service_date
- *_header_message_age
- *_trip_update_message_age
- *_trip_update_message_age_vs_header
- *rt_header_timestamp
- *rt_header_version
- *rt_header_incrementality
- *rt_id
- *rt_trip_update_timestamp
- *rt_trip_update_delay
- *rt_vehicle_id
- *rt_vehicle_label
- *rt_vehicle_license_plate
- *rt_vehicle_wheelchair_accessible
- *rt_trip_id
- *rt_trip_route_id
- *rt_trip_direction_id
- *rt_trip_start_time
- *rt_trip_start_time_interval
- *rt_trip_start_date
- *rt_trip_schedule_relationship
- name: _trip_updates_message_key
description: |
Synthetic primary key constructed from `base64_url`, `extract_ts`,
entity `id`, `vehicle_id`, and `trip_id`.
- *rt_stu_stop_sequence
- *rt_stu_stop_id
- name: arrival_delay
description: '{{ doc("gtfs_stop_time_event__delay") }}'
- name: arrival_time
description: '{{ doc("gtfs_stop_time_event__time") }}'
- name: arrival_uncertainty
description: '{{ doc("gtfs_stop_time_event__uncertainty") }}'
- name: departure_delay
description: '{{ doc("gtfs_stop_time_event__delay") }}'
- name: departure_time
description: '{{ doc("gtfs_stop_time_event__time") }}'
- name: departure_uncertainty
description: '{{ doc("gtfs_stop_time_event__uncertainty") }}'
- *rt_stu_schedule_relationship
- name: fct_vehicle_locations
description: |
De-duped vehicle positions, removing redundant/duplicated positions
in the underlying messages. Unique at the url/timestamp/vehicle/trip
level.
columns:
- name: key
description: |
Synthetic primary key constructed from `service_date`, `base64_url`,
`location_timestamp`, `vehicle_id`, `vehicle_label`,
`trip_id`, and `trip_start_time`.
tests: *almost_unique_rt_key_tests
- name: gtfs_dataset_key
description: *gtfs_dataset_key_desc
- *gtfs_rt_dt
- *rt_service_date
- *gtfs_rt_hour
- *base64_url
- *gtfs_rt_extract_ts
- *gtfs_rt_config_extract_ts
- *gtfs_rt_name
- *gtfs_rt_schedule_dataset_key
- *rt_schedule_base64_url
- *gtfs_rt_schedule_dataset_name
- *gtfs_rt_schedule_feed_key
- *gtfs_rt_schedule_feed_timezone
- *_header_message_age
- *_vehicle_message_age
- *rt_header_timestamp
- *rt_header_version
- *rt_header_incrementality
- *rt_id
- *rt_vp_current_stop_sequence
- *rt_vp_stop_id
- *rt_current_status
- *rt_vehicle_timestamp
- *rt_congestion_level
- *rt_occupancy_status
- *rt_occupancy_percentage
- *rt_vehicle_id
- *rt_vehicle_label
- *rt_vehicle_license_plate
- *rt_vehicle_wheelchair_accessible
- <<: *rt_trip_id
tests:
- not_null:
config:
where: '__rt_sampled__'
- *rt_trip_route_id
- *rt_trip_direction_id
- *rt_trip_start_time
- *rt_trip_start_time_interval
- *rt_trip_start_date
- *rt_trip_schedule_relationship
- &rt_position_latitude
name: position_latitude
description: '{{ doc("gtfs_position__latitude") }}'
- &rt_position_longitude
name: position_longitude
description: '{{ doc("gtfs_position__longitude") }}'
- &rt_position_bearing
name: position_bearing
description: '{{ doc("gtfs_position__bearing") }}'
- &rt_position_odometer
name: position_odometer
description: '{{ doc("gtfs_position__odometer") }}'
- &rt_position_speed
name: position_speed
description: '{{ doc("gtfs_position__speed") }}'
- name: location_timestamp
description: Vehicle timestamp or header timestamp
- name: vehicle_trip_key
description: |
Composite of service_date, URL, vehicle_id, vehicle_label,
trip_id, and trip_start_time.
- name: next_location_key
description: Location key for the next vehicle timestamp.
- name: location
description: GEOGPOINT created by the position latitute and longitude
- *trip_instance_key
- name: fct_vehicle_positions_messages
description: |
Each row is a message received from a vehicle positions GTFS RT feed.
See https://gtfs.org/realtime/reference/#message-vehicleposition for information
about message structure.
Due to data size, this table **must** be queried with a date filter
(like `WHERE dt = 'YYYY-MM-DD'`).
Hour filters will also further improve performance.
columns:
- name: key
description: |
Synthetic primary key constructed from `base64_url`, `extract_ts`,
entity `id`, `vehicle_id`, and `trip_id`.
tests: *almost_unique_rt_key_tests
- *gtfs_rt_dataset_key
- *gtfs_rt_dt
- *gtfs_rt_hour
- *base64_url
- *rt_schedule_base64_url
- *rt_service_date
- *gtfs_rt_extract_ts
- *gtfs_rt_config_extract_ts
- <<: *gtfs_rt_name
description: |
String name of the GTFS dataset of which this message is a part.
This field is provided for human readability and should not be
used as a join key.
- *gtfs_rt_schedule_dataset_key
- *gtfs_rt_schedule_dataset_name
- *gtfs_rt_schedule_feed_key
- *gtfs_rt_schedule_feed_timezone
- *_header_message_age
- *_vehicle_message_age
- *_vehicle_message_age_vs_header
- *rt_header_timestamp
- *rt_header_incrementality
- *rt_header_version
- *rt_id
- *rt_vp_current_stop_sequence
- *rt_vp_stop_id
- *rt_current_status
- *rt_vehicle_timestamp
- *rt_congestion_level
- *rt_occupancy_status
- *rt_occupancy_percentage
- *rt_vehicle_id
- *rt_vehicle_label
- *rt_vehicle_license_plate
- *rt_vehicle_wheelchair_accessible
- *rt_trip_id
- *rt_trip_route_id
- *rt_trip_direction_id
- *rt_trip_start_time
- *rt_trip_start_time_interval
- *rt_trip_start_date
- *rt_trip_schedule_relationship
- *rt_position_latitude
- *rt_position_longitude
- *rt_position_bearing
- *rt_position_odometer
- *rt_position_speed
- name: fct_daily_rt_feed_files
description: |
Each row is a date / URL pair with a summary of data aggregation outcomes.
columns:
- name: key
tests: *primary_key_tests
- name: date
description: Date that data was downloaded.
- <<: *base64_url
tests:
- not_null
- *feed_type
- name: parse_success_file_count
description: |
Count of files successfully parsed. Target is 4,320 (one file every
20 seconds.)
tests:
- not_null
- name: parse_failure_file_count
description: Count of files where parsing failed, but a file was present.
tests:
- not_null
- *gtfs_dataset_key
- name: schedule_to_use_for_rt_validation_gtfs_dataset_key
description: '{{ doc("column_rt_schedule_dataset_key") }}'
tests:
- relationships:
to: ref('dim_gtfs_datasets')
field: key
- name: schedule_feed_key
description: '{{ doc("column_rt_schedule_feed_key") }}'
tests:
- relationships:
to: ref('dim_schedule_feeds')
field: key
- name: fct_daily_feed_scheduled_service_summary
description: |
Daily service totals by feed. Each row is a date / feed pair where that
feed was online on that date (corresponds to `fct_daily_schedule_feeds`).
This means there are feeds present with 0 `ttl_service_hours`.
Service summary is based on `fct_scheduled_trips`, grouped to the feed
level (a feed with no trips in `fct_scheduled_trips` will have 0s here for
the summary columns.)
tests:
- dbt_utils.unique_combination_of_columns:
combination_of_columns:
- service_date
- feed_key
- dbt_utils.expression_is_true:
expression: "ttl_service_hours >= 0"
columns:
- *service_date_for_summaries
- *feed_key
- name: ttl_service_hours
description: |
Total service hours across entire feed on date; can be 0 if no service
- name: gtfs_dataset_key
description: *gtfs_dataset_key_desc
- name: n_trips
description: Total number of trips ran on date; can be 0 if no scheduled trips
- name: first_departure_sec
description: Time of first departure on date; null when no trips scheduled
- name: last_arrival_sec
description: Time of last departure on date; null when no trips scheduled
- name: num_stop_times
description: |
Total number of stop events observed; can be 0 if not scheduled trips
- name: n_routes
description: |
Total number of routes that had trips scheduled on this day; can be 0
if no scheduled trips
- name: contains_warning_duplicate_stop_times_primary_key
description: |
Rows with `true` in this column indicate that the columns in this table
that are aggregated from stop times data (`n_stops`, `num_stop_times`,
`trip_first_departure_sec`, `trip_last_arrival_sec`, and
`service_hours`) contain at least one row that had a duplicate primary
key in the source stop times data.
I.e., at least one row being aggregated had a `trip_id` / `stop_sequence`
pair that was not unique in the input data. This indicates that data
quality issues were present in the stop times data that is being
summarized here, and counts may be inflated due to multiple rows with
identical identifiers.
- name: contains_warning_duplicate_trip_primary_key
description: |
Rows with `true` in this column indicate that `dim_trips` contains
duplicates of this trip primary key.
i.e., `trip_id` is duplicated within an individual feed instance.
- name: contains_warning_missing_foreign_key_stop_id
description: |
Rows with `true` in this column indicate that the columns in this table
that are aggregated from stop times data (`n_stops`, `num_stop_times`,
`trip_first_departure_sec`, `trip_last_arrival_sec`, and
`service_hours`) contain at least one row that had a missing `stop_id`
foreign key in the source stops data.
I.e., at least one row being aggregated had a `stop_id` foreign key
that was missing in the input data. This indicates that data quality
issues were present in the stop times data that is being summarized here.
- name: fct_hourly_rt_feed_files
description: |
An hourly count of the files that were downloaded each day per feed.
columns:
- name: key
description: |
Synthetic primary key constructed from `dt` and `base64_url`.
tests: *primary_key_tests
- name: dt
description: |
Date on which the download attempt was made.
- *base64_url
- *feed_type
- name: file_count_day
description: |
The total number of files downloaded on on this day for this URL
- name: hr_0
description: |
Count of files downloaded for this URL in the 0 hour UTC
- name: hr_1
- name: hr_2
- name: hr_3
- name: hr_4
- name: hr_5
- name: hr_6
- name: hr_7
- name: hr_8
- name: hr_9
- name: hr_10
- name: hr_11
- name: hr_12
- name: hr_13
- name: hr_14
- name: hr_15
- name: hr_16
- name: hr_17
- name: hr_18
- name: hr_19
- name: hr_20
- name: hr_21
- name: hr_22
- name: hr_23
- name: gtfs_dataset_key
description: |
The primary key for the record in `dim_gtfs_datasets` associated with
this feed.
- name: fct_hourly_rt_feed_files_success
description: |
The proportion of downloaded files that were successfully parsed as GTFS-RT.
columns:
- name: key
description: |
Synthetic primary key constructed from `dt` and `base64_url`.
tests: *primary_key_tests
- name: dt
description: |
Date on which the download attempt was made.
- *base64_url
- *feed_type
- name: prop_success_file_count_day
description: |
The proportion of files successfully downloaded on this day for this URL
- name: hr_0
description: |
The proportion of files successfully downloaded for this URL in the 0
hour UTC
- name: hr_1
- name: hr_2
- name: hr_3
- name: hr_4
- name: hr_5
- name: hr_6
- name: hr_7
- name: hr_8
- name: hr_9
- name: hr_10
- name: hr_11
- name: hr_12
- name: hr_13
- name: hr_14
- name: hr_15
- name: hr_16
- name: hr_17
- name: hr_18
- name: hr_19
- name: hr_20
- name: hr_21
- name: hr_22
- name: hr_23
- name: gtfs_dataset_key
description: |
The primary key for the record in `dim_gtfs_datasets` associated with
this feed.
- name: fct_service_alerts_messages
description: |
Each row is a message received from a service alerts GTFS RT feed.
See https://gtfs.org/realtime/reference/#message-alert for information
about message structure.
Due to data size, this table **must** be queried with a date filter
(like `WHERE dt = 'YYYY-MM-DD'`).
Hour filters will also further improve performance.
columns:
- name: key
description: |
Synthetic primary key constructed from `base64_url`, `extract_ts`,
entity `id`.
tests: &rt_primary_key_tests
- unique:
where: '__rt_sampled__'
- not_null:
where: '__rt_sampled__'
- *gtfs_rt_dataset_key
- *gtfs_rt_dt
- *gtfs_rt_hour
- *rt_schedule_base64_url
- *gtfs_rt_extract_ts