Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
119 changes: 46 additions & 73 deletions activitysim/abm/models/summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ def construct_bin_labels(bins: pd.Series, label_format: str) -> pd.Series:
)

def construct_label(label_format, bounds_dict):
# parts = [part for part in ['left', 'right'] if part in label_format]
bounds_dict = {
x: bound for x, bound in bounds_dict.items() if x in label_format
}
Expand Down Expand Up @@ -203,14 +204,9 @@ def manual_breaks(
@inject.step()
def summarize(
network_los: pipeline.Pipeline,
persons: pd.DataFrame,
persons_merged: pd.DataFrame,
households: pd.DataFrame,
households_merged: pd.DataFrame,
trips: pd.DataFrame,
tours: pd.DataFrame,
tours_merged: pd.DataFrame,
land_use: pd.DataFrame,
):
"""
A standard model that uses expression files to summarize pipeline tables for visualization.
Expand All @@ -221,8 +217,8 @@ def summarize(
Columns in pipeline tables can also be sliced and aggregated prior to summarization.
This preprocessing is configured in `summarize.yaml`.

Outputs a separate csv summary file for each expression;
outputs starting with '_' are saved as temporary local variables.

Outputs a separate csv summary file for each expression.
"""
trace_label = 'summarize'
model_settings_file_name = 'summarize.yaml'
Expand All @@ -237,15 +233,9 @@ def summarize(
config.config_file_path(model_settings['SPECIFICATION']), comment='#'
)

# Load dataframes from pipeline
persons = persons.to_frame()
persons_merged = persons_merged.to_frame()
households = households.to_frame()
households_merged = households_merged.to_frame()
trips = trips.to_frame()
tours = tours_merged.to_frame()
tours_merged = tours_merged.to_frame()
land_use = land_use.to_frame()

# - trips_merged - merge trips and tours_merged
trips_merged = pd.merge(
Expand All @@ -257,70 +247,53 @@ def summarize(
how="left",
)

# Add dataframes as local variables
locals_d = {
'persons': persons,
'persons_merged': persons_merged,
'households': households,
'households_merged': households_merged,
'trips': trips,
'trips_merged': trips_merged,
'tours': tours_merged,
'tours_merged': tours_merged,
'land_use': land_use,
}
locals_d = {'trips_merged': trips_merged, 'persons_merged': persons_merged}

skims = wrap_skims(network_los, trips_merged)

# Annotate trips_merged
expressions.annotate_preprocessors(
trips_merged, locals_d, skims, model_settings, 'summarize'
)

for table_name, df in locals_d.items():
if table_name in model_settings:
meta = model_settings[table_name]
df = eval(table_name)

if 'AGGREGATE' in meta and meta['AGGREGATE']:
for agg in meta['AGGREGATE']:
assert set(('column', 'label', 'map')) <= agg.keys()
df[agg['label']] = (
df[agg['column']].map(agg['map']).fillna(df[agg['column']])
)

if 'SLICERS' in meta and meta['SLICERS']:
for slicer in meta['SLICERS']:
if slicer['type'] == 'manual_breaks':
# df[slicer['label']] = pd.cut(df[slicer['column']], slicer['bin_breaks'],
# labels=slicer['bin_labels'], include_lowest=True)
df[slicer['label']] = manual_breaks(
df[slicer['column']], slicer['bin_breaks'], slicer['bin_labels']
)

elif slicer['type'] == 'quantiles':
df[slicer['label']] = quantiles(
df[slicer['column']], slicer['bins'], slicer['label_format']
)

meta = model_settings[table_name]
df = eval(table_name)
elif slicer['type'] == 'spaced_intervals':
df[slicer['label']] = spaced_intervals(
df[slicer['column']],
slicer['lower_bound'],
slicer['interval'],
slicer['label_format'],
)

if 'AGGREGATE' in meta and meta['AGGREGATE']:
for agg in meta['AGGREGATE']:
assert set(('column', 'label', 'map')) <= agg.keys()
df[agg['label']] = (
df[agg['column']].map(agg['map']).fillna(df[agg['column']])
elif slicer['type'] == 'equal_intervals':
df[slicer['label']] = equal_intervals(
df[slicer['column']], slicer['bins'], slicer['label_format']
)

if 'BIN' in meta and meta['BIN']:
for slicer in meta['BIN']:
if slicer['type'] == 'manual_breaks':
df[slicer['label']] = manual_breaks(
df[slicer['column']], slicer['bin_breaks'], slicer['bin_labels']
)

elif slicer['type'] == 'quantiles':
df[slicer['label']] = quantiles(
df[slicer['column']], slicer['bins'], slicer['label_format']
)

elif slicer['type'] == 'spaced_intervals':
df[slicer['label']] = spaced_intervals(
df[slicer['column']],
slicer['lower_bound'],
slicer['interval'],
slicer['label_format'],
)

elif slicer['type'] == 'equal_intervals':
df[slicer['label']] = equal_intervals(
df[slicer['column']], slicer['bins'], slicer['label_format']
)

# Output pipeline tables for expression development
if model_settings['EXPORT_PIPELINE_TABLES'] is True:
pipeline_table_dir = os.path.join(output_location, 'pipeline_tables')
os.makedirs(config.output_file_path(pipeline_table_dir), exist_ok=True)
for name, df in locals_d.items():
df.to_csv(config.output_file_path(os.path.join(pipeline_table_dir, f'{name}.csv')))
locals_d.update(skims)

# Add classification functions to locals
locals_d.update(
Expand All @@ -332,19 +305,19 @@ def summarize(
}
)

# Save merged tables for expression development
# locals_d['trips_merged'].to_csv(
# config.output_file_path(os.path.join(output_location, f'trips_merged.csv'))
# )
# locals_d['persons_merged'].to_csv(
# config.output_file_path(os.path.join(output_location, f'persons_merged.csv'))
# )

for i, row in spec.iterrows():

out_file = row['Output']
expr = row['Expression']

# Save temporary variables starting with underscores in locals_d
if out_file.startswith('_'):

logger.debug(f'Temp Variable: {expr} -> {out_file}')

locals_d[out_file] = eval(expr, globals(), locals_d)
continue

logger.debug(f'Summary: {expr} -> {out_file}.csv')

resultset = eval(expr, globals(), locals_d)
Expand Down
145 changes: 34 additions & 111 deletions activitysim/examples/example_mtc/configs/summarize.csv
Original file line number Diff line number Diff line change
@@ -1,115 +1,38 @@
Description,Output,Expression
# These summaries are checked by test module
,households_count,persons_merged[['household_id']].nunique().rename('households')
,trips_by_mode_count,trips_merged.groupby('trip_mode')[['number_of_participants']].sum().T
#,total_vmt,"trips_merged[['auto_distance']].sum().rename('vmt')"
#,vmt_per_capita,"pd.Series(trips_merged[['auto_distance']].sum() / len(persons_merged), name='vmt_per_capita')"
#,vmt_per_capita,"pd.Series(trips_merged[['auto_distance']].sum() / len(persons_merged), name='vmt_per_capita')"
#,vmt_per_capita_by_home_taz,(trips_merged.groupby('home_zone_id').auto_distance.sum() / persons_merged.groupby('home_zone_id').size()).fillna(0).rename('vmt_per_capita_by_home_taz').reset_index()
#,vmt_per_capita_by_work_taz,(trips_merged.groupby('workplace_zone_id').auto_distance.sum() / persons_merged[persons_merged.workplace_zone_id > 0].groupby('workplace_zone_id').size()).fillna(0).rename('vmt_per_capita_by_work_taz').reset_index()
#,households_count,persons_merged[['household_id']].nunique().rename('households')
#,persons_count,persons_merged[['household_id']].count().rename('persons')
#,person_tours,"trips_merged[['tour_id', 'number_of_participants']].drop_duplicates()[['number_of_participants']].sum().rename('tours')"
#,person_trips,trips_merged[['number_of_participants']].sum().rename('trips')
#,tours_per_household_count,"pd.Series(trips_merged[['tour_id', 'number_of_participants']].drop_duplicates()['number_of_participants'].sum() / persons_merged['household_id'].nunique(), name='tours_per_household')"
#,trips_per_household_count,"pd.Series(trips_merged['number_of_participants'].sum() / persons_merged['household_id'].nunique(), name='trips_per_household')"
#,trip_by_income_category_major_mode,"pd.merge(trips_merged, persons_merged['income_category'], left_on='person_id', right_index=True).groupby(['income_category', 'major_trip_mode']).size().unstack(-1).reset_index()"
#,trips_by_mode_count,trips_merged.groupby('trip_mode')[['number_of_participants']].sum().T#

#,trips_by_major_mode_count,trips_merged.groupby('major_trip_mode')[['number_of_participants']].sum().T#

#,trips_by_purpose_count,trips_merged.groupby('primary_purpose_trip')[['number_of_participants']].sum().T
#,trip_purpose_by_time_of_day,"trips_merged.groupby(['depart','primary_purpose_trip'])['number_of_participants'].sum().unstack(-1).reset_index()"
#,tour_mode_to_trip_mode,"trips_merged.groupby(['tour_mode','trip_mode']).size().rename('trips').reset_index()"
#,work_tours_tod_count,"trips_merged[trips_merged['tour_type'] == 'work'][['tour_id', 'start', 'number_of_participants']].drop_duplicates().groupby('start')['number_of_participants'].sum().rename('tours').reset_index()"
#,school_tours_tod_count,"trips_merged[trips_merged['tour_type'] == 'school'][['tour_id', 'start', 'number_of_participants']].drop_duplicates().groupby('start')['number_of_participants'].sum().rename('tours').reset_index()"
#,non_mandatory_tours_tod_count,"trips_merged[trips_merged['tour_category'] == 'non_mandatory'][['tour_id', 'start', 'number_of_participants']].drop_duplicates().groupby('start')['number_of_participants'].sum().rename('tours').reset_index()"
#,work_tours_tod_count,trips_merged[trips_merged.tour_type == 'work'].groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()
#,school_tours_tod_count,trips_merged[trips_merged.tour_type == 'school'].groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()
#,non_mandatory_tours_tod_count,trips_merged[trips_merged.tour_category == 'non_mandatory'].groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()#

#,low_income_share_by_taz_deciles,"quantiles(persons_merged[persons_merged['income'] < 50000].groupby('home_zone_id').size() / persons_merged.groupby('home_zone_id').size(), 10, '{rank}').rename('low_income_share_by_taz_deciles').reset_index()"#

#,persons_by_income_category,persons_merged.groupby('income_category')[['income_category']].count().T#

#,vmt_per_capita_by_home_taz_quintiles,"quantiles((trips_merged.groupby('home_zone_id').auto_distance.sum() / trips_merged.groupby('home_zone_id').number_of_participants.sum() / persons_merged.groupby('home_zone_id').size()).fillna(0), 5, '{rank}').rename('vmt_per_capita_by_home_taz_quintiles').reset_index()"#

#,non_motorized_trip_distance_bins,"spaced_intervals((trips_merged.walk_distance + trips_merged.bike_distance)[trips_merged.major_trip_mode == 'Non-Motorized'], 0, 0.25).value_counts().sort_index().rename('trips').reset_index()"#

# Calculate total VMT by summing auto_distance from tours_merged
# (auto_distance is calculated by an expression in summarize_preprocessor.csv)
# Initially save as a temporary variable so it can be reused
,_total_vmt,trips_merged[['auto_distance']].sum()
# Then save to CSV
,total_vmt,_total_vmt.rename('vmt')

# Calculate vmt per capita
# (All outputs to CSV have to be a Pandas Series or DataFrame)
,vmt_per_capita,"pd.Series(_total_vmt / len(persons_merged), name='vmt_per_capita')"

# Calculate vmt per capita by home taz
,_vmt_per_home_taz,trips_merged.groupby('home_zone_id').auto_distance.sum()
,_person_per_home_taz,persons_merged.groupby('home_zone_id').size()
,_vmt_per_capita_by_home_taz,(_vmt_per_home_taz/_person_per_home_taz).fillna(0)
,vmt_per_capita_by_home_taz,_vmt_per_capita_by_home_taz.rename('vmt_per_capita_by_home_taz').reset_index()

# Calculate vmt per capita by work taz
,_vmt_per_work_taz,trips_merged.groupby('workplace_zone_id').auto_distance.sum()
,_person_per_work_taz,persons_merged.groupby('workplace_zone_id').size()
,vmt_per_capita_by_work_taz,(_vmt_per_work_taz/_person_per_work_taz).fillna(0).rename('vmt_per_capita_by_work_taz').reset_index()

# Count persons
,persons_count,persons_merged[['household_id']].count().rename('persons')

# Count person-tours
,person_tours,"trips_merged[['tour_id', 'number_of_participants']].drop_duplicates()[['number_of_participants']].sum().rename('tours')"

# Count person-trips
,person_trips,trips_merged[['number_of_participants']].sum().rename('trips')

# Count tours
,tours_count,tours_merged.reset_index()[['tour_id']].count().rename('tours')

# Count trips
,trips_count,trips_merged.reset_index()[['trip_id']].count().rename('trips')

# Count tours per household
,_tours,"trips_merged[['tour_id', 'number_of_participants']].drop_duplicates()['number_of_participants'].sum()"
,_households,persons_merged['household_id'].nunique()
,tours_per_household_count,"pd.Series(_tours / _households, name='tours_per_household')"

# Count trips per household
,_trips,trips_merged['number_of_participants'].sum()
,trips_per_household_count,"pd.Series(_trips / _households, name='trips_per_household')"

# Count trips by major mode
#,trips_by_major_mode_count,trips_merged.groupby('major_trip_mode')[['number_of_participants']].sum().T

# Count trips by income category and major mode
,_trips_with_income,"pd.merge(trips_merged, persons_merged['income_category'], left_on='person_id', right_index=True)"
,trip_by_income_category_major_mode,"_trips_with_income.groupby(['income_category', 'major_trip_mode']).size().unstack(-1).reset_index()"

# Count trips by purpose
,trips_by_purpose_count,trips_merged.groupby('primary_purpose_trip')[['number_of_participants']].sum().T

# Count trips by purpose and departure time
,trip_purpose_by_time_of_day,"trips_merged.groupby(['depart','primary_purpose_trip'])['number_of_participants'].sum().unstack(-1).reset_index()"

# Count trips with each combination of tour mode and trip mode (for Sankey)
,tour_mode_to_trip_mode,"trips_merged.groupby(['tour_mode','trip_mode']).size().rename('trips').reset_index()"

# Count work tours by time of day
,_work_tours,trips_merged[trips_merged['tour_type'] == 'work']
,work_tours_tod_count,_work_tours.groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()

# Count school tours by time of day
,_school_tours,trips_merged[trips_merged['tour_type'] == 'school']
,school_tours_tod_count,_school_tours.groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()

# Count non-mandatory tours by time of day
,_non_mandatory_tours,trips_merged[trips_merged.tour_category == 'non_mandatory']
,non_mandatory_tours_tod_count,_non_mandatory_tours.groupby('tour_id').depart.min().reset_index().groupby('depart').size().sort_index().rename('tours').reset_index()

# TAZ population density quintiles
,_taz_pop_dens,land_use.TOTPOP/land_use.TOTACRE
,taz_population_density_quintiles,"quantiles(_taz_pop_dens, 5, '{rank}').rename('pop_dens_quintile').reset_index()"

# Calculate share of taz population that is low income by decile
# (Output deciles by specifying '{rank}' as the label format in the quantile function)
,_low_income_pop_by_taz,persons_merged[persons_merged.income < 50000].groupby('home_zone_id').size()
,_total_pop_by_taz,persons_merged.groupby('home_zone_id').size()
,_proportion_low_income_by_taz,"_low_income_pop_by_taz / _total_pop_by_taz"
,_proportion_low_income_deciles,"quantiles(_proportion_low_income_by_taz, 10, '{rank}')"
,low_income_share_by_taz_deciles,"_proportion_low_income_deciles.rename('low_income_share_by_taz_deciles').reset_index()"

# Count persons by income category
# (income_category is calculated by an expression in summarize_preprocessor.csv)
#,persons_by_income_category,persons_merged.groupby('income_category')[['income_category']].count().T

# Calculate vmt per capita quintiles by taz
# (Output quintiles by specifying '{rank}' as the label format in the quantile function)
,_vmt_per_capita_quintiles,"quantiles(_vmt_per_capita_by_home_taz, 5, '{rank}')"
,vmt_per_capita_by_home_taz_quintiles,"_vmt_per_capita_quintiles.rename('vmt_per_capita_by_home_taz_quintiles').reset_index()"

# Counts of non-motorized trips by 0.25-mile distance bins
,_non_motorized_distances,(trips_merged.walk_distance + trips_merged.bike_distance)
,_non_motorized_trips,trips_merged.major_trip_mode == 'Non-Motorized'
,_non_motorized_trip_distances,_non_motorized_distances[_non_motorized_trips]
,_counts_of_non_motorized_trips_by_distance_bin,"spaced_intervals(_non_motorized_trip_distances, 0, 0.25).value_counts()"
,non_motorized_trip_distance_bins,"_counts_of_non_motorized_trips_by_distance_bin.sort_index().rename('trips').reset_index()"

# Counts of trips by income and travel time category
#,trips_by_income_and_travel_time_category,"trips_merged.groupby(['trip_income_category','total_time_category']).size().rename('trips').unstack(-2).reset_index()"






,taz_count,"len(land_use)"
Loading