From f7377a5379e18e37b56b057b70495b5e8264b405 Mon Sep 17 00:00:00 2001 From: Austen Sharpe <> Date: Fri, 19 Mar 2021 12:51:40 -0600 Subject: [PATCH] fix line length to hard 88 --- src/pudl/transform/eia860.py | 115 ++++++++++------ src/pudl/transform/eia861.py | 249 ++++++++++++++++++++-------------- src/pudl/transform/eia923.py | 163 ++++++++++++++-------- src/pudl/transform/epacems.py | 23 ++-- src/pudl/transform/ferc1.py | 244 +++++++++++++++++++++++++-------- src/pudl/transform/ferc714.py | 29 +++- 6 files changed, 552 insertions(+), 271 deletions(-) diff --git a/src/pudl/transform/eia860.py b/src/pudl/transform/eia860.py index 61de118a4c..b6087f3634 100644 --- a/src/pudl/transform/eia860.py +++ b/src/pudl/transform/eia860.py @@ -17,17 +17,22 @@ def ownership(eia860_dfs, eia860_transformed_dfs): Transformations include: - - Replace . values with NA. - - Convert pre-2012 ownership percentages to proportions to match post-2012 reporting. + * Replace . values with NA. + * Convert pre-2012 ownership percentages to proportions to match post-2012 + reporting. Args: eia860_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA860 form, as reported in the Excel spreadsheets they distribute - eia860_transformed_dfs (dict): A dictionary of DataFrame objects in - which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + corresponds to a page from the EIA860 form, as reported in the Excel + spreadsheets they distribute. + eia860_transformed_dfs (dict): A dictionary of DataFrame objects in which pages + from EIA860 form (keys) correspond to normalized DataFrames of values from + that page (values). 
Returns: - dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which + pages from EIA860 form (keys) correspond to normalized DataFrames of values + from that page (values). """ # Preiminary clean and get rid of unecessary 'year' column @@ -72,29 +77,37 @@ def generators(eia860_dfs, eia860_transformed_dfs): """ Pull and transform the generators table. - There are three tabs that the generator records come from (proposed, existing, retired). Pre 2009, the existing and retired data are lumped together under a single generator file with one tab. We pull each tab into one dataframe and include an ``operational_status`` to indicate which tab the record came from. We use ``operational_status`` to parse the pre 2009 files as well. + There are three tabs that the generator records come from (proposed, existing, + retired). Pre 2009, the existing and retired data are lumped together under a single + generator file with one tab. We pull each tab into one dataframe and include an + ``operational_status`` to indicate which tab the record came from. We use + ``operational_status`` to parse the pre 2009 files as well. Transformations include: - - Replace . values with NA. - - Update ``operational_status_code`` to reflect plant status as either - proposed, existing or retired. - - Drop values with NA for plant and generator id. - - Replace 0 values with NA where appropriate. - - Convert Y/N/X values to boolean True/False. - - Convert U/Unknown values to NA. - - Map full spelling onto code values. - - Create a fuel_type_code_pudl field that organizes fuel types into + * Replace . values with NA. + * Update ``operational_status_code`` to reflect plant status as either proposed, + existing or retired. + * Drop values with NA for plant and generator id. 
+ * Replace 0 values with NA where appropriate. + * Convert Y/N/X values to boolean True/False. + * Convert U/Unknown values to NA. + * Map full spelling onto code values. + * Create a fuel_type_code_pudl field that organizes fuel types into clean, distinguishable categories. Args: eia860_dfs (dict): Each entry in this - dictionary of DataFrame objects corresponds to a page from the EIA860 form, as reported in the Excel spreadsheets they distribute. + dictionary of DataFrame objects corresponds to a page from the EIA860 form, + as reported in the Excel spreadsheets they distribute. eia860_transformed_dfs (dict): A dictionary of DataFrame objects in - which pages from EIA860 form (keys) correspond to a normalized DataFrame of values from that page (values) + which pages from EIA860 form (keys) correspond to a normalized DataFrame of + values from that page (values). Returns: - dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values). + dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA860 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # Groupby objects were creating chained assignment warning that is N/A @@ -252,22 +265,29 @@ def plants(eia860_dfs, eia860_transformed_dfs): """ Pull and transform the plants table. - Much of the static plant information is reported repeatedly, and scattered across several different pages of EIA 923. The data frame which this function uses is assembled from those many different pages, and passed in via the same dictionary of dataframes that all the other ingest functions use for uniformity. + Much of the static plant information is reported repeatedly, and scattered across + several different pages of EIA 923. 
The data frame which this function uses is + assembled from those many different pages, and passed in via the same dictionary of + dataframes that all the other ingest functions use for uniformity. Transformations include: - - Replace . values with NA. - - Homogenize spelling of county names. - - Convert Y/N/X values to boolean True/False. + * Replace . values with NA. + * Homogenize spelling of county names. + * Convert Y/N/X values to boolean True/False. Args: eia860_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA860 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA860 form, as reported in the Excel + spreadsheets they distribute. eia860_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA860 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA860 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # Populating the 'plants_eia860' table @@ -336,17 +356,21 @@ def boiler_generator_assn(eia860_dfs, eia860_transformed_dfs): Transformations include: - - Drop non-data rows with EIA notes. - - Drop duplicate rows. + * Drop non-data rows with EIA notes. + * Drop duplicate rows. Args: eia860_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA860 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA860 form, as reported in the Excel + spreadsheets they distribute. 
eia860_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA860 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA860 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # Populating the 'generators_eia860' table @@ -388,22 +412,26 @@ def utilities(eia860_dfs, eia860_transformed_dfs): Transformations include: - - Replace . values with NA. - - Fix typos in state abbreviations, convert to uppercase. - - Drop address_3 field (all NA). - - Combine phone number columns into one field and set values that don't - mimic real US phone numbers to NA. - - Convert Y/N/X values to boolean True/False. - - Map full spelling onto code values. + * Replace . values with NA. + * Fix typos in state abbreviations, convert to uppercase. + * Drop address_3 field (all NA). + * Combine phone number columns into one field and set values that don't mimic real + US phone numbers to NA. + * Convert Y/N/X values to boolean True/False. + * Map full spelling onto code values. Args: eia860_dfs (dict): Each entry in this - dictionary of DataFrame objects corresponds to a page from the EIA860 form, as reported in the Excel spreadsheets they distribute. + dictionary of DataFrame objects corresponds to a page from the EIA860 form, + as reported in the Excel spreadsheets they distribute. 
eia860_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA860 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA860 form (keys) correspond to normalized DataFrames of values from that page (values) + dict: eia860_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA860 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # Populating the 'utilities_eia860' table @@ -485,10 +513,11 @@ def transform(eia860_raw_dfs, eia860_tables=pc.pudl_tables["eia860"]): eia860_raw_dfs (dict): a dictionary of tab names (keys) and DataFrames (values). This can be generated by pudl. eia860_tables (tuple): A tuple containing the names of the EIA 860 tables that - can be pulled into PUDL + can be pulled into PUDL. Returns: - dict: A dictionary of DataFrame objects in which pages from EIA860 form (keys) corresponds to a normalized DataFrame of values from that page (values) + dict: A dictionary of DataFrame objects in which pages from EIA860 form (keys) + corresponds to a normalized DataFrame of values from that page (values). """ # these are the tables that we have transform functions for... diff --git a/src/pudl/transform/eia861.py b/src/pudl/transform/eia861.py index 862d854900..8b1adce849 100644 --- a/src/pudl/transform/eia861.py +++ b/src/pudl/transform/eia861.py @@ -440,12 +440,12 @@ def _ba_code_backfill(df): retained. Args: - ba_eia861 (pandas.DataFrame): The transformed EIA 861 Balancing - Authority dataframe (balancing_authority_eia861). + ba_eia861 (pandas.DataFrame): The transformed EIA 861 Balancing Authority + dataframe (balancing_authority_eia861). 
Returns: - pandas.DataFrame: The balancing_authority_eia861 dataframe, but with - many fewer NA values in the balancing_authority_code_eia column. + pandas.DataFrame: The balancing_authority_eia861 dataframe, but with many fewer + NA values in the balancing_authority_code_eia column. """ start_len = len(df) @@ -454,7 +454,9 @@ def _ba_code_backfill(df): f"Started with {start_nas} missing BA Codes out of {start_len} " f"records ({start_nas/start_len:.2%})") ba_ids = ( - df[["balancing_authority_id_eia", "balancing_authority_code_eia", "report_date"]] + df[["balancing_authority_id_eia", + "balancing_authority_code_eia", + "report_date"]] .drop_duplicates() .sort_values(["balancing_authority_id_eia", "report_date"]) ) @@ -488,7 +490,8 @@ def _tidy_class_dfs(df, df_name, idx_cols, class_list, class_type, keep_totals=F if 'balancing_authority_code_eia' in idx_cols: df = ( df.assign( - balancing_authority_code_eia=lambda x: x.balancing_authority_code_eia.fillna("UNK")) + balancing_authority_code_eia=( + lambda x: x.balancing_authority_code_eia.fillna("UNK"))) ) raw_df = ( df.dropna(subset=["utility_id_eia"]) @@ -504,8 +507,8 @@ def _tidy_class_dfs(df, df_name, idx_cols, class_list, class_type, keep_totals=F # deliniated in the class_list not just an underscore. This enables prefixes with # underscores such as fuel_cell as opposed to single-word prefixes followed by # underscores. Final string looks like: '(?<=customer_test)_|(?<=unbundled)_' - # This ensures that the underscore AFTER the desired string (that can also include underscores) - # is where the column headers are split, not just the first underscore. + # This ensures that the underscore AFTER the desired string (that can also include + # underscores) is where the column headers are split, not just the first underscore. 
class_list_regex = '|'.join(['(?<=' + col + ')_' for col in class_list]) data_cols.columns = ( @@ -618,11 +621,11 @@ def _compare_totals(data_cols, idx_cols, class_type, df_name): def _clean_nerc(df, idx_cols): """Clean NERC region entries and make new rows for multiple nercs. - This function examines reported NERC regions and makes sure the output column of - the same name has reliable, singular NERC region acronyms. To do so, this function - identifies entries where there are two or more NERC regions specified in a single cell - (such as SPP & ERCOT) and makes new, duplicate rows for each NERC region. It also - converts non-recognized reported nerc regions to 'UNK'. + This function examines reported NERC regions and makes sure the output column of the + same name has reliable, singular NERC region acronyms. To do so, this function + identifies entries where there are two or more NERC regions specified in a single + cell (such as SPP & ERCOT) and makes new, duplicate rows for each NERC region. It + also converts non-recognized reported nerc regions to 'UNK'. Args: df (pandas.DataFrame): A DataFrame with the column 'nerc_region' to be cleaned. 
@@ -657,9 +660,11 @@ def _clean_nerc(df, idx_cols): nerc_col = nerc_df['nerc_region'].tolist() nerc_list = list(set([item for sublist in nerc_col for item in sublist])) non_nerc_list = [ - nerc_entity for nerc_entity in nerc_list if nerc_entity not in pc.RECOGNIZED_NERC_REGIONS + list(NERC_SPELLCHECK.keys())] + nerc_entity for nerc_entity in nerc_list + if nerc_entity not in pc.RECOGNIZED_NERC_REGIONS + list(NERC_SPELLCHECK.keys())] print( - f'The following reported NERC regions are not currently recognized and become UNK values: {non_nerc_list}') + f'The following reported NERC regions are not currently recognized and become \ + UNK values: {non_nerc_list}') # Function to turn instances of 'SPP_UNK' or 'SPP_SPP' into 'SPP' def _remove_nerc_duplicates(entity_list): @@ -670,12 +675,16 @@ def _remove_nerc_duplicates(entity_list): entity_list = [entity_list[0]] return entity_list - # Go through the nerc regions, spellcheck errors, delete those that aren't recognized, and piece them back together - # (with _ separator if more than one recognized) + # Go through the nerc regions, spellcheck errors, delete those that aren't + # recognized, and piece them back together (with _ separator if more than one + # recognized) nerc_df['nerc_region'] = ( nerc_df['nerc_region'] - .apply(lambda x: [i if i not in NERC_SPELLCHECK.keys() else NERC_SPELLCHECK[i] for i in x]) - .apply(lambda x: sorted([i if i in pc.RECOGNIZED_NERC_REGIONS else 'UNK' for i in x])) + .apply(lambda x: ( + [i if i not in NERC_SPELLCHECK.keys() + else NERC_SPELLCHECK[i] for i in x])) + .apply(lambda x: sorted( + [i if i in pc.RECOGNIZED_NERC_REGIONS else 'UNK' for i in x])) .apply(lambda x: _remove_nerc_duplicates(x)) .str.join('_') ) @@ -712,10 +721,12 @@ def _compare_nerc_physical_w_nerc_operational(df): # Set NA states to UNK df['state'] = df['state'].fillna('UNK') - # Create column indicating whether the nerc region matches the nerc region of operation (TRUE) + # Create column indicating whether the nerc 
region matches the nerc region of + # operation (TRUE) df['nerc_match'] = df['nerc_region'] == df['nerc_regions_of_operation'] - # Group by utility, state, and report date to see which groups have at least one TRUE value + # Group by utility, state, and report date to see which groups have at least one + # TRUE value grouped_nerc_match_bools = ( df.groupby(['utility_id_eia', 'state', 'report_date']) [['nerc_match']].any() @@ -723,8 +734,8 @@ def _compare_nerc_physical_w_nerc_operational(df): .rename(columns={'nerc_match': 'nerc_group_match'}) ) - # Merge back with original df to show cases where there are multiple non-matching nerc values - # per utility id, year, and state. + # Merge back with original df to show cases where there are multiple non-matching + # nerc values per utility id, year, and state. expanded_nerc_match_bools = ( pd.merge(df, grouped_nerc_match_bools, @@ -766,16 +777,18 @@ def service_territory(tfr_dfs): Transformations include: - - Homogenize spelling of county names. - - Add field for state/county FIPS code. + * Homogenize spelling of county names. + * Add field for state/county FIPS code. Args: tfr_dfs (dict): A dictionary of DataFrame objects in which pages from EIA861 - form (keys) correspond to normalized DataFrames of values from that page (values). + form (keys) correspond to normalized DataFrames of values from that page + (values). Returns: dict: a dictionary of pandas.DataFrame objects in which pages from EIA861 form - (keys) correspond to normalized DataFrames of values from that page (values). + (keys) correspond to normalized DataFrames of values from that page + (values). """ # No data tidying required @@ -803,9 +816,9 @@ def balancing_authority(tfr_dfs): Transformations include: - - Fill in balancing authrority IDs based on date, utility ID, and BA Name. - - Backfill balancing authority codes based on BA ID. - - Fix BA code and ID typos. + * Fill in balancing authrority IDs based on date, utility ID, and BA Name. 
+ * Backfill balancing authority codes based on BA ID. + * Fix BA code and ID typos. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -853,16 +866,29 @@ def balancing_authority_assn(tfr_dfs): """ Compile a balancing authority, utility, state association table. - For the years up through 2012, the only BA-Util information that's available comes from the balancing_authority_eia861 table, and it does not include any state-level information. However, there is utility-state association information in the sales_eia861 and other data tables. + For the years up through 2012, the only BA-Util information that's available comes + from the balancing_authority_eia861 table, and it does not include any state-level + information. However, there is utility-state association information in the + sales_eia861 and other data tables. - For the years from 2013 onward, there's explicit BA-Util-State information in the data tables (e.g. sales_eia861). These observed associations can be compiled to give us a picture of which BA-Util-State associations exist. However, we need to merge in the balancing authority IDs since the data tables only contain the balancing authority codes. + For the years from 2013 onward, there's explicit BA-Util-State information in the + data tables (e.g. sales_eia861). These observed associations can be compiled to give + us a picture of which BA-Util-State associations exist. However, we need to merge in + the balancing authority IDs since the data tables only contain the balancing + authority codes. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 dataframes. This must - include any dataframes from which we want to compile BA-Util-State associations, which means this function has to be called after all the basic transformfunctions that depend on only a single raw table. 
+ include any dataframes from which we want to compile BA-Util-State + associations, which means this function has to be called after all the basic + transformfunctions that depend on only a single raw table. Returns: - dict: a dictionary of transformed dataframes. This function both compiles the association table, and finishes the normalization of the balancing authority table. It may be that once the harvesting process incorporates the EIA 861, some or all of this functionality should be pulled into the phase-2 transform functions. + dict: a dictionary of transformed dataframes. This function both compiles the + association table, and finishes the normalization of the balancing authority + table. It may be that once the harvesting process incorporates the EIA 861, some + or all of this functionality should be pulled into the phase-2 transform + functions. """ # These aren't really "data" tables, and should not be searched for associations @@ -1035,12 +1061,12 @@ def sales(tfr_dfs): Transformations include: - - Remove rows with utility ids 88888 and 99999. - - Tidy data by customer class. - - Drop primary key duplicates. - - Convert 1000s of dollars into dollars. - - Convert data_observed field I/O into boolean. - - Map full spelling onto code values. + * Remove rows with utility ids 88888 and 99999. + * Tidy data by customer class. + * Drop primary key duplicates. + * Convert 1000s of dollars into dollars. + * Convert data_observed field I/O into boolean. + * Map full spelling onto code values. """ idx_cols = [ @@ -1119,8 +1145,8 @@ def advanced_metering_infrastructure(tfr_dfs): Transformations include: - - Tidy data by customer class. - - Drop total_meters columns (it's calculable with other fields). + * Tidy data by customer class. + * Drop total_meters columns (it's calculable with other fields). 
Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1168,10 +1194,11 @@ def demand_response(tfr_dfs): Transformations include: - - Fill in NA balancing authority codes with UNK (because it's part of the primary key). - - Tidy subset of the data by customer class. - - Drop duplicate rows based on primary keys. - - Convert 1000s of dollars into dollars. + * Fill in NA balancing authority codes with UNK (because it's part of the primary + key). + * Tidy subset of the data by customer class. + * Drop duplicate rows based on primary keys. + * Convert 1000s of dollars into dollars. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1250,18 +1277,25 @@ def demand_side_management(tfr_dfs): """ Transform the EIA 861 Demand Side Management table. - In 2013, the EIA changed the contents of the 861 form so that information - pertaining to demand side management was no longer housed in a single table, - but rather two seperate ones pertaining to energy efficiency and demand response. - While the pre and post 2013 tables contain similar information, one column in the pre-2013 demand side management table may not have an obvious column equivalent in the post-2013 energy efficiency or demand response data. We've addressed this by keeping the demand side management and energy efficiency and demand response tables seperate. Use the DSM table for pre 2013 data and the EE / DR tables for post 2013 data. Despite the uncertainty of comparing across these years, the data are similar and we hope to provide a cohesive dataset in the future with all years and comprable columns combined. + In 2013, the EIA changed the contents of the 861 form so that information pertaining + to demand side management was no longer housed in a single table, but rather two + seperate ones pertaining to energy efficiency and demand response. 
While the pre and + post 2013 tables contain similar information, one column in the pre-2013 demand side + management table may not have an obvious column equivalent in the post-2013 energy + efficiency or demand response data. We've addressed this by keeping the demand side + management and energy efficiency and demand response tables seperate. Use the DSM + table for pre 2013 data and the EE / DR tables for post 2013 data. Despite the + uncertainty of comparing across these years, the data are similar and we hope to + provide a cohesive dataset in the future with all years and comprable columns + combined. Transformations include: - - Clean up NERC codes and ensure one per row. - - Remove demand_side_management and data_observed columns (they are all the same). - - Tidy subset of the data by customer class. - - Convert Y/N columns to booleans. - - Convert 1000s of dollars into dollars. + * Clean up NERC codes and ensure one per row. + * Remove demand_side_management and data_observed columns (they are all the same). + * Tidy subset of the data by customer class. + * Convert Y/N columns to booleans. + * Convert 1000s of dollars into dollars. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1304,7 +1338,8 @@ def demand_side_management(tfr_dfs): raw_dsm = tfr_dfs['demand_side_management_eia861'].copy() ########################################################################### - # Transform Data Round 1 (must be done to avoid issues with nerc_region col in _tidy_class_dfs()) + # Transform Data Round 1 (must be done to avoid issues with nerc_region col in + # _tidy_class_dfs()) # * Clean NERC region col # * Drop data_status and demand_side_management cols (they don't contain anything) ########################################################################### @@ -1415,11 +1450,11 @@ def distributed_generation(tfr_dfs): Transformations include: - - Map full spelling onto code values. 
- - Convert pre-2010 percent values in mw values. - - Remove total columns calculable with other fields - - Tidy subset of the data by tech class. - - Tidy subset of the data by fuel class. + * Map full spelling onto code values. + * Convert pre-2010 percent values in mw values. + * Remove total columns calculable with other fields. + * Tidy subset of the data by tech class. + * Tidy subset of the data by fuel class. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1501,9 +1536,9 @@ def distributed_generation(tfr_dfs): # * Turn pct values into mw values # * Remove old pct cols and totals cols # Explanation: Pre 2010 reporting asks for components as a percent of total capacity - # whereas after 2010, the forms ask for the component portion as a mw value. In order - # To coalesce similar data, we've used total values to turn percent values from pre 2010 - # into mw values like those post-2010. + # whereas after 2010, the forms ask for the component portion as a mw value. In + # order to coalesce similar data, we've used total values to turn percent values + # from pre 2010 into mw values like those post-2010. ########################################################################### # Separate datasets into years with only pct values (pre-2010) and years with only mw values (post-2010) @@ -1587,7 +1622,7 @@ def distribution_systems(tfr_dfs): Transformations include: - - No additional transformations. + * No additional transformations. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1618,8 +1653,8 @@ def dynamic_pricing(tfr_dfs): Transformations include: - - Tidy subset of the data by customer class. - - Convert Y/N columns to booleans. + * Tidy subset of the data by customer class. + * Convert Y/N columns to booleans. 
Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1685,9 +1720,9 @@ def energy_efficiency(tfr_dfs): Transformations include: - - Tidy subset of the data by customer class. - - Drop website column (almost no valid information). - - Convert 1000s of dollars into dollars. + * Tidy subset of the data by customer class. + * Drop website column (almost no valid information). + * Convert 1000s of dollars into dollars. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1756,8 +1791,8 @@ def green_pricing(tfr_dfs): Transformations include: - - Tidy subset of the data by customer class. - - Convert 1000s of dollars into dollars. + * Tidy subset of the data by customer class. + * Convert 1000s of dollars into dollars. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1816,8 +1851,8 @@ def mergers(tfr_dfs): Transformations include: - - Map full spelling onto code values. - - Retain preceeding zeros in zipcode field. + * Map full spelling onto code values. + * Retain preceeding zeros in zipcode field. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1857,9 +1892,9 @@ def net_metering(tfr_dfs): Transformations include: - - Remove rows with utility ids 99999. - - Tidy subset of the data by customer class. - - Tidy subset of the data by tech class. + * Remove rows with utility ids 99999. + * Tidy subset of the data by customer class. + * Tidy subset of the data by tech class. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -1941,10 +1976,10 @@ def non_net_metering(tfr_dfs): Transformations include: - - Remove rows with utility ids 99999. - - Drop duplicate rows. - - Tidy subset of the data by customer class. - - Tidy subset of the data by tech class. + * Remove rows with utility ids 99999. + * Drop duplicate rows. + * Tidy subset of the data by customer class. 
+ * Tidy subset of the data by tech class. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -2018,9 +2053,11 @@ def non_net_metering(tfr_dfs): keep_totals=True ) - # No duplicates to speak of (deleted 2018 duplicates above) but take measures to check just in case + # No duplicates to speak of (deleted 2018 duplicates above) but take measures to + # check just in case _check_for_dupes( - tidy_nnm_customer_fuel_class, 'Non Net Metering Customer & Fuel Class', idx_cols) + tidy_nnm_customer_fuel_class, + 'Non Net Metering Customer & Fuel Class', idx_cols) # Delete total_capacity_mw col for redundancy (must delete x not y) tidy_nnm_customer_fuel_class = ( @@ -2033,7 +2070,8 @@ def non_net_metering(tfr_dfs): # Drop original net_metering_eia861 table from tfr_dfs del tfr_dfs['non_net_metering_eia861'] - tfr_dfs["non_net_metering_customer_fuel_class_eia861"] = tidy_nnm_customer_fuel_class + tfr_dfs["non_net_metering_customer_fuel_class_eia861"] = ( + tidy_nnm_customer_fuel_class) tfr_dfs["non_net_metering_misc_eia861"] = raw_nnm_misc return tfr_dfs @@ -2045,12 +2083,12 @@ def operational_data(tfr_dfs): Transformations include: - - Remove rows with utility ids 88888. - - Remove rows with NA utility id. - - Clean up NERC codes and ensure one per row. - - Convert data_observed field I/O into boolean. - - Tidy subset of the data by revenue class. - - Convert 1000s of dollars into dollars. + * Remove rows with utility ids 88888. + * Remove rows with NA utility id. + * Clean up NERC codes and ensure one per row. + * Convert data_observed field I/O into boolean. + * Tidy subset of the data by revenue class. + * Convert 1000s of dollars into dollars. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -2139,10 +2177,10 @@ def reliability(tfr_dfs): Transformations include: - - Tidy subset of the data by reliability standard. - - Convert Y/N columns to booleans. - - Map full spelling onto code values. 
- - Drop duplicate rows. + * Tidy subset of the data by reliability standard. + * Convert Y/N columns to booleans. + * Map full spelling onto code values. + * Drop duplicate rows. Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -2179,7 +2217,8 @@ def reliability(tfr_dfs): ########################################################################### # Transform Data: - # * Re-code outages_recorded_automatically and inactive_accounts_included to boolean: + # * Re-code outages_recorded_automatically and inactive_accounts_included to + # boolean: # * Y/y="Yes" => True # * N/n="No" => False # * Expand momentary_interruption_definition: @@ -2218,11 +2257,11 @@ def utility_data(tfr_dfs): Transformations include: - - Remove rows with utility ids 88888. - - Clean up NERC codes and ensure one per row. - - Tidy subset of the data by NERC region. - - Tidy subset of the data by RTO. - - Convert Y/N columns to booleans. + * Remove rows with utility ids 88888. + * Clean up NERC codes and ensure one per row. + * Tidy subset of the data by NERC region. + * Tidy subset of the data by RTO. + * Convert Y/N columns to booleans. 
Args: tfr_dfs (dict): A dictionary of transformed EIA 861 DataFrames, keyed by table @@ -2246,10 +2285,11 @@ def utility_data(tfr_dfs): ) - ########################################################################### - # Transform Data Round 1 (must be done to avoid issues with nerc_region col in _tidy_class_dfs()) + ############################################################################## + # Transform Data Round 1 (must be done to avoid issues with nerc_region col in + # _tidy_class_dfs()) # * Clean NERC region col - ########################################################################### + ############################################################################## transformed_ud = _clean_nerc(raw_ud, idx_cols) @@ -2374,7 +2414,8 @@ def transform(raw_dfs, eia861_tables=pc.pudl_tables["eia861"]): can be pulled into PUDL. Returns: - dict: A dictionary of DataFrame objects in which pages from EIA 861 form (keys) corresponds to a normalized DataFrame of values from that page (values). + dict: A dictionary of DataFrame objects in which pages from EIA 861 form (keys) + corresponds to a normalized DataFrame of values from that page (values). """ # these are the tables that we have transform functions for... diff --git a/src/pudl/transform/eia923.py b/src/pudl/transform/eia923.py index 600a9a9ce7..22cc4f918b 100644 --- a/src/pudl/transform/eia923.py +++ b/src/pudl/transform/eia923.py @@ -18,16 +18,23 @@ def _yearly_to_monthly_records(df, md): """Converts an EIA 923 record of 12 months of data into 12 monthly records. - Much of the data reported in EIA 923 is monthly, but all 12 months worth of data is reported in a single record, with one field for each of the 12 months. This function converts these annualized composite records into a set of 12 monthly records containing the same information, by parsing the field names for months, and adding a month field. Non - time series data is retained in the same format. 
+ Much of the data reported in EIA 923 is monthly, but all 12 months worth of data is + reported in a single record, with one field for each of the 12 months. This + function converts these annualized composite records into a set of 12 monthly + records containing the same information, by parsing the field names for months, and + adding a month field. Non - time series data is retained in the same format. Args: df (pandas.DataFrame): A pandas DataFrame containing the annual data to be converted into monthly records. md (dict): a dictionary with the integers 1-12 as keys, and the patterns used - to match field names for each of the months as values. These patterns are also used to rename the columns in the dataframe which is returned, so they need to match the entire portion of the column name that is month specific. + to match field names for each of the months as values. These patterns are + also used to rename the columns in the dataframe which is returned, so they + need to match the entire portion of the column name that is month specific. Returns: - pandas.DataFrame: A dataframe containing the same data as was passed in via df, but with monthly records instead of annual records. + pandas.DataFrame: A dataframe containing the same data as was passed in via df, + but with monthly records instead of annual records. """ yearly = df.copy() @@ -63,11 +70,20 @@ def _yearly_to_monthly_records(df, md): def _coalmine_cleanup(cmi_df): """Cleans up the coalmine_eia923 table. - This function does most of the coalmine_eia923 table transformation. It is separate from the coalmine() transform function because of the peculiar way that we are normalizing the fuel_receipts_costs_eia923() table. + This function does most of the coalmine_eia923 table transformation. It is separate + from the coalmine() transform function because of the peculiar way that we are + normalizing the fuel_receipts_costs_eia923() table. 
- All of the coalmine information is originally coming from the EIA fuel_receipts_costs spreadsheet, but it really belongs in its own table. We strip it out of FRC, and create that separate table, but then we need to refer to that table through a foreign key. To do so, we actually merge the entire contents of the coalmine table into FRC, including the surrogate key, and then drop the data fields. + All of the coalmine information is originally coming from the EIA + fuel_receipts_costs spreadsheet, but it really belongs in its own table. We strip it + out of FRC, and create that separate table, but then we need to refer to that table + through a foreign key. To do so, we actually merge the entire contents of the + coalmine table into FRC, including the surrogate key, and then drop the data fields. - For this to work, we need to have exactly the same coalmine data fields in both the new coalmine table, and the FRC table. To ensure that's true, we isolate the transformations here in this function, and apply them to the coalmine columns in both the FRC table and the coalmine table. + For this to work, we need to have exactly the same coalmine data fields in both the + new coalmine table, and the FRC table. To ensure that's true, we isolate the + transformations here in this function, and apply them to the coalmine columns in + both the FRC table and the coalmine table. Args: cmi_df (pandas.DataFrame): A DataFrame to be cleaned, containing coalmine @@ -127,23 +143,30 @@ def _coalmine_cleanup(cmi_df): def plants(eia923_dfs, eia923_transformed_dfs): """Transforms the plants_eia923 table. - Much of the static plant information is reported repeatedly, and scattered across several different pages of EIA 923. The data frame that this function uses is assembled from those many different pages, and passed in via the same dictionary of dataframes that all the other ingest functions use for uniformity. 
+ Much of the static plant information is reported repeatedly, and scattered across + several different pages of EIA 923. The data frame that this function uses is + assembled from those many different pages, and passed in via the same dictionary of + dataframes that all the other ingest functions use for uniformity. Transformations include: - - Map full spelling onto code values. - - Convert Y/N columns to booleans. - - Remove excess white space around values. - - Drop duplicate rows. + * Map full spelling onto code values. + * Convert Y/N columns to booleans. + * Remove excess white space around values. + * Drop duplicate rows. Args: eia923_dfs (dictionary of pandas.DataFrame): Each entry in this dictionary of - DataFrame objects corresponds to a page from the EIA 923 form, as reported in the Excel spreadsheets they distribute. + DataFrame objects corresponds to a page from the EIA 923 form, as reported + in the Excel spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values) + dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA923 form (keys) correspond to normalized DataFrames of values from that + page (values). """ plant_info_df = eia923_dfs['plant_frame'].copy() @@ -185,21 +208,25 @@ def generation_fuel(eia923_dfs, eia923_transformed_dfs): Transformations include: - - Remove fields implicated elsewhere. - - Replace . values with NA. - - Remove rows with utility ids 99999. - - Create a fuel_type_code_pudl field that organizes fuel types into + * Remove fields implicated elsewhere. + * Replace . 
values with NA. + * Remove rows with utility ids 99999. + * Create a fuel_type_code_pudl field that organizes fuel types into clean, distinguishable categories. - - Combine year and month columns into a single date column. + * Combine year and month columns into a single date column. Args: eia923_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA923 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA923 form, as reported in the Excel + spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA923 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # This needs to be a copy of what we're passed in so we can edit it. @@ -232,8 +259,10 @@ def generation_fuel(eia923_dfs, eia923_transformed_dfs): # any particular plant (they have plant_id_eia == operator_id == 99999) gf_df = gf_df[gf_df.plant_id_eia != 99999] - gf_df['fuel_type_code_pudl'] = pudl.helpers.cleanstrings_series(gf_df.fuel_type, - pc.fuel_type_eia923_gen_fuel_simple_map) + gf_df['fuel_type_code_pudl'] = ( + pudl.helpers.cleanstrings_series(gf_df.fuel_type, + pc.fuel_type_eia923_gen_fuel_simple_map) + ) # Convert Year/Month columns into a single Date column... gf_df = pudl.helpers.convert_to_date(gf_df) @@ -248,21 +277,25 @@ def boiler_fuel(eia923_dfs, eia923_transformed_dfs): Transformations include: - - Remove fields implicated elsewhere. 
- - Drop values with plant and boiler id values of NA. - - Replace . values with NA. - - Create a fuel_type_code_pudl field that organizes fuel types into clean, distinguishable categories. - - Combine year and month columns into a single date column. + * Remove fields implicated elsewhere. + * Drop values with plant and boiler id values of NA. + * Replace . values with NA. + * Create a fuel_type_code_pudl field that organizes fuel types into clean, + distinguishable categories. + * Combine year and month columns into a single date column. Args: eia923_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA923 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA923 form, as reported in the Excel + spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). """ bf_df = eia923_dfs['boiler_fuel'].copy() @@ -305,19 +338,23 @@ def generation(eia923_dfs, eia923_transformed_dfs): Transformations include: - - Drop rows with NA for generator id. - - Remove fields implicated elsewhere. - - Replace . values with NA. - - Drop generator-date row duplicates (all have no data). + * Drop rows with NA for generator id. + * Remove fields implicated elsewhere. + * Replace . values with NA. + * Drop generator-date row duplicates (all have no data). 
Args: eia923_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA923 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA923 form, as reported in the Excel + spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA923 form (keys) correspond to normalized DataFrames of values from that + page (values). """ gen_df = ( @@ -357,17 +394,21 @@ def coalmine(eia923_dfs, eia923_transformed_dfs): Transformations include: - - Remove fields implicated elsewhere. - - Drop duplicates with MSHA ID. + * Remove fields implicated elsewhere. + * Drop duplicates with MSHA ID. Args: eia923_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA923 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA923 form, as reported in the Excel + spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values) + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). 
+ dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA923 form (keys) correspond to normalized DataFrames of values from that + page (values). """ # These are the columns that we want to keep from FRC for the @@ -431,22 +472,26 @@ def fuel_receipts_costs(eia923_dfs, eia923_transformed_dfs): Transformations include: - - Remove fields implicated elsewhere. - - Replace . values with NA. - - Standardize codes values. - - Fix dates. - - Replace invalid mercury content values with NA. + * Remove fields implicated elsewhere. + * Replace . values with NA. + * Standardize codes values. + * Fix dates. + * Replace invalid mercury content values with NA. Fuel cost is reported in cents per mmbtu. Converts cents to dollars. Args: eia923_dfs (dict): Each entry in this dictionary of DataFrame objects - corresponds to a page from the EIA923 form, as reported in the Excel spreadsheets they distribute. + corresponds to a page from the EIA923 form, as reported in the Excel + spreadsheets they distribute. eia923_transformed_dfs (dict): A dictionary of DataFrame objects in which pages - from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + from EIA923 form (keys) correspond to normalized DataFrames of values from + that page (values). Returns: - dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages from EIA923 form (keys) correspond to normalized DataFrames of values from that page (values). + dict: eia923_transformed_dfs, a dictionary of DataFrame objects in which pages + from EIA923 form (keys) correspond to normalized DataFrames of values from that + page (values). """ frc_df = eia923_dfs['fuel_receipts_costs'].copy() @@ -491,10 +536,13 @@ def fuel_receipts_costs(eia923_dfs, eia923_transformed_dfs): pipe(pudl.helpers.fix_int_na, columns=['contract_expiration_date', ]). assign( # Standardize case on transportaion codes -- all upper case! 
- primary_transportation_mode_code=lambda x: x.primary_transportation_mode_code.str.upper(), - secondary_transportation_mode_code=lambda x: x.secondary_transportation_mode_code.str.upper(), + primary_transportation_mode_code=lambda x: ( + x.primary_transportation_mode_code.str.upper()), + secondary_transportation_mode_code=lambda x: ( + x.secondary_transportation_mode_code.str.upper()), fuel_cost_per_mmbtu=lambda x: x.fuel_cost_per_mmbtu / 100, - fuel_group_code=lambda x: x.fuel_group_code.str.lower().str.replace(' ', '_'), + fuel_group_code=lambda x: ( + x.fuel_group_code.str.lower().str.replace(' ', '_')), fuel_type_code_pudl=lambda x: pudl.helpers.cleanstrings_series( x.energy_source_code, pc.energy_source_eia_simple_map), fuel_group_code_simple=lambda x: pudl.helpers.cleanstrings_series( @@ -542,7 +590,10 @@ def transform(eia923_raw_dfs, eia923_tables=pc.eia923_pudl_tables): into PUDL. Returns: - dict: A dictionary of DataFrame with table names as keys and :class:`pandas.DataFrame` objects as values, where the contents of the DataFrames correspond to cleaned and normalized PUDL database tables, ready for loading. + dict: A dictionary of DataFrame with table names as keys and + :class:`pandas.DataFrame` objects as values, where the contents of the + DataFrames correspond to cleaned and normalized PUDL database tables, ready for + loading. """ eia923_transform_functions = { diff --git a/src/pudl/transform/epacems.py b/src/pudl/transform/epacems.py index 5d8ca6c757..c600c501d3 100644 --- a/src/pudl/transform/epacems.py +++ b/src/pudl/transform/epacems.py @@ -23,7 +23,7 @@ def fix_up_dates(df, plant_utc_offset): Transformations include: - - Account for timezone differences with offset from UTC. + * Account for timezone differences with offset from UTC. Args: df (pandas.DataFrame): A CEMS hourly dataframe for one year-month-state @@ -70,7 +70,8 @@ def fix_up_dates(df, plant_utc_offset): def _load_plant_utc_offset(datapkg_dir): """Load the UTC offset each EIA plant. 
- CEMS times don't change for DST, so we get get the UTC offset by using the offset for the plants' timezones in January.
+ CEMS times don't change for DST, so we get the UTC offset by using the offset
+ for the plants' timezones in January.

 Args:
 datapkg_dir (path-like) : Path to the directory of the datapackage which is
@@ -105,15 +106,18 @@ def harmonize_eia_epa_orispl(df):
 Harmonize the ORISPL code to match the EIA data -- NOT YET IMPLEMENTED.

 The EIA plant IDs and CEMS ORISPL codes almost match, but not quite. See
- https://www.epa.gov/sites/production/files/2018-02/documents/egrid2016_technicalsupportdocument_0.pdf#page=104 for an example.
+ https://www.epa.gov/sites/production/files/2018-02/documents/egrid2016_technicalsup
+ portdocument_0.pdf#page=104 for an example.

- Note that this transformation needs to be run *before* fix_up_dates, because fix_up_dates uses the plant ID to look up timezones.
+ Note that this transformation needs to be run *before* fix_up_dates, because
+ fix_up_dates uses the plant ID to look up timezones.

 Args:
 df (pandas.DataFrame): A CEMS hourly dataframe for one year-month-state.

 Returns:
- pandas.DataFrame: The same data, with the ORISPL plant codes corrected to match the EIA plant IDs.
+ pandas.DataFrame: The same data, with the ORISPL plant codes corrected to match
+ the EIA plant IDs.

 Todo: Actually implement the function...
@@ -126,13 +130,15 @@ def add_facility_id_unit_id_epa(df):
 """
 Harmonize columns that are added later.

- The datapackage validation checks for consistent column names, and these two columns aren't present before August 2008, so this adds them in.
+ The datapackage validation checks for consistent column names, and these two columns
+ aren't present before August 2008, so this adds them in.

 Args:
 df (pandas.DataFrame): A CEMS dataframe

 Returns:
- pandas.Dataframe: The same DataFrame guaranteed to have int facility_id and unit_id_epa cols. 
+ pandas.Dataframe: The same DataFrame guaranteed to have int facility_id and + unit_id_epa cols. """ if ("facility_id" not in df.columns) or ("unit_id_epa" not in df.columns): @@ -150,7 +156,8 @@ def _all_na_or_values(series, values): """ Test whether every element in the series is either missing or in values. - This is fiddly because isin() changes behavior if the series is totally NaN (because of type issues) + This is fiddly because isin() changes behavior if the series is totally NaN (because + of type issues). Example: x = pd.DataFrame({'a': ['x', np.NaN], 'b': [np.NaN, np.NaN]}) x.isin({'x', np.NaN}) diff --git a/src/pudl/transform/ferc1.py b/src/pudl/transform/ferc1.py index f880c3ecc4..c09f9b68ae 100644 --- a/src/pudl/transform/ferc1.py +++ b/src/pudl/transform/ferc1.py @@ -1,7 +1,11 @@ """ Routines for transforming FERC Form 1 data before loading into the PUDL DB. -This module provides a variety of functions that are used in cleaning up the FERC Form 1 data prior to loading into our database. This includes adopting standardized units and column names, standardizing the formatting of some string values, and correcting data entry errors which we can infer based on the existing data. It may also include removing bad data, or replacing it with the appropriate NA values. +This module provides a variety of functions that are used in cleaning up the FERC Form 1 +data prior to loading into our database. This includes adopting standardized units and +column names, standardizing the formatting of some string values, and correcting data +entry errors which we can infer based on the existing data. It may also include removing +bad data, or replacing it with the appropriate NA values. """ import importlib.resources @@ -415,7 +419,12 @@ """dict: A dictionary of construction types (keys) and lists of construction type strings associated with each type (values) from FERC Form 1. 
- There are many strings that weren't categorized, including crosses between conventional and outdoor, PV, wind, combined cycle, and internal combustion. The lists are broken out into the two types specified in Form 1: conventional and outdoor. These lists are inclusive so that variants of conventional (e.g. "conventional full") and outdoor (e.g. "outdoor full" and "outdoor hrsg") are included. + There are many strings that weren't categorized, including crosses between + conventional and outdoor, PV, wind, combined cycle, and internal combustion. The + lists are broken out into the two types specified in Form 1: conventional and + outdoor. These lists are inclusive so that variants of conventional (e.g. + "conventional full") and outdoor (e.g. "outdoor full" and "outdoor hrsg") are + included. """ ############################################################################## @@ -427,13 +436,16 @@ def unpack_table(ferc1_df, table_name, data_cols, data_rows): """ Normalize a row-and-column based FERC Form 1 table. - Pulls the named database table from the FERC Form 1 DB and uses the corresponding ferc1_row_map to unpack the row_number coded data. + Pulls the named database table from the FERC Form 1 DB and uses the corresponding + ferc1_row_map to unpack the row_number coded data. Args: ferc1_df (pandas.DataFrame): Raw FERC Form 1 DataFrame from the DB. table_name (str): Original name of the FERC Form 1 DB table. data_cols (list): List of strings corresponding to the original FERC Form 1 - database table column labels -- these are the columns of data that we are extracting (it can be a subset of the columns which are present in the original database). + database table column labels -- these are the columns of data that we are + extracting (it can be a subset of the columns which are present in the + original database). data_rows (list): List of row_names to extract, as defined in the FERC 1 row maps. Set to slice(None) if you want all rows. 
@@ -495,7 +507,11 @@ def cols_to_cats(df, cat_name, col_cats): """ Turn top-level MultiIndex columns into a categorial column. - In some cases FERC Form 1 data comes with many different types of related values interleaved in the same table -- e.g. current year and previous year income -- this can result in DataFrames that are hundreds of columns wide, which is unwieldy. This function takes those top level MultiIndex labels and turns them into categories in a single column, which can be used to select a particular type of report. + In some cases FERC Form 1 data comes with many different types of related values + interleaved in the same table -- e.g. current year and previous year income -- this + can result in DataFrames that are hundreds of columns wide, which is unwieldy. This + function takes those top level MultiIndex labels and turns them into categories in a + single column, which can be used to select a particular type of report. Args: df (pandas.DataFrame): the dataframe to be simplified. @@ -505,7 +521,9 @@ def cols_to_cats(df, cat_name, col_cats): and the category to which they should be mapped as values. Returns: - pandas.DataFrame: A re-shaped/re-labeled dataframe with one fewer levels of MultiIndex in the columns, and an additional column containing the assigned labels. + pandas.DataFrame: A re-shaped/re-labeled dataframe with one fewer levels of + MultiIndex in the columns, and an additional column containing the assigned + labels. """ out_df = pd.DataFrame() @@ -520,30 +538,48 @@ def cols_to_cats(df, cat_name, col_cats): def _clean_cols(df, table_name): """Adds a FERC record ID and drop FERC columns not to be loaded into PUDL. - It is often useful to be able to tell exactly which record in the FERC Form 1 database a given record within the PUDL database came from. Within each FERC Form 1 table, each record is supposed to be uniquely identified by the combination of: report_year, report_prd, respondent_id, spplmnt_num, row_number. 
+ It is often useful to be able to tell exactly which record in the FERC Form 1 + database a given record within the PUDL database came from. Within each FERC Form 1 + table, each record is supposed to be uniquely identified by the combination of: + report_year, report_prd, respondent_id, spplmnt_num, row_number. - So this function takes a dataframe, checks to make sure it contains each of those columns and that none of them are NULL, and adds a new column to the dataframe containing a string of the format: + So this function takes a dataframe, checks to make sure it contains each of those + columns and that none of them are NULL, and adds a new column to the dataframe + containing a string of the format: {table_name}_{report_year}_{report_prd}_{respondent_id}_{spplmnt_num}_{row_number} - In some PUDL FERC Form 1 tables (e.g. plant_in_service_ferc1) a single row is re-organized into several new records in order to normalize the data and ensure it is stored in a "tidy" format. In such cases each of the resulting PUDL records will have the same ``record_id``. Otherwise, the ``record_id`` is expected to be unique within each FERC Form 1 table. However there are a handful of cases in which this uniqueness constraint is violated due to data reporting issues in FERC Form 1. + In some PUDL FERC Form 1 tables (e.g. plant_in_service_ferc1) a single row is + re-organized into several new records in order to normalize the data and ensure it + is stored in a "tidy" format. In such cases each of the resulting PUDL records will + have the same ``record_id``. Otherwise, the ``record_id`` is expected to be unique + within each FERC Form 1 table. However there are a handful of cases in which this + uniqueness constraint is violated due to data reporting issues in FERC Form 1. 
- In addition to those primary key columns, there are some columns which are not meaningful or useful in the context of PUDL, but which show up in virtually every FERC table, and this function drops them if they are present. These columns include: row_prvlg, row_seq, item, record_number (a temporary column used in plants_small) and all the footnote columns, which end in "_f". + In addition to those primary key columns, there are some columns which are not + meaningful or useful in the context of PUDL, but which show up in virtually every + FERC table, and this function drops them if they are present. These columns include: + row_prvlg, row_seq, item, record_number (a temporary column used in plants_small) + and all the footnote columns, which end in "_f". Args: df (pandas.DataFrame): The DataFrame in which the function looks for columns - for the unique identification of FERC records, and ensures that those columns are not NULL. + for the unique identification of FERC records, and ensures that those + columns are not NULL. table_name (str): The name of the table that we are cleaning. 
Returns:
- pandas.DataFrame: The same DataFrame with a column appended containing a string of the format {table_name}_{report_year}_{report_prd}_{respondent_id}_{spplmnt_num}_{row_number}
+ pandas.DataFrame: The same DataFrame with a column appended containing a string
+ of the format
+ {table_name}_{report_year}_{report_prd}_{respondent_id}_{spplmnt_num}_{row_number}

 Raises:
 AssertionError: If the table input contains NULL columns

 """
 # Make sure that *all* of these columns exist in the proffered table:
- for field in ['report_year', 'report_prd', 'respondent_id', 'spplmnt_num', 'row_number']:
+ for field in ['report_year', 'report_prd', 'respondent_id',
+ 'spplmnt_num', 'row_number']:
 if field in df.columns:
 if df[field].isnull().any():
 raise AssertionError(
@@ -591,23 +627,38 @@ def _clean_cols(df, table_name):
 def _multiplicative_error_correction(tofix, mask, minval, maxval, mults):
 """Corrects data entry errors where data being multiplied by a factor.

- In many cases we know that a particular column in the database should have a value in a particular rage (e.g. the heat content of a ton of coal is a well defined physical quantity -- it can be 15 mmBTU/ton or 22 mmBTU/ton, but it can't be 1 mmBTU/ton or 100 mmBTU/ton). Sometimes these fields are reported in the wrong units (e.g. kWh of electricity generated rather than MWh) resulting in several distributions that have a similar shape showing up at different ranges of value within the data. This function takes a one dimensional data series, a description of a valid range for the values, and a list of factors by which we expect to see some of the data multiplied due to unit errors. Data found in these "ghost"
- distributions are multiplied by the appropriate factor to bring them into the expected range.
-
- Data values which are not found in one of the acceptable multiplicative ranges are set to NA.
+ In many cases we know that a particular column in the database should have a value
+ in a particular range (e.g. 
the heat content of a ton of coal is a well defined + physical quantity -- it can be 15 mmBTU/ton or 22 mmBTU/ton, but it can't be 1 + mmBTU/ton or 100 mmBTU/ton). Sometimes these fields are reported in the wrong units + (e.g. kWh of electricity generated rather than MWh) resulting in several + distributions that have a similar shape showing up at different ranges of value + within the data. This function takes a one dimensional data series, a description + of a valid range for the values, and a list of factors by which we expect to see + some of the data multiplied due to unit errors. Data found in these "ghost" + distributions are multiplied by the appropriate factor to bring them into the + expected range. + + Data values which are not found in one of the acceptable multiplicative ranges are + set to NA. Args: tofix (pandas.Series): A 1-dimensional data series containing the values to be fixed. mask (pandas.Series): A 1-dimensional masking array of True/False values, which - will be used to select a subset of the tofix series onto which we will apply the multiplicative fixes. + will be used to select a subset of the tofix series onto which we will apply + the multiplicative fixes. min (float): the minimum realistic value for the data series. max (float): the maximum realistic value for the data series. mults (list of floats): values by which "real" data may have been multiplied - due to common data entry errors. These values both show us where to look in the full data series to find recoverable data, and also tell us by what factor those values need to be multiplied to bring them back into the reasonable range. + due to common data entry errors. These values both show us where to look in + the full data series to find recoverable data, and also tell us by what + factor those values need to be multiplied to bring them back into the + reasonable range. Returns: - fixed (pandas.Series): a data series of the same length as the input, but with the transformed values. 
+ fixed (pandas.Series): a data series of the same length as the input, but with + the transformed values. """ # Grab the subset of the input series we are going to work on: records_to_fix = tofix[mask] @@ -646,7 +697,8 @@ def plants_steam(ferc1_raw_dfs, ferc1_transformed_dfs): ferc1_transformed_dfs (dict): A dictionary of DataFrames to be transformed. Returns: - dict: of transformed dataframes, including the newly transformed plants_steam_ferc1 dataframe. + dict: of transformed dataframes, including the newly transformed + plants_steam_ferc1 dataframe. """ ferc1_steam_df = ( @@ -898,7 +950,11 @@ def plants_steam_validate_ids(ferc1_steam_df): def fuel(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 fuel data for loading into PUDL Database. - This process includes converting some columns to be in terms of our preferred units, like MWh and mmbtu instead of kWh and btu. Plant names are also standardized (stripped & lower). Fuel and fuel unit strings are also standardized using our cleanstrings() function and string cleaning dictionaries found above (FUEL_STRINGS, etc.) + This process includes converting some columns to be in terms of our preferred units, + like MWh and mmbtu instead of kWh and btu. Plant names are also standardized + (stripped & lower). Fuel and fuel unit strings are also standardized using our + cleanstrings() function and string cleaning dictionaries found above (FUEL_STRINGS, + etc.) Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1010,9 +1066,17 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs): def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 plant_small data for loading into PUDL Database. - This FERC Form 1 table contains information about a large number of small plants, including many small hydroelectric and other renewable generation facilities. 
Unfortunately the data is not well standardized, and so the plants have been categorized manually, with the results of that categorization stored in an Excel spreadsheet. This function reads in the plant type data from the spreadsheet and merges it with the rest of the information from the FERC DB based on record number, FERC respondent ID, and report year. When possible the FERC license number for small hydro plants is also manually extracted from the data. + This FERC Form 1 table contains information about a large number of small plants, + including many small hydroelectric and other renewable generation facilities. + Unfortunately the data is not well standardized, and so the plants have been + categorized manually, with the results of that categorization stored in an Excel + spreadsheet. This function reads in the plant type data from the spreadsheet and + merges it with the rest of the information from the FERC DB based on record number, + FERC respondent ID, and report year. When possible the FERC license number for small + hydro plants is also manually extracted from the data. - This categorization will need to be renewed with each additional year of FERC data we pull in. As of v0.1 the small plants have been categorized for 2004-2015. + This categorization will need to be renewed with each additional year of FERC data + we pull in. As of v0.1 the small plants have been categorized for 2004-2015. Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1119,7 +1183,8 @@ def plants_small(ferc1_raw_dfs, ferc1_transformed_dfs): def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 plant_hydro data for loading into PUDL Database. - Standardizes plant names (stripping whitespace and Using Title Case). Also converts into our preferred units of MW and MWh. + Standardizes plant names (stripping whitespace and Using Title Case). Also converts + into our preferred units of MW and MWh. 
Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1200,7 +1265,8 @@ def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs): def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 pumped storage data for loading into PUDL. - Standardizes plant names (stripping whitespace and Using Title Case). Also converts into our preferred units of MW and MWh. + Standardizes plant names (stripping whitespace and Using Title Case). Also converts + into our preferred units of MW and MWh. Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1286,7 +1352,15 @@ def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs): def plant_in_service(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 Plant in Service data for loading into PUDL. - Re-organizes the original FERC Form 1 Plant in Service data by unpacking the rows as needed on a year by year basis, to organize them into columns. The "columns" in the original FERC Form 1 denote starting balancing, ending balance, additions, retirements, adjustments, and transfers -- these categories are turned into labels in a column called "amount_type". Because each row in the transformed table is composed of many individual records (rows) from the original table, row_number can't be part of the record_id, which means they are no longer unique. To infer exactly what record a given piece of data came from, the record_id and the row_map (found in the PUDL package_data directory) can be used. + Re-organizes the original FERC Form 1 Plant in Service data by unpacking the rows as + needed on a year by year basis, to organize them into columns. The "columns" in the + original FERC Form 1 denote starting balancing, ending balance, additions, + retirements, adjustments, and transfers -- these categories are turned into labels + in a column called "amount_type". 
Because each row in the transformed table is + composed of many individual records (rows) from the original table, row_number can't + be part of the record_id, which means they are no longer unique. To infer exactly + what record a given piece of data came from, the record_id and the row_map (found in + the PUDL package_data directory) can be used. Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1344,7 +1418,12 @@ def plant_in_service(ferc1_raw_dfs, ferc1_transformed_dfs): def purchased_power(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 pumped storage data for loading into PUDL. - This table has data about inter-utility power purchases into the PUDL DB. This includes how much electricty was purchased, how much it cost, and who it was purchased from. Unfortunately the field describing which other utility the power was being bought from is poorly standardized, making it difficult to correlate with other data. It will need to be categorized by hand or with some fuzzy matching eventually. + This table has data about inter-utility power purchases into the PUDL DB. This + includes how much electricty was purchased, how much it cost, and who it was + purchased from. Unfortunately the field describing which other utility the power was + being bought from is poorly standardized, making it difficult to correlate with + other data. It will need to be categorized by hand or with some fuzzy matching + eventually. Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1415,7 +1494,9 @@ def purchased_power(ferc1_raw_dfs, ferc1_transformed_dfs): def accumulated_depreciation(ferc1_raw_dfs, ferc1_transformed_dfs): """Transforms FERC Form 1 depreciation data for loading into PUDL. - This information is organized by FERC account, with each line of the FERC Form 1 having a different descriptive identifier like 'balance_end_of_year' or 'transmission'. 
+ This information is organized by FERC account, with each line of the FERC Form 1 + having a different descriptive identifier like 'balance_end_of_year' or + 'transmission'. Args: ferc1_raw_dfs (dict): Each entry in this dictionary of DataFrame objects @@ -1505,15 +1586,30 @@ def transform(ferc1_raw_dfs, ferc1_tables=pc.pudl_tables['ferc1']): class FERCPlantClassifier(BaseEstimator, ClassifierMixin): """A classifier for identifying FERC plant time series in FERC Form 1 data. - We want to be able to give the classifier a FERC plant record, and get back the group of records(or the ID of the group of records) that it ought to be part of. + We want to be able to give the classifier a FERC plant record, and get back the + group of records(or the ID of the group of records) that it ought to be part of. - There are hundreds of different groups of records, and we can only know what they are by looking at the whole dataset ahead of time. This is the "fitting" step, in which the groups of records resulting from a particular set of model parameters(e.g. the weights that are attributes of the class) are generated. + There are hundreds of different groups of records, and we can only know what they + are by looking at the whole dataset ahead of time. This is the "fitting" step, in + which the groups of records resulting from a particular set of model parameters(e.g. + the weights that are attributes of the class) are generated. - Once we have that set of record categories, we can test how well the classifier performs, by checking it against test / training data which we have already classified by hand. The test / training set is a list of lists of unique FERC plant record IDs(each record ID is the concatenation of: report year, respondent id, supplement number, and row number). It could also be stored as a dataframe where each column is associated with a year of data(some of which could be empty). Not sure what the best structure would be. 
+ Once we have that set of record categories, we can test how well the classifier + performs, by checking it against test / training data which we have already + classified by hand. The test / training set is a list of lists of unique FERC plant + record IDs(each record ID is the concatenation of: report year, respondent id, + supplement number, and row number). It could also be stored as a dataframe where + each column is associated with a year of data(some of which could be empty). Not + sure what the best structure would be. - If it's useful, we can assign each group a unique ID that is the time ordered concatenation of each of the constituent record IDs. Need to understand what the process for checking the classification of an input record looks like. + If it's useful, we can assign each group a unique ID that is the time ordered + concatenation of each of the constituent record IDs. Need to understand what the + process for checking the classification of an input record looks like. - To score a given classifier, we can look at what proportion of the records in the test dataset are assigned to the same group as in our manual classification of those records. There are much more complicated ways to do the scoring too... but for now let's just keep it as simple as possible. + To score a given classifier, we can look at what proportion of the records in the + test dataset are assigned to the same group as in our manual classification of those + records. There are much more complicated ways to do the scoring too... but for now + let's just keep it as simple as possible. """ @@ -1523,9 +1619,13 @@ def __init__(self, min_sim=0.75, plants_df=None): Args: min_sim : Number between 0.0 and 1.0, indicating the minimum value of - cosine similarity that we are willing to accept as indicating two records are part of the same plant record time series. All entries in the pairwise similarity matrix below this value will be zeroed out. 
+ cosine similarity that we are willing to accept as indicating two + records are part of the same plant record time series. All entries in + the pairwise similarity matrix below this value will be zeroed out. plants_df : The entire FERC Form 1 plant table as a dataframe. Needed in - order to calculate the distance metrics between all of the records so we can group the plants in the fit() step, so we can check how well they are categorized later... + order to calculate the distance metrics between all of the records so we + can group the plants in the fit() step, so we can check how well they + are categorized later... Todo: Zane revisit plants_df @@ -1729,7 +1829,10 @@ def make_ferc1_clf(plants_df, """ Create a FERC Plant Classifier using several weighted features. - Given a FERC steam plants dataframe plants_df, which also includes fuel consumption information, transform a selection of useful columns into features suitable for use in calculating inter-record cosine similarities. Individual features are weighted according to the keyword arguments. + Given a FERC steam plants dataframe plants_df, which also includes fuel consumption + information, transform a selection of useful columns into features suitable for use + in calculating inter-record cosine similarities. Individual features are weighted + according to the keyword arguments. Features include: @@ -1739,11 +1842,13 @@ def make_ferc1_clf(plants_df, * capacity_mw (MinMax scaled numerical feature) * construction year (OneHot encoded categorical feature) * utility_id_ferc1 (OneHot encoded categorical feature) - * fuel_fraction_mmbtu (several MinMax scaled numerical columns, which are normalized and treated as a single feature.) + * fuel_fraction_mmbtu (several MinMax scaled numerical columns, which are + normalized and treated as a single feature.) This feature matrix is then used to instantiate a FERCPlantClassifier. 
- The combination of the ColumnTransformer and FERCPlantClassifier are combined in a sklearn Pipeline, which is returned by the function. + The combination of the ColumnTransformer and FERCPlantClassifier are combined in a + sklearn Pipeline, which is returned by the function. Arguments: ngram_min (int): the minimum lengths to consider in the vectorization of the @@ -1753,23 +1858,37 @@ def make_ferc1_clf(plants_df, min_sim (float): the minimum cosine similarity between two records that can be considered a "match" (a number between 0.0 and 1.0). plant_name_ferc1_wt (float): weight used to determine the relative importance - of each of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each - individual feature before the vectors are normalized. + of each of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. plant_type_wt (float): weight used to determine the relative importance of each - of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. construction_type_wt (float): weight used to determine the relative importance - of each of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + of each of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. 
capacity_mw_wt (float):weight used to determine the relative importance of each - of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. construction_year_wt (float): weight used to determine the relative importance - of each of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + of each of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. utility_id_ferc1_wt (float): weight used to determine the relative importance - of each of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + of each of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. fuel_fraction_wt (float): weight used to determine the relative importance of - each of the features in the feature matrix used to calculate the cosine similarity between records. Used to scale each individual feature before the vectors are normalized. + each of the features in the feature matrix used to calculate the cosine + similarity between records. Used to scale each individual feature before the + vectors are normalized. Returns: - sklearn.pipeline.Pipeline: an sklearn Pipeline that performs reprocessing and classification with a FERCPlantClassifier object. 
+ sklearn.pipeline.Pipeline: an sklearn Pipeline that performs reprocessing and + classification with a FERCPlantClassifier object. """ # Make a list of all the fuel fraction columns for use as one feature. @@ -1816,18 +1935,37 @@ def make_ferc1_clf(plants_df, def fuel_by_plant_ferc1(fuel_df, thresh=0.5): """Calculates useful FERC Form 1 fuel metrics on a per plant-year basis. - Each record in the FERC Form 1 corresponds to a particular type of fuel. Many plants -- especially coal plants -- use more than one fuel, with gas and/or diesel serving as startup fuels. In order to be able to classify the type of plant based on relative proportions of fuel consumed or fuel costs it is useful to aggregate these per-fuel records into a single record for each plant. - - Fuel cost (in nominal dollars) and fuel heat content (in mmBTU) are calculated for each fuel based on the cost and heat content per unit, and the number of units consumed, and then summed by fuel type (there can be more than one record for a given type of fuel in each plant because we are simplifying the fuel categories). The per-fuel records are then pivoted to create one column per fuel type. The total is summed and stored separately, and the individual fuel costs & heat contents are divided by that total, to yield fuel proportions. Based on those proportions and a minimum threshold that's passed in, a "primary" fuel type is then assigned to the plant-year record and given a string label. + Each record in the FERC Form 1 corresponds to a particular type of fuel. Many plants + -- especially coal plants -- use more than one fuel, with gas and/or diesel serving + as startup fuels. In order to be able to classify the type of plant based on + relative proportions of fuel consumed or fuel costs it is useful to aggregate these + per-fuel records into a single record for each plant. 
+ + Fuel cost (in nominal dollars) and fuel heat content (in mmBTU) are calculated for + each fuel based on the cost and heat content per unit, and the number of units + consumed, and then summed by fuel type (there can be more than one record for a + given type of fuel in each plant because we are simplifying the fuel categories). + The per-fuel records are then pivoted to create one column per fuel type. The total + is summed and stored separately, and the individual fuel costs & heat contents are + divided by that total, to yield fuel proportions. Based on those proportions and a + minimum threshold that's passed in, a "primary" fuel type is then assigned to the + plant-year record and given a string label. Args: fuel_df (pandas.DataFrame): Pandas DataFrame resembling the post-transform result for the fuel_ferc1 table. thresh (float): A value between 0.5 and 1.0 indicating the minimum fraction of - overall heat content that must have been provided by a fuel in a plant-year for it to be considered the "primary" fuel for the plant in that year. Default value: 0.5. + overall heat content that must have been provided by a fuel in a plant-year + for it to be considered the "primary" fuel for the plant in that year. + Default value: 0.5. Returns: - pandas.DataFrame: A DataFrame with a single record for each plant-year, including the columns required to merge it with the plants_steam_ferc1 table/DataFrame (report_year, utility_id_ferc1, and plant_name) as well as totals for fuel mmbtu consumed in that plant-year, and the cost of fuel in that year, the proportions of heat content and fuel costs for each fuel in that year, and a column that labels the plant's primary fuel for that year. 
+ pandas.DataFrame: A DataFrame with a single record for each plant-year, + including the columns required to merge it with the plants_steam_ferc1 + table/DataFrame (report_year, utility_id_ferc1, and plant_name) as well as + totals for fuel mmbtu consumed in that plant-year, and the cost of fuel in that + year, the proportions of heat content and fuel costs for each fuel in that year, + and a column that labels the plant's primary fuel for that year. Raises: AssertionError: If the DataFrame input does not have the columns required to diff --git a/src/pudl/transform/ferc714.py b/src/pudl/transform/ferc714.py index f53194b740..a942d40f83 100644 --- a/src/pudl/transform/ferc714.py +++ b/src/pudl/transform/ferc714.py @@ -192,7 +192,12 @@ """ A mapping of timezone offset codes to Timedelta offsets from UTC. -from one year to the next, and these result in duplicate records, which are Note that the FERC 714 instructions state that all hourly demand is to be reported in STANDARD time for whatever timezone is being used. Even though many respondents use daylight savings / standard time abbreviations, a large majority do appear to conform to using a single UTC offset throughout the year. There are 6 instances in which the timezone associated with reporting changed dropped. +from one year to the next, and these result in duplicate records, which are Note that +the FERC 714 instructions state that all hourly demand is to be reported in STANDARD +time for whatever timezone is being used. Even though many respondents use daylight +savings / standard time abbreviations, a large majority do appear to conform to using a +single UTC offset throughout the year. There are 6 instances in which the timezone +associated with reporting changed dropped. """ @@ -301,7 +306,9 @@ def _standardize_offset_codes(df, offset_fixes): """ Convert to standardized UTC offset abbreviations. 
- This function ensures that all of the 3-4 letter abbreviations used to indicate a timestamp's localized offset from UTC are standardized, so that they can be used to make the timestamps timezone aware. The standard abbreviations we're using are: + This function ensures that all of the 3-4 letter abbreviations used to indicate a + timestamp's localized offset from UTC are standardized, so that they can be used to + make the timestamps timezone aware. The standard abbreviations we're using are: "HST": Hawaii Standard Time "AKST": Alaska Standard Time @@ -315,7 +322,9 @@ def _standardize_offset_codes(df, offset_fixes): "EST": Eastern Standard Time "EDT": Eastern Daylight Time - In some cases different respondents use the same non-standard abbreviations to indicate different offsets, and so the fixes are applied on a per-respondent basis, as defined by offset_fixes. + In some cases different respondents use the same non-standard abbreviations to + indicate different offsets, and so the fixes are applied on a per-respondent basis, + as defined by offset_fixes. Args: df (pandas.DataFrame): A DataFrame containing a utc_offset_code column @@ -346,14 +355,18 @@ def respondent_id(tfr_dfs): """ Transform the FERC 714 respondent IDs, names, and EIA utility IDs. - This consists primarily of dropping test respondents and manually assigning EIA utility IDs to a few FERC Form 714 respondents that report planning area demand, but which don't have their corresponding EIA utility IDs provided by FERC for some reason (including PacifiCorp). + This consists primarily of dropping test respondents and manually assigning EIA + utility IDs to a few FERC Form 714 respondents that report planning area demand, but + which don't have their corresponding EIA utility IDs provided by FERC for some + reason (including PacifiCorp). Args: tfr_dfs (dict): A dictionary of (partially) transformed dataframes, to be cleaned up. 
Returns: - dict: The input dictionary of dataframes, but with a finished respondent_id_ferc714 dataframe. + dict: The input dictionary of dataframes, but with a finished + respondent_id_ferc714 dataframe. """ df = ( @@ -578,12 +591,14 @@ def transform(raw_dfs, tables=pc.pudl_tables["ferc714"]): Args: raw_dfs (dict): A dictionary of raw pandas.DataFrame objects, as read out of - the original FERC 714 CSV files. Generated by the `pudl.extract.ferc714.extract()` function. + the original FERC 714 CSV files. Generated by the + `pudl.extract.ferc714.extract()` function. tables (iterable): The set of PUDL tables within FERC 714 that we should process. Typically set to all of them, unless Returns: - dict: A dictionary of pandas.DataFrame objects that are ready to be output in a data package / database table. + dict: A dictionary of pandas.DataFrame objects that are ready to be output in a + data package / database table. """ tfr_funcs = {