Minor changes to make pandas 1.4.0 work #1421

Merged (5 commits, Jan 26, 2022)
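Most hunks below make the same substitution: the `DataFrame.append` method, which pandas 1.4.0 deprecates, is replaced with the module-level `pd.concat`, which takes a list of frames and returns the same result. A minimal sketch of the equivalence, with made-up frames:

```python
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2]})
df2 = pd.DataFrame({"a": [3, 4]})

# Old style, deprecated in pandas 1.4:
# combined = df1.append(df2, ignore_index=True)

# New style, equivalent result:
combined = pd.concat([df1, df2], ignore_index=True)
print(combined["a"].tolist())  # [1, 2, 3, 4]
```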
2 changes: 1 addition & 1 deletion  setup.py
@@ -45,7 +45,7 @@
         "matplotlib~=3.0",  # Should make this optional with a "viz" extras
         "networkx>=2.2,<3",
         "numpy>=1.18.5,<2",
-        "pandas>=1.3,!=1.3.3,<1.4",  # IntCastingNaNError on v1.3.3 in unit tests
+        "pandas>=1.3,!=1.3.3,<1.5",  # IntCastingNaNError on v1.3.3 in unit tests
         "prefect[viz, gcp]~=0.15.0",
         "pyarrow>=5,<7",
         "pydantic[email]~=1.7",
2 changes: 1 addition & 1 deletion  src/pudl/analysis/mcoe.py
@@ -308,7 +308,7 @@ def fuel_cost(pudl_out):
         'fuel_cost_from_eiaapi', ]]

     fc = (
-        one_fuel.append(multi_fuel, sort=True)
+        pd.concat([one_fuel, multi_fuel], sort=True)
         .assign(
             fuel_cost_per_mwh=lambda x: x.fuel_cost_per_mmbtu * x.heat_rate_mmbtu_mwh
         )
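The `assign` in this chain is a unit conversion: dollars per MMBtu of fuel multiplied by a heat rate in MMBtu per MWh yields dollars per MWh. A quick check of the arithmetic with made-up numbers:

```python
fuel_cost_per_mmbtu = 2.50    # $/MMBtu of fuel burned (illustrative value)
heat_rate_mmbtu_mwh = 10.0    # MMBtu burned per MWh generated (illustrative value)
fuel_cost_per_mwh = fuel_cost_per_mmbtu * heat_rate_mmbtu_mwh
print(fuel_cost_per_mwh)      # 25.0 $/MWh
```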
7 changes: 5 additions & 2 deletions  src/pudl/extract/eia860m.py
@@ -81,6 +81,9 @@ def append_eia860m(eia860_raw_dfs, eia860m_raw_dfs):
     pages_eia860m = meta_eia860m.get_all_pages()
     # page names in 860m and 860 are the same.
     for page in pages_eia860m:
-        eia860_raw_dfs[page] = eia860_raw_dfs[page].append(
-            eia860m_raw_dfs[page], ignore_index=True, sort=True)
+        eia860_raw_dfs[page] = pd.concat(
+            [eia860_raw_dfs[page], eia860m_raw_dfs[page]],
+            ignore_index=True,
+            sort=True,
+        )
     return eia860_raw_dfs
5 changes: 0 additions & 5 deletions  src/pudl/extract/epacems.py
@@ -120,11 +120,6 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
         """
         Convert a CEMS csv file into a :class:`pandas.DataFrame`.

-        Note that some columns are not read. See
-        :mod:`pudl.constants.epacems_columns_to_ignore`. Data types for the columns
-        are specified in :mod:`pudl.constants.epacems_csv_dtypes` and names of the
-        output columns are set by :mod:`pudl.constants.epacems_rename_dict`.
-
         Args:
             csv (file-like object): data to be read
5 changes: 3 additions & 2 deletions  src/pudl/extract/excel.py
@@ -208,7 +208,7 @@ def extract(self, **partitions):
             if page in self.BLACKLISTED_PAGES:
                 logger.debug(f'Skipping blacklisted page {page}.')
                 continue
-            df = pd.DataFrame()
+            dfs = [pd.DataFrame(), ]
             for partition in pudl.helpers.iterate_multivalue_dict(**partitions):
                 # we are going to skip
                 if self.excel_filename(page, **partition) == '-1':
@@ -229,7 +229,8 @@
                 newdata = pudl.helpers.simplify_columns(newdata)
                 newdata = self.process_raw(newdata, page, **partition)
                 newdata = self.process_renamed(newdata, page, **partition)
-                df = df.append(newdata, sort=True, ignore_index=True)
+                dfs.append(newdata)
+            df = pd.concat(dfs, sort=True, ignore_index=True)

             # After all years are loaded, add empty columns that could appear
             # in other years so that df matches the database schema
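Collecting frames in a list and concatenating once after the loop, as this hunk does, also sidesteps the repeated full-copy cost of appending (or concatenating) inside the loop. A sketch of the pattern with placeholder data:

```python
import pandas as pd

chunks = [pd.DataFrame()]  # seed with an empty frame, mirroring dfs = [pd.DataFrame(), ]
for year in (2019, 2020, 2021):  # stand-in for the real partition loop
    chunks.append(pd.DataFrame({"year": [year], "value": [year * 1.5]}))

# One concat after the loop replaces repeated df.append() inside it,
# avoiding a copy of all accumulated data on every iteration.
df = pd.concat(chunks, sort=True, ignore_index=True)
```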
5 changes: 3 additions & 2 deletions  src/pudl/glue/ferc1_eia.py
@@ -185,7 +185,7 @@ def get_db_plants_ferc1(
     )

     # Add all the plants from the current table to our bigger list:
-    all_plants = all_plants.append(
+    db_plants = (
         pd.read_sql(plant_select, ferc1_engine)
         .rename(columns={
             "respondent_id": "utility_id_ferc1",
@@ -206,6 +206,7 @@
             "plant_table"
         ]]
     )
+    all_plants = pd.concat([all_plants, db_plants])

     # We don't want dupes, and sorting makes the whole thing more readable:
     all_plants = (
@@ -556,7 +557,7 @@ def get_unmapped_utils_eia(
     for table in data_tables_eia923:
         query = f"SELECT DISTINCT plant_id_eia FROM {table}"  # nosec
         new_ids = pd.read_sql(query, pudl_engine)
-        plant_ids = plant_ids.append(new_ids["plant_id_eia"])
+        plant_ids = pd.concat([plant_ids, new_ids["plant_id_eia"]])
     plant_ids_in_eia923 = sorted(set(plant_ids))

     utils_with_plants = (
2 changes: 1 addition & 1 deletion  src/pudl/output/eia860.py
@@ -386,7 +386,7 @@ def fill_generator_technology_description(gens_df: pd.DataFrame) -> pd.DataFrame
         out_df
         .sort_values("report_date")
         .groupby(["plant_id_eia", "generator_id", "energy_source_code_1"])
-        .technology_description.backfill()
+        .technology_description.bfill()
     )

     # Fill in remaining missing technology_descriptions with unique correspondences
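`bfill` is the surviving spelling: pandas 1.4.0 deprecates the `backfill`/`pad` aliases on groupby objects in favor of `bfill`/`ffill`. A toy version of the grouped backfill above:

```python
import pandas as pd

df = pd.DataFrame({
    "plant": [1, 1, 2, 2],
    "tech": [None, "solar", None, "wind"],
})

# Within each plant, fill missing values from the next non-null row in the group.
df["tech"] = df.groupby("plant")["tech"].bfill()
print(df["tech"].tolist())  # ['solar', 'solar', 'wind', 'wind']
```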
6 changes: 3 additions & 3 deletions  src/pudl/output/ferc714.py
@@ -296,7 +296,7 @@ def balancing_authority_eia861(self) -> pd.DataFrame:
             if key not in dfi.index:
                 rows.append({**ref, 'report_date': key[1]})
         # Append to original table
-        df = df.append(pd.DataFrame(rows))
+        df = pd.concat([df, pd.DataFrame(rows)])
         # Remove balancing authorities treated as utilities
         mask = df['balancing_authority_id_eia'].isin([util['id'] for util in UTILITIES])
         return df[~mask]
@@ -329,7 +329,7 @@ def balancing_authority_assn_eia861(self) -> pd.DataFrame:
                 tables.append(ref.assign(report_date=key[1]))
                 replaced |= mask
         # Append to original table with matching rows removed
-        df = df[~replaced].append(pd.concat(tables))
+        df = pd.concat([df[~replaced], pd.concat(tables)])
         # Remove balancing authorities treated as utilities
         mask = np.zeros(df.shape[0], dtype=bool)
         tables = []
@@ -359,7 +359,7 @@ def balancing_authority_assn_eia861(self) -> pd.DataFrame:
             tables.append(table)
             if 'replace' in util and util['replace']:
                 mask |= is_child
-        return df[~mask].append(pd.concat(tables)).drop_duplicates()
+        return pd.concat([df[~mask], pd.concat(tables)]).drop_duplicates()

     @cached_property
     def service_territory_eia861(self) -> pd.DataFrame:
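The last two hunks here follow a replace-rows idiom: filter out the rows flagged for replacement, then concatenate the corrected versions back on and deduplicate. A sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
fixed = pd.DataFrame({"id": [2], "value": [99]})  # corrected replacement rows

# Drop the rows being replaced, then concatenate the corrected versions back on.
replaced = df["id"].isin(fixed["id"])
df = pd.concat([df[~replaced], fixed]).drop_duplicates()
```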
23 changes: 14 additions & 9 deletions  src/pudl/transform/eia.py
@@ -284,7 +284,7 @@ def _lat_long(dirty_df, clean_df, entity_id_df, entity_id,
     ll_df = ll_df[ll_df[f'{col}_consistent']].drop_duplicates(subset=entity_id)
     logger.debug(f"Clean {col} records: {len(ll_df)}")
     # add the newly cleaned records
-    ll_clean_df = ll_clean_df.append(ll_df,)
+    ll_clean_df = pd.concat([ll_clean_df, ll_df])
     # merge onto the plants df w/ all plant ids
     ll_clean_df = entity_id_df.merge(ll_clean_df, how='outer')
     return ll_clean_df
@@ -354,7 +354,7 @@ def _add_additional_epacems_plants(plants_entity):
     # non-matching columns. It also requires an index, so we set and reset the
     # index as necessary. Also, it only works in-place, so we can't chain.
     plants_entity.update(cems_df, overwrite=True)
-    return plants_entity.append(cems_unmatched).reset_index()
+    return pd.concat([plants_entity, cems_unmatched]).reset_index()


 def _compile_all_entity_records(entity, eia_transformed_dfs):
@@ -602,10 +602,15 @@ def harvesting(entity,  # noqa: C901
                 f'Harvesting of {col} is too inconsistent at {ratio:.3}.')
         # add to a small df to be used in order to print out the ratio of
         # consistent records
-        consistency = consistency.append({'column': col,
-                                          'consistent_ratio': ratio,
-                                          'wrongos': wrongos,
-                                          'total': total}, ignore_index=True)
+        consistency = pd.concat([
+            consistency,
+            pd.DataFrame({
+                'column': [col],
+                'consistent_ratio': [ratio],
+                'wrongos': [wrongos],
+                'total': [total],
+            })], ignore_index=True
+        )
     mcs = consistency['consistent_ratio'].mean()
     logger.info(
         f"Average consistency of static {entity} values is {mcs:.2%}")
@@ -804,7 +809,7 @@ def _boiler_generator_assn(
     )

     bga_compiled_2 = (
-        bga_assn.append(bga_unassn)
+        pd.concat([bga_assn, bga_unassn])
         .fillna({'missing_from_923': True})
     )

@@ -832,7 +837,7 @@ def _boiler_generator_assn(
     bga_non_units = bga_compiled_2[bga_compiled_2['unit_id_eia'].isnull()]

     # combine the unit compilation and the non units
-    bga_compiled_3 = bga_non_units.append(bga_unit_compilation)
+    bga_compiled_3 = pd.concat([bga_non_units, bga_unit_compilation])

     bga_compiled_3 = bga_compiled_3[['plant_id_eia',
                                      'report_date',
@@ -935,7 +940,7 @@ def _boiler_generator_assn(
         nx.set_edge_attributes(
             unit, name='unit_id_pudl', values=unit_id + 1)
         new_unit_df = nx.to_pandas_edgelist(unit)
-        bga_w_units = bga_w_units.append(new_unit_df)
+        bga_w_units = pd.concat([bga_w_units, new_unit_df])

     bga_w_units = bga_w_units.drop(['source', 'target'], axis=1)
86 changes: 42 additions & 44 deletions  src/pudl/transform/eia861.py
@@ -1010,7 +1010,7 @@ def _harvest_associations(dfs, cols):
     assn = pd.DataFrame()
     for df in dfs:
         if set(df.columns).issuperset(set(cols)):
-            assn = assn.append(df[cols])
+            assn = pd.concat([assn, df[cols]])
     assn = assn.dropna().drop_duplicates()
     if assn.empty:
         raise ValueError(
@@ -1555,55 +1555,53 @@ def distributed_generation(tfr_dfs):
     ###########################################################################

     # Separate datasets into years with only pct values (pre-2010) and years with only mw values (post-2010)
-    df_pre_2010_tech = raw_dg_tech[raw_dg_tech['report_date'] < '2010-01-01']
-    df_post_2010_tech = raw_dg_tech[raw_dg_tech['report_date'] >= '2010-01-01']
-    df_pre_2010_misc = raw_dg_misc[raw_dg_misc['report_date'] < '2010-01-01']
-    df_post_2010_misc = raw_dg_misc[raw_dg_misc['report_date'] >= '2010-01-01']
-
-    logger.info(
-        'Converting pct values into mw values for distributed generation misc table')
-    transformed_dg_misc = (
-        df_pre_2010_misc.assign(
-            distributed_generation_owned_capacity_mw=lambda x: _pct_to_mw(
-                x, 'distributed_generation_owned_capacity_pct'),
-            backup_capacity_mw=lambda x: _pct_to_mw(x, 'backup_capacity_pct'),
-        ).append(df_post_2010_misc)
-        .drop(['distributed_generation_owned_capacity_pct',
-               'backup_capacity_pct',
-               'total_capacity_mw'], axis=1)
-    )
-
-    logger.info(
-        'Converting pct values into mw values for distributed generation tech table')
-    transformed_dg_tech = (
-        df_pre_2010_tech.assign(
-            combustion_turbine_capacity_mw=lambda x: (
-                _pct_to_mw(x, 'combustion_turbine_capacity_pct')),
-            hydro_capacity_mw=lambda x: _pct_to_mw(x, 'hydro_capacity_pct'),
-            internal_combustion_capacity_mw=lambda x: (
-                _pct_to_mw(x, 'internal_combustion_capacity_pct')),
-            other_capacity_mw=lambda x: _pct_to_mw(x, 'other_capacity_pct'),
-            steam_capacity_mw=lambda x: _pct_to_mw(x, 'steam_capacity_pct'),
-            wind_capacity_mw=lambda x: _pct_to_mw(x, 'wind_capacity_pct'),
-        ).append(df_post_2010_tech)
-        .drop([
-            'combustion_turbine_capacity_pct',
-            'hydro_capacity_pct',
-            'internal_combustion_capacity_pct',
-            'other_capacity_pct',
-            'steam_capacity_pct',
-            'wind_capacity_pct',
-            'total_capacity_mw'], axis=1
-        )
-    )
+    dg_tech_early = raw_dg_tech[raw_dg_tech['report_date'] < '2010-01-01']
+    dg_tech_late = raw_dg_tech[raw_dg_tech['report_date'] >= '2010-01-01']
+    dg_misc_early = raw_dg_misc[raw_dg_misc['report_date'] < '2010-01-01']
+    dg_misc_late = raw_dg_misc[raw_dg_misc['report_date'] >= '2010-01-01']
+
+    logger.info('Converting pct to MW for distributed generation misc table')
+    dg_misc_early = dg_misc_early.assign(
+        distributed_generation_owned_capacity_mw=lambda x: _pct_to_mw(
+            x, 'distributed_generation_owned_capacity_pct'),
+        backup_capacity_mw=lambda x: _pct_to_mw(x, 'backup_capacity_pct'),
+    )
+    dg_misc = pd.concat([dg_misc_early, dg_misc_late])
+    dg_misc = dg_misc.drop([
+        'distributed_generation_owned_capacity_pct',
+        'backup_capacity_pct',
+        'total_capacity_mw'
+    ], axis="columns")
+
+    logger.info('Converting pct into MW for distributed generation tech table')
+    dg_tech_early = dg_tech_early.assign(
+        combustion_turbine_capacity_mw=lambda x: (
+            _pct_to_mw(x, 'combustion_turbine_capacity_pct')),
+        hydro_capacity_mw=lambda x: _pct_to_mw(x, 'hydro_capacity_pct'),
+        internal_combustion_capacity_mw=lambda x: (
+            _pct_to_mw(x, 'internal_combustion_capacity_pct')),
+        other_capacity_mw=lambda x: _pct_to_mw(x, 'other_capacity_pct'),
+        steam_capacity_mw=lambda x: _pct_to_mw(x, 'steam_capacity_pct'),
+        wind_capacity_mw=lambda x: _pct_to_mw(x, 'wind_capacity_pct'),
+    )
+    dg_tech = pd.concat([dg_tech_early, dg_tech_late])
+    dg_tech = dg_tech.drop([
+        'combustion_turbine_capacity_pct',
+        'hydro_capacity_pct',
+        'internal_combustion_capacity_pct',
+        'other_capacity_pct',
+        'steam_capacity_pct',
+        'wind_capacity_pct',
+        'total_capacity_mw'
+    ], axis="columns")

     ###########################################################################
     # Tidy Data
     ###########################################################################

     logger.info('Tidying Distributed Generation Tech Table')
     tidy_dg_tech, tech_idx_cols = _tidy_class_dfs(
-        df=transformed_dg_tech,
+        df=dg_tech,
         df_name='Distributed Generation Tech Component Capacity',
         idx_cols=idx_cols,
         class_list=TECH_CLASSES,
@@ -1624,7 +1622,7 @@

     tfr_dfs["distributed_generation_tech_eia861"] = tidy_dg_tech
     tfr_dfs["distributed_generation_fuel_eia861"] = tidy_dg_fuel
-    tfr_dfs["distributed_generation_misc_eia861"] = transformed_dg_misc
+    tfr_dfs["distributed_generation_misc_eia861"] = dg_misc

     return tfr_dfs
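`_pct_to_mw` is not shown in this diff; assuming it scales a percentage column by the `total_capacity_mw` column (which is dropped once the conversion is done), the split-convert-concat pattern above reduces to roughly this sketch. The helper body and the column values are illustrative guesses, not PUDL's actual implementation:

```python
import pandas as pd

def _pct_to_mw(df: pd.DataFrame, pct_col: str) -> pd.Series:
    # Hypothetical body: convert a percent-of-total column into MW.
    return df["total_capacity_mw"] * df[pct_col] / 100

raw = pd.DataFrame({
    "report_date": pd.to_datetime(["2009-01-01", "2012-01-01"]),
    "total_capacity_mw": [100.0, None],
    "wind_capacity_pct": [25.0, None],
    "wind_capacity_mw": [None, 40.0],
})

early = raw[raw["report_date"] < "2010-01-01"]
late = raw[raw["report_date"] >= "2010-01-01"]

# Early years report percentages, so convert them; later years already report MW.
early = early.assign(wind_capacity_mw=lambda x: _pct_to_mw(x, "wind_capacity_pct"))
out = pd.concat([early, late]).drop(
    ["wind_capacity_pct", "total_capacity_mw"], axis="columns")
```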
10 changes: 6 additions & 4 deletions  src/pudl/transform/eia923.py
@@ -329,7 +329,7 @@ def _aggregate_generation_fuel_duplicates(

     # Add the resolved records back to generation_fuel dataframe.
     gen_df = gen_fuel[~is_duplicate].copy()
-    gen_df = gen_df.append(resolved_dupes)
+    gen_df = pd.concat([gen_df, resolved_dupes])

     if gen_df[natural_key_fields].isnull().any().any():
         raise AssertionError(
@@ -782,8 +782,10 @@ def _aggregate_duplicate_boiler_fuel_keys(
         _map_prime_mover_sets)

     # NOTE: the following method changes the order of the data and resets the index
-    modified_boiler_fuel_df = boiler_fuel_df[~is_duplicate].append(
-        aggregates.reset_index(), ignore_index=True)
+    modified_boiler_fuel_df = pd.concat(
+        [boiler_fuel_df[~is_duplicate], aggregates.reset_index()],
+        ignore_index=True,
+    )

     return modified_boiler_fuel_df
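Both duplicate-resolution hunks share a shape: split off the rows whose natural key is duplicated, aggregate them down to one row per key, and concatenate the result back onto the untouched rows. A toy version, using a simple sum as the stand-in aggregation:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "fuel_mmbtu": [1.0, 2.0, 5.0]})

# Flag every row that shares a key with another row.
is_duplicate = df.duplicated(subset=["key"], keep=False)

# Collapse the duplicated keys to one row each (summing here for illustration).
aggregates = df[is_duplicate].groupby("key").sum()

# Reattach the aggregated rows to the untouched ones, resetting the index.
out = pd.concat([df[~is_duplicate], aggregates.reset_index()], ignore_index=True)
```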

@@ -988,7 +990,7 @@ def coalmine(eia923_dfs, eia923_transformed_dfs):
     cmi_with_msha = cmi_df[cmi_df['mine_id_msha'] > 0]
     cmi_with_msha = cmi_with_msha.drop_duplicates(subset=['mine_id_msha', ])
     cmi_df.drop(cmi_df[cmi_df['mine_id_msha'] > 0].index)
-    cmi_df.append(cmi_with_msha)
+    cmi_df = pd.concat([cmi_df, cmi_with_msha])

     cmi_df = cmi_df.drop_duplicates(subset=['mine_name',
                                             'state',