Minor changes to make pandas 1.4.0 work #1421

Merged (5 commits, Jan 26, 2022)
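Most hunks below make the same substitution: the `DataFrame.append` method, which pandas 1.4.0 deprecates, is replaced with the module-level `pd.concat`, which takes a list of frames and returns the same result. A minimal sketch of the equivalence, with made-up frames:

```python
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2]})
df2 = pd.DataFrame({"a": [3, 4]})

# Old style, deprecated in pandas 1.4:
# combined = df1.append(df2, ignore_index=True)

# New style, equivalent result:
combined = pd.concat([df1, df2], ignore_index=True)
print(combined["a"].tolist())  # [1, 2, 3, 4]
```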
2 changes: 1 addition & 1 deletion  setup.py
@@ -45,7 +45,7 @@
         "matplotlib~=3.0",  # Should make this optional with a "viz" extras
         "networkx>=2.2,<3",
         "numpy>=1.18.5,<2",
-        "pandas>=1.3,!=1.3.3,<1.4",  # IntCastingNaNError on v1.3.3 in unit tests
+        "pandas>=1.3,!=1.3.3,<1.5",  # IntCastingNaNError on v1.3.3 in unit tests
         "prefect[viz, gcp]~=0.15.0",
         "pyarrow>=5,<7",
         "pydantic[email]~=1.7",
2 changes: 1 addition & 1 deletion  src/pudl/analysis/mcoe.py
@@ -308,7 +308,7 @@ def fuel_cost(pudl_out):
         'fuel_cost_from_eiaapi', ]]

     fc = (
-        one_fuel.append(multi_fuel, sort=True)
+        pd.concat([one_fuel, multi_fuel], sort=True)
         .assign(
             fuel_cost_per_mwh=lambda x: x.fuel_cost_per_mmbtu * x.heat_rate_mmbtu_mwh
         )
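The `assign` in this chain is a unit conversion: dollars per MMBtu of fuel multiplied by a heat rate in MMBtu per MWh yields dollars per MWh. A quick check of the arithmetic with made-up numbers:

```python
fuel_cost_per_mmbtu = 2.50    # $/MMBtu of fuel burned (illustrative value)
heat_rate_mmbtu_mwh = 10.0    # MMBtu burned per MWh generated (illustrative value)
fuel_cost_per_mwh = fuel_cost_per_mmbtu * heat_rate_mmbtu_mwh
print(fuel_cost_per_mwh)      # 25.0 $/MWh
```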
7 changes: 5 additions & 2 deletions  src/pudl/extract/eia860m.py
@@ -81,6 +81,9 @@ def append_eia860m(eia860_raw_dfs, eia860m_raw_dfs):
     pages_eia860m = meta_eia860m.get_all_pages()
     # page names in 860m and 860 are the same.
     for page in pages_eia860m:
-        eia860_raw_dfs[page] = eia860_raw_dfs[page].append(
-            eia860m_raw_dfs[page], ignore_index=True, sort=True)
+        eia860_raw_dfs[page] = pd.concat(
+            [eia860_raw_dfs[page], eia860m_raw_dfs[page]],
+            ignore_index=True,
+            sort=True,
+        )
     return eia860_raw_dfs
5 changes: 0 additions & 5 deletions  src/pudl/extract/epacems.py
@@ -120,11 +120,6 @@ def _csv_to_dataframe(self, csv_file) -> pd.DataFrame:
         """
         Convert a CEMS csv file into a :class:`pandas.DataFrame`.

-        Note that some columns are not read. See
-        :mod:`pudl.constants.epacems_columns_to_ignore`. Data types for the columns
-        are specified in :mod:`pudl.constants.epacems_csv_dtypes` and names of the
-        output columns are set by :mod:`pudl.constants.epacems_rename_dict`.
-
         Args:
             csv (file-like object): data to be read
5 changes: 3 additions & 2 deletions  src/pudl/extract/excel.py
@@ -208,7 +208,7 @@ def extract(self, **partitions):
             if page in self.BLACKLISTED_PAGES:
                 logger.debug(f'Skipping blacklisted page {page}.')
                 continue
-            df = pd.DataFrame()
+            dfs = [pd.DataFrame(), ]
             for partition in pudl.helpers.iterate_multivalue_dict(**partitions):
                 # we are going to skip
                 if self.excel_filename(page, **partition) == '-1':
@@ -229,7 +229,8 @@
                 newdata = pudl.helpers.simplify_columns(newdata)
                 newdata = self.process_raw(newdata, page, **partition)
                 newdata = self.process_renamed(newdata, page, **partition)
-                df = df.append(newdata, sort=True, ignore_index=True)
+                dfs.append(newdata)
+            df = pd.concat(dfs, sort=True, ignore_index=True)

             # After all years are loaded, add empty columns that could appear
             # in other years so that df matches the database schema
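Collecting frames in a list and concatenating once after the loop, as this hunk does, also sidesteps the repeated full-copy cost of appending (or concatenating) inside the loop. A sketch of the pattern with placeholder data:

```python
import pandas as pd

chunks = [pd.DataFrame()]  # seed with an empty frame, mirroring dfs = [pd.DataFrame(), ]
for year in (2019, 2020, 2021):  # stand-in for the real partition loop
    chunks.append(pd.DataFrame({"year": [year], "value": [year * 1.5]}))

# One concat after the loop replaces repeated df.append() inside it,
# avoiding a copy of all accumulated data on every iteration.
df = pd.concat(chunks, sort=True, ignore_index=True)
```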
5 changes: 3 additions & 2 deletions  src/pudl/glue/ferc1_eia.py
@@ -185,7 +185,7 @@ def get_db_plants_ferc1(
     )

     # Add all the plants from the current table to our bigger list:
-    all_plants = all_plants.append(
+    db_plants = (
         pd.read_sql(plant_select, ferc1_engine)
         .rename(columns={
             "respondent_id": "utility_id_ferc1",
@@ -206,6 +206,7 @@
             "plant_table"
         ]]
     )
+    all_plants = pd.concat([all_plants, db_plants])

     # We don't want dupes, and sorting makes the whole thing more readable:
     all_plants = (
@@ -556,7 +557,7 @@ def get_unmapped_utils_eia(
     for table in data_tables_eia923:
         query = f"SELECT DISTINCT plant_id_eia FROM {table}"  # nosec
         new_ids = pd.read_sql(query, pudl_engine)
-        plant_ids = plant_ids.append(new_ids["plant_id_eia"])
+        plant_ids = pd.concat([plant_ids, new_ids["plant_id_eia"]])
     plant_ids_in_eia923 = sorted(set(plant_ids))

     utils_with_plants = (
2 changes: 1 addition & 1 deletion  src/pudl/output/eia860.py
@@ -386,7 +386,7 @@ def fill_generator_technology_description(gens_df: pd.DataFrame) -> pd.DataFrame
         out_df
         .sort_values("report_date")
         .groupby(["plant_id_eia", "generator_id", "energy_source_code_1"])
-        .technology_description.backfill()
+        .technology_description.bfill()
     )

     # Fill in remaining missing technology_descriptions with unique correspondences
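`bfill` is the surviving spelling: pandas 1.4.0 deprecates the `backfill`/`pad` aliases on groupby objects in favor of `bfill`/`ffill`. A toy version of the grouped backfill above:

```python
import pandas as pd

df = pd.DataFrame({
    "plant": [1, 1, 2, 2],
    "tech": [None, "solar", None, "wind"],
})

# Within each plant, fill missing values from the next non-null row in the group.
df["tech"] = df.groupby("plant")["tech"].bfill()
print(df["tech"].tolist())  # ['solar', 'solar', 'wind', 'wind']
```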
6 changes: 3 additions & 3 deletions  src/pudl/output/ferc714.py
@@ -296,7 +296,7 @@ def balancing_authority_eia861(self) -> pd.DataFrame:
             if key not in dfi.index:
                 rows.append({**ref, 'report_date': key[1]})
         # Append to original table
-        df = df.append(pd.DataFrame(rows))
+        df = pd.concat([df, pd.DataFrame(rows)])
         # Remove balancing authorities treated as utilities
         mask = df['balancing_authority_id_eia'].isin([util['id'] for util in UTILITIES])
         return df[~mask]
@@ -329,7 +329,7 @@ def balancing_authority_assn_eia861(self) -> pd.DataFrame:
                 tables.append(ref.assign(report_date=key[1]))
                 replaced |= mask
         # Append to original table with matching rows removed
-        df = df[~replaced].append(pd.concat(tables))
+        df = pd.concat([df[~replaced], pd.concat(tables)])
         # Remove balancing authorities treated as utilities
         mask = np.zeros(df.shape[0], dtype=bool)
         tables = []
@@ -359,7 +359,7 @@ def balancing_authority_assn_eia861(self) -> pd.DataFrame:
             tables.append(table)
             if 'replace' in util and util['replace']:
                 mask |= is_child
-        return df[~mask].append(pd.concat(tables)).drop_duplicates()
+        return pd.concat([df[~mask], pd.concat(tables)]).drop_duplicates()

     @cached_property
     def service_territory_eia861(self) -> pd.DataFrame:
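The last two hunks here follow a replace-rows idiom: filter out the rows flagged for replacement, then concatenate the corrected versions back on and deduplicate. A sketch with invented data:

```python
import pandas as pd

df = pd.DataFrame({"id": [1, 2, 3], "value": [10, 20, 30]})
fixed = pd.DataFrame({"id": [2], "value": [99]})  # corrected replacement rows

# Drop the rows being replaced, then concatenate the corrected versions back on.
replaced = df["id"].isin(fixed["id"])
df = pd.concat([df[~replaced], fixed]).drop_duplicates()
```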
23 changes: 14 additions & 9 deletions  src/pudl/transform/eia.py
@@ -284,7 +284,7 @@ def _lat_long(dirty_df, clean_df, entity_id_df, entity_id,
     ll_df = ll_df[ll_df[f'{col}_consistent']].drop_duplicates(subset=entity_id)
     logger.debug(f"Clean {col} records: {len(ll_df)}")
     # add the newly cleaned records
-    ll_clean_df = ll_clean_df.append(ll_df,)
+    ll_clean_df = pd.concat([ll_clean_df, ll_df])
     # merge onto the plants df w/ all plant ids
     ll_clean_df = entity_id_df.merge(ll_clean_df, how='outer')
     return ll_clean_df
@@ -354,7 +354,7 @@ def _add_additional_epacems_plants(plants_entity):
     # non-matching columns. It also requires an index, so we set and reset the
     # index as necessary. Also, it only works in-place, so we can't chain.
     plants_entity.update(cems_df, overwrite=True)
-    return plants_entity.append(cems_unmatched).reset_index()
+    return pd.concat([plants_entity, cems_unmatched]).reset_index()


 def _compile_all_entity_records(entity, eia_transformed_dfs):
@@ -602,10 +602,15 @@ def harvesting(entity,  # noqa: C901
                 f'Harvesting of {col} is too inconsistent at {ratio:.3}.')
         # add to a small df to be used in order to print out the ratio of
         # consistent records
-        consistency = consistency.append({'column': col,
-                                          'consistent_ratio': ratio,
-                                          'wrongos': wrongos,
-                                          'total': total}, ignore_index=True)
+        consistency = pd.concat([
+            consistency,
+            pd.DataFrame({
+                'column': [col],
+                'consistent_ratio': [ratio],
+                'wrongos': [wrongos],
+                'total': [total],
+            })], ignore_index=True
+        )
     mcs = consistency['consistent_ratio'].mean()
     logger.info(
         f"Average consistency of static {entity} values is {mcs:.2%}")
@@ -804,7 +809,7 @@ def _boiler_generator_assn(
     )

     bga_compiled_2 = (
-        bga_assn.append(bga_unassn)
+        pd.concat([bga_assn, bga_unassn])
         .fillna({'missing_from_923': True})
     )

@@ -832,7 +837,7 @@ def _boiler_generator_assn(
     bga_non_units = bga_compiled_2[bga_compiled_2['unit_id_eia'].isnull()]

     # combine the unit compilation and the non units
-    bga_compiled_3 = bga_non_units.append(bga_unit_compilation)
+    bga_compiled_3 = pd.concat([bga_non_units, bga_unit_compilation])

     bga_compiled_3 = bga_compiled_3[['plant_id_eia',
                                      'report_date',
@@ -935,7 +940,7 @@ def _boiler_generator_assn(
         nx.set_edge_attributes(
             unit, name='unit_id_pudl', values=unit_id + 1)
         new_unit_df = nx.to_pandas_edgelist(unit)
-        bga_w_units = bga_w_units.append(new_unit_df)
+        bga_w_units = pd.concat([bga_w_units, new_unit_df])

     bga_w_units = bga_w_units.drop(['source', 'target'], axis=1)
86 changes: 42 additions & 44 deletions  src/pudl/transform/eia861.py
@@ -1010,7 +1010,7 @@ def _harvest_associations(dfs, cols):
     assn = pd.DataFrame()
     for df in dfs:
         if set(df.columns).issuperset(set(cols)):
-            assn = assn.append(df[cols])
+            assn = pd.concat([assn, df[cols]])
     assn = assn.dropna().drop_duplicates()
     if assn.empty:
         raise ValueError(
@@ -1555,55 +1555,53 @@ def distributed_generation(tfr_dfs):
     ###########################################################################

     # Separate datasets into years with only pct values (pre-2010) and years with only mw values (post-2010)
-    df_pre_2010_tech = raw_dg_tech[raw_dg_tech['report_date'] < '2010-01-01']
-    df_post_2010_tech = raw_dg_tech[raw_dg_tech['report_date'] >= '2010-01-01']
-    df_pre_2010_misc = raw_dg_misc[raw_dg_misc['report_date'] < '2010-01-01']
-    df_post_2010_misc = raw_dg_misc[raw_dg_misc['report_date'] >= '2010-01-01']
-
-    logger.info(
-        'Converting pct values into mw values for distributed generation misc table')
-    transformed_dg_misc = (
-        df_pre_2010_misc.assign(
-            distributed_generation_owned_capacity_mw=lambda x: _pct_to_mw(
-                x, 'distributed_generation_owned_capacity_pct'),
-            backup_capacity_mw=lambda x: _pct_to_mw(x, 'backup_capacity_pct'),
-        ).append(df_post_2010_misc)
-        .drop(['distributed_generation_owned_capacity_pct',
-               'backup_capacity_pct',
-               'total_capacity_mw'], axis=1)
-    )
-
-    logger.info(
-        'Converting pct values into mw values for distributed generation tech table')
-    transformed_dg_tech = (
-        df_pre_2010_tech.assign(
-            combustion_turbine_capacity_mw=lambda x: (
-                _pct_to_mw(x, 'combustion_turbine_capacity_pct')),
-            hydro_capacity_mw=lambda x: _pct_to_mw(x, 'hydro_capacity_pct'),
-            internal_combustion_capacity_mw=lambda x: (
-                _pct_to_mw(x, 'internal_combustion_capacity_pct')),
-            other_capacity_mw=lambda x: _pct_to_mw(x, 'other_capacity_pct'),
-            steam_capacity_mw=lambda x: _pct_to_mw(x, 'steam_capacity_pct'),
-            wind_capacity_mw=lambda x: _pct_to_mw(x, 'wind_capacity_pct'),
-        ).append(df_post_2010_tech)
-        .drop([
-            'combustion_turbine_capacity_pct',
-            'hydro_capacity_pct',
-            'internal_combustion_capacity_pct',
-            'other_capacity_pct',
-            'steam_capacity_pct',
-            'wind_capacity_pct',
-            'total_capacity_mw'], axis=1
-        )
-    )
+    dg_tech_early = raw_dg_tech[raw_dg_tech['report_date'] < '2010-01-01']
+    dg_tech_late = raw_dg_tech[raw_dg_tech['report_date'] >= '2010-01-01']
+    dg_misc_early = raw_dg_misc[raw_dg_misc['report_date'] < '2010-01-01']
+    dg_misc_late = raw_dg_misc[raw_dg_misc['report_date'] >= '2010-01-01']
+
+    logger.info('Converting pct to MW for distributed generation misc table')
+    dg_misc_early = dg_misc_early.assign(
+        distributed_generation_owned_capacity_mw=lambda x: _pct_to_mw(
+            x, 'distributed_generation_owned_capacity_pct'),
+        backup_capacity_mw=lambda x: _pct_to_mw(x, 'backup_capacity_pct'),
+    )
+    dg_misc = pd.concat([dg_misc_early, dg_misc_late])
+    dg_misc = dg_misc.drop([
+        'distributed_generation_owned_capacity_pct',
+        'backup_capacity_pct',
+        'total_capacity_mw'
+    ], axis="columns")
+
+    logger.info('Converting pct into MW for distributed generation tech table')
+    dg_tech_early = dg_tech_early.assign(
+        combustion_turbine_capacity_mw=lambda x: (
+            _pct_to_mw(x, 'combustion_turbine_capacity_pct')),
+        hydro_capacity_mw=lambda x: _pct_to_mw(x, 'hydro_capacity_pct'),
+        internal_combustion_capacity_mw=lambda x: (
+            _pct_to_mw(x, 'internal_combustion_capacity_pct')),
+        other_capacity_mw=lambda x: _pct_to_mw(x, 'other_capacity_pct'),
+        steam_capacity_mw=lambda x: _pct_to_mw(x, 'steam_capacity_pct'),
+        wind_capacity_mw=lambda x: _pct_to_mw(x, 'wind_capacity_pct'),
+    )
+    dg_tech = pd.concat([dg_tech_early, dg_tech_late])
+    dg_tech = dg_tech.drop([
+        'combustion_turbine_capacity_pct',
+        'hydro_capacity_pct',
+        'internal_combustion_capacity_pct',
+        'other_capacity_pct',
+        'steam_capacity_pct',
+        'wind_capacity_pct',
+        'total_capacity_mw'
+    ], axis="columns")

     ###########################################################################
     # Tidy Data
     ###########################################################################

     logger.info('Tidying Distributed Generation Tech Table')
     tidy_dg_tech, tech_idx_cols = _tidy_class_dfs(
-        df=transformed_dg_tech,
+        df=dg_tech,
         df_name='Distributed Generation Tech Component Capacity',
         idx_cols=idx_cols,
         class_list=TECH_CLASSES,
@@ -1624,7 +1622,7 @@

     tfr_dfs["distributed_generation_tech_eia861"] = tidy_dg_tech
     tfr_dfs["distributed_generation_fuel_eia861"] = tidy_dg_fuel
-    tfr_dfs["distributed_generation_misc_eia861"] = transformed_dg_misc
+    tfr_dfs["distributed_generation_misc_eia861"] = dg_misc

     return tfr_dfs
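`_pct_to_mw` is not shown in this diff; assuming it scales a percentage column by the `total_capacity_mw` column (which is dropped once the conversion is done), the split-convert-concat pattern above reduces to roughly this sketch. The helper body and the column values are illustrative guesses, not PUDL's actual implementation:

```python
import pandas as pd

def _pct_to_mw(df: pd.DataFrame, pct_col: str) -> pd.Series:
    # Hypothetical body: convert a percent-of-total column into MW.
    return df["total_capacity_mw"] * df[pct_col] / 100

raw = pd.DataFrame({
    "report_date": pd.to_datetime(["2009-01-01", "2012-01-01"]),
    "total_capacity_mw": [100.0, None],
    "wind_capacity_pct": [25.0, None],
    "wind_capacity_mw": [None, 40.0],
})

early = raw[raw["report_date"] < "2010-01-01"]
late = raw[raw["report_date"] >= "2010-01-01"]

# Early years report percentages, so convert them; later years already report MW.
early = early.assign(wind_capacity_mw=lambda x: _pct_to_mw(x, "wind_capacity_pct"))
out = pd.concat([early, late]).drop(
    ["wind_capacity_pct", "total_capacity_mw"], axis="columns")
```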
10 changes: 6 additions & 4 deletions  src/pudl/transform/eia923.py
@@ -329,7 +329,7 @@ def _aggregate_generation_fuel_duplicates(

     # Add the resolved records back to generation_fuel dataframe.
     gen_df = gen_fuel[~is_duplicate].copy()
-    gen_df = gen_df.append(resolved_dupes)
+    gen_df = pd.concat([gen_df, resolved_dupes])

     if gen_df[natural_key_fields].isnull().any().any():
         raise AssertionError(
@@ -782,8 +782,10 @@ def _aggregate_duplicate_boiler_fuel_keys(
         _map_prime_mover_sets)

     # NOTE: the following method changes the order of the data and resets the index
-    modified_boiler_fuel_df = boiler_fuel_df[~is_duplicate].append(
-        aggregates.reset_index(), ignore_index=True)
+    modified_boiler_fuel_df = pd.concat(
+        [boiler_fuel_df[~is_duplicate], aggregates.reset_index()],
+        ignore_index=True,
+    )

     return modified_boiler_fuel_df
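Both duplicate-resolution hunks share a shape: split off the rows whose natural key is duplicated, aggregate them down to one row per key, and concatenate the result back onto the untouched rows. A toy version, using a simple sum as the stand-in aggregation:

```python
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "fuel_mmbtu": [1.0, 2.0, 5.0]})

# Flag every row that shares a key with another row.
is_duplicate = df.duplicated(subset=["key"], keep=False)

# Collapse the duplicated keys to one row each (summing here for illustration).
aggregates = df[is_duplicate].groupby("key").sum()

# Reattach the aggregated rows to the untouched ones, resetting the index.
out = pd.concat([df[~is_duplicate], aggregates.reset_index()], ignore_index=True)
```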

@@ -988,7 +990,7 @@ def coalmine(eia923_dfs, eia923_transformed_dfs):
     cmi_with_msha = cmi_df[cmi_df['mine_id_msha'] > 0]
     cmi_with_msha = cmi_with_msha.drop_duplicates(subset=['mine_id_msha', ])
     cmi_df.drop(cmi_df[cmi_df['mine_id_msha'] > 0].index)
-    cmi_df.append(cmi_with_msha)
+    cmi_df = pd.concat([cmi_df, cmi_with_msha])

     cmi_df = cmi_df.drop_duplicates(subset=['mine_name',
                                             'state',