Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use pd.NA where appropriate for ENUM and categorical fields #1376

Merged
merged 11 commits into from
Dec 30, 2021
Merged
1 change: 1 addition & 0 deletions src/pudl/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,7 @@ class Partition(TypedDict, total=False):
"report_year": pd.Int64Dtype(),
"utility_id_ferc1": pd.Int64Dtype(),
"utility_id_pudl": pd.Int64Dtype(),
"construction_type": pd.StringDtype(),
},
"ferc714": { # INCOMPLETE
"demand_mwh": float,
Expand Down
1 change: 0 additions & 1 deletion src/pudl/metadata/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,5 @@
"Substitute",
"Undetermined", # Should be replaced with NA
"Unknown Code", # Should be replaced with NA
katie-lamb marked this conversation as resolved.
Show resolved Hide resolved
"", # Should be replaced with NA
]
"""Valid emissions measurement codes for the EPA CEMS hourly data."""
15 changes: 11 additions & 4 deletions src/pudl/metadata/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,7 @@
"type": "string",
"description": "Type of plant construction ('outdoor', 'semioutdoor', or 'conventional'). Categorized by PUDL based on our best guess of intended value in FERC1 freeform strings.",
"constraints": {
"enum": ["", "unknown", "conventional", "outdoor", "semioutdoor"]
"enum": ["conventional", "outdoor", "semioutdoor"]
}
},
"construction_year": {
Expand Down Expand Up @@ -1077,7 +1077,7 @@
"type": "string",
"description": "Contract type for natrual gas delivery service:",
"constraints": {
"enum": ["", "firm", "interruptible"]
"enum": ["firm", "interruptible"]
}
},
"natural_gas_local_distribution_company": {
Expand Down Expand Up @@ -1105,7 +1105,7 @@
"type": "string",
"description": "Contract type for natural gas transportation service.",
"constraints": {
"enum": ["", "firm", "interruptible"]
"enum": ["firm", "interruptible"]
}
},
"nerc_region": {
Expand Down Expand Up @@ -2293,7 +2293,6 @@
"fuel_units": {
"constraints": {
"enum": [
"unknown",
"mmbtu",
"gramsU",
"kgU",
Expand Down Expand Up @@ -2324,5 +2323,13 @@
"code": {
"type": "integer"
}
},
"plants_steam_ferc1": {
"plant_type": {
"type": "string",
"constraints": {
"enum": ['steam', 'combustion_turbine', 'combined_cycle', 'nuclear', 'geothermal', 'internal_combustion', 'wind', 'photovoltaic', 'solar_thermal']
zaneselvans marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}
1 change: 1 addition & 0 deletions src/pudl/transform/eia860.py
Original file line number Diff line number Diff line change
Expand Up @@ -579,6 +579,7 @@ def _make_phone_number(col1, col2, col3):
entity_type=lambda x: x.entity_type.map(ENTITY_TYPES)
)
.pipe(pudl.helpers.convert_to_date)
.fillna({'entity_type': pd.NA})
zaneselvans marked this conversation as resolved.
Show resolved Hide resolved
)

eia860_transformed_dfs['utilities_eia860'] = u_df
Expand Down
2 changes: 1 addition & 1 deletion src/pudl/transform/eia923.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,7 +1103,7 @@ def fuel_receipts_costs(eia923_dfs, eia923_transformed_dfs):
'natural_gas_delivery_contract_type_code'],
[{'firm': ['F'], 'interruptible': ['I']},
{'firm': ['F'], 'interruptible': ['I']}],
unmapped='')
unmapped=pd.NA)
)
frc_df = PUDL_META.get_resource("fuel_receipts_costs_eia923").encode(frc_df)
frc_df["fuel_type_code_pudl"] = (
Expand Down
48 changes: 35 additions & 13 deletions src/pudl/transform/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,8 +287,8 @@
'internl combustion', '*int. combustion (1)', 'internal conbustion',
],
"wind": [
'wind', 'wind energy', 'wind turbine', 'wind - turbine', 'wind generation'
'wind turbin'
'wind', 'wind energy', 'wind turbine', 'wind - turbine', 'wind generation',
'wind turbin',
],
"photovoltaic": ['solar photovoltaic', 'photovoltaic', 'solar', 'solar project'],
"solar_thermal": ['solar thermal'],
Expand Down Expand Up @@ -399,7 +399,8 @@
'conventionsl', 'conventiional', 'convntl steam plants', 'indoor const.',
'full indoor', 'indoor', 'indoor automatic', 'indoor boiler',
'(peak load) indoor', 'conventionl,indoor', 'conventionl, indoor',
'conventional, indoor', 'comb. cycle indoor', '3 indoor boiler',
'conventional, indoor', 'conventional;outdoor', 'conven./outdoor',
'conventional;semi-ou', 'comb. cycle indoor', '3 indoor boiler',
'2 indoor boilers', '1 indoor boiler', '2 indoor boiler',
'3 indoor boilers', 'fully contained', 'conv - b', 'conventional/boiler',
'cnventional', 'comb. cycle indooor', 'sonventional', 'ind enclosures',
Expand Down Expand Up @@ -429,7 +430,9 @@
'tower -10 unit', 'tower - 101 unit', '3 on 1 gas turbine', 'tower - 10 units',
'tower - 165 units', 'wind turbine', 'fixed tilt pv', 'tracking pv', 'o',
'wind trubine', 'subcritical', 'sucritical', 'simple cycle',
'simple & reciprocat', 'solar',
'simple & reciprocat', 'solar', 'pre-fab power plant', 'prefab power plant',
'prefab. power plant', 'pump storage', 'underground', 'see page 402',
'conv. underground', 'conven. underground', 'conventional (a)', 'non-applicable',
],
}
"""
Expand Down Expand Up @@ -724,6 +727,8 @@ def plants_steam(ferc1_raw_dfs, ferc1_transformed_dfs):
ferc1_transformed_dfs['fuel_ferc1'])
)
plants_steam_validate_ids(ferc1_steam_df)
ferc1_steam_df = ferc1_steam_df.replace(
{'construction_type': 'unknown', 'plant_type': 'unknown'}, pd.NA)
ferc1_transformed_dfs['plants_steam_ferc1'] = ferc1_steam_df
return ferc1_transformed_dfs

Expand Down Expand Up @@ -769,10 +774,7 @@ def _plants_steam_clean(ferc1_steam_df):
"expns_kwh": 'opex_per_kwh'})
.pipe(_clean_cols, "f1_steam")
.pipe(pudl.helpers.simplify_strings, ['plant_name_ferc1'])
.pipe(pudl.helpers.cleanstrings,
['construction_type', 'plant_type'],
[CONSTRUCTION_TYPE_STRINGS, PLANT_KIND_STRINGS],
unmapped='')
.pipe(pudl.helpers.cleanstrings, ['construction_type', 'plant_type'], [CONSTRUCTION_TYPE_STRINGS, PLANT_KIND_STRINGS], unmapped=pd.NA)
.pipe(pudl.helpers.oob_to_nan,
cols=["construction_year", "installation_year"],
lb=1850, ub=max(pc.WORKING_PARTITIONS["ferc1"]["years"]) + 1)
Expand All @@ -784,6 +786,11 @@ def _plants_steam_clean(ferc1_steam_df):
.drop(columns=["capex_per_kw", "opex_per_kwh", "net_generation_kwh"])
)

for col in ['construction_type', 'plant_type']:
if ferc1_steam_df[col].isnull().any():
raise AssertionError(
f"NA values found in {col} column during FERC 1 steam clean, add string to dictionary for this column"
)
return ferc1_steam_df


Expand Down Expand Up @@ -992,7 +999,7 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs):
# imperfect:
pipe(pudl.helpers.cleanstrings, ['fuel', 'fuel_unit'],
[FUEL_STRINGS, FUEL_UNIT_STRINGS],
unmapped='').
unmapped=pd.NA).
# Fuel cost per kWh is a per-unit value that doesn't make sense to
# report for a single fuel that may be only a small part of the fuel
# consumed. "fuel generaton" is heat rate, but as it's based only on
Expand All @@ -1016,6 +1023,11 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs):
'fuel_cost_btu': 'fuel_cost_per_mmbtu'})
)

if fuel_ferc1_df['fuel_units'].isnull().any():
raise AssertionError(
"NA values found in fuel_units column during FERC 1 fuel clean, add string to dictionary"
)

#########################################################################
# CORRECT DATA ENTRY ERRORS #############################################
#########################################################################
Expand Down Expand Up @@ -1075,6 +1087,8 @@ def fuel(ferc1_raw_dfs, ferc1_transformed_dfs):
# (for example) a "Total" line w/ only fuel_mmbtu_per_kwh on it. Grr.
fuel_ferc1_df.dropna(inplace=True)

# Replace "unknown" fuel unit with NAs - this comes after we drop missing data with NAs
fuel_ferc1_df = fuel_ferc1_df.replace({'fuel_units': 'unknown'}, pd.NA)
zaneselvans marked this conversation as resolved.
Show resolved Hide resolved
ferc1_transformed_dfs['fuel_ferc1'] = fuel_ferc1_df

return ferc1_transformed_dfs
Expand Down Expand Up @@ -1217,7 +1231,7 @@ def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs):
# white space -- necessary b/c plant_name is part of many foreign keys.
.pipe(pudl.helpers.simplify_strings, ['plant_name'])
.pipe(pudl.helpers.cleanstrings, ['plant_const'],
[CONSTRUCTION_TYPE_STRINGS], unmapped='')
[CONSTRUCTION_TYPE_STRINGS], unmapped=pd.NA)
.assign(
# Converting kWh to MWh
net_generation_mwh=lambda x: x.net_generation / 1000.0,
Expand Down Expand Up @@ -1273,7 +1287,11 @@ def plants_hydro(ferc1_raw_dfs, ferc1_transformed_dfs):
"capacity_mw"],
keep=False)
)

if ferc1_hydro_df['construction_type'].isnull().any():
raise AssertionError(
"NA values found in construction_type column during FERC1 hydro clean, add string to CONSTRUCTION_TYPE_STRINGS"
)
ferc1_hydro_df = ferc1_hydro_df.replace({'construction_type': 'unknown'}, pd.NA)
ferc1_transformed_dfs['plants_hydro_ferc1'] = ferc1_hydro_df
return ferc1_transformed_dfs

Expand Down Expand Up @@ -1301,7 +1319,7 @@ def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs):
.pipe(pudl.helpers.simplify_strings, ['plant_name'])
# Clean up the messy plant construction type column:
.pipe(pudl.helpers.cleanstrings, ['plant_kind'],
[CONSTRUCTION_TYPE_STRINGS], unmapped='')
[CONSTRUCTION_TYPE_STRINGS], unmapped=pd.NA)
.assign(
# Converting from kW/kWh to MW/MWh
net_generation_mwh=lambda x: x.net_generation / 1000.0,
Expand Down Expand Up @@ -1360,7 +1378,11 @@ def plants_pumped_storage(ferc1_raw_dfs, ferc1_transformed_dfs):
"capacity_mw"],
keep=False)
)

if ferc1_pump_df['construction_type'].isnull().any():
raise AssertionError(
"NA values found in construction_type column during FERC 1 pumped storage clean, add string to CONSTRUCTION_TYPE_STRINGS"
)
ferc1_pump_df = ferc1_pump_df.replace({'construction_type': 'unknown'}, pd.NA)
ferc1_transformed_dfs['plants_pumped_storage_ferc1'] = ferc1_pump_df
return ferc1_transformed_dfs

Expand Down