Skip to content

Commit

Permalink
Merge fix/#431 into CI
Browse files Browse the repository at this point in the history
Merge branch 'fix/#431-fill-missing-hh-for-populated-cells' into continuous-integration/run-everything-over-the-weekend-v2
  • Loading branch information
nailend committed Jan 17, 2022
2 parents 632214e + 1116bc1 commit 83d1257
Show file tree
Hide file tree
Showing 2 changed files with 156 additions and 102 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -308,6 +308,8 @@ Changed
* H2 demand is met from the H2_grid buses. In addition, it can be met from the
H2_saltcavern buses if a proximity criterion is fulfilled
`#620 <https://github.com/openego/eGon-data/issues/620>`_
* Fill missing household data for populated cells
`#431 <https://github.com/openego/eGon-data/issues/431>`_


Bug fixes
Expand Down
256 changes: 154 additions & 102 deletions src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,10 @@
attribute 'INSGESAMT'. As the profiles are scaled with demand-regio data at
nuts3-level the impact at a higher aggregation level is negligible.
For sake of simplicity, the data is not corrected.
* There are cells without household data but with a population. A randomly
chosen household distribution is taken from a subgroup of cells with the same
population value and applied to all cells with missing household
distribution and that specific population value.
Notes
-----
Expand Down Expand Up @@ -133,87 +137,6 @@
# Get random seed from config
RANDOM_SEED = egon.data.config.settings()["egon-data"]["--random-seed"]

# Define mapping of census household family types to Eurostat household types
# - Adults living in households type
# - number of kids are not included even if mentioned in household type name
# **! The Eurostat data only counts adults/seniors, excluding kids <15**
# Eurostat household types are used for demand-profile-generator
# @iee-fraunhofer
HH_TYPES = {
"SR": [
("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"),
("Alleinerziehende Elternteile", "Insgesamt", "Seniors"),
],
# Single Seniors Single Parents Seniors
"SO": [
("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults")
], # Single Adults
"SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")],
# Single Parents Adult
"PR": [
("Paare ohne Kind(er)", "2 Personen", "Seniors"),
("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Seniors"),
],
# Couples without Kids Senior & same sex couples & shared flat seniors
"PO": [
("Paare ohne Kind(er)", "2 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"),
],
# Couples without Kids adults & same sex couples & shared flat adults
"P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")],
"P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")],
"P3": [
("Paare mit Kind(ern)", "5 Personen", "Adults"),
("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"),
],
"OR": [
("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Seniors"),
("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Seniors"),
("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Seniors"),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"6 und mehr Personen",
"Seniors",
),
("Paare mit Kind(ern)", "3 Personen", "Seniors"),
("Paare ohne Kind(er)", "3 Personen", "Seniors"),
("Paare mit Kind(ern)", "4 Personen", "Seniors"),
("Paare ohne Kind(er)", "4 Personen", "Seniors"),
("Paare mit Kind(ern)", "5 Personen", "Seniors"),
("Paare ohne Kind(er)", "5 Personen", "Seniors"),
("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"),
("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"),
],
# no info about share of kids
# OO, O1, O2 have the same amount, as no information about the share of
# kids within census data set. If needed the total amount can be estimated
# in the :func:`get_hh_dist` function using multi_adjust=True option.
# The Eurostat share is then applied.
"OO": [
("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"6 und mehr Personen",
"Adults",
),
("Paare ohne Kind(er)", "3 Personen", "Adults"),
("Paare ohne Kind(er)", "4 Personen", "Adults"),
("Paare ohne Kind(er)", "5 Personen", "Adults"),
("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"),
],
# no info about share of kids
}

MAPPING_ZENSUS_HH_SUBGROUPS = {
1: ["SR", "SO"],
2: ["PR", "PO"],
3: ["SK"],
4: ["P1", "P2", "P3"],
5: ["OR", "OO"],
}


class IeeHouseholdLoadProfiles(Base):
__tablename__ = "iee_household_load_profiles"
Expand Down Expand Up @@ -250,7 +173,7 @@ class EgonEtragoElectricityHouseholds(Base):
setup = partial(
Dataset,
name="HH Demand",
version="0.0.4",
version="0.0.5",
dependencies=[],
# Tasks are declared in pipeline as function is used multiple times with
# different args.
Expand Down Expand Up @@ -769,36 +692,43 @@ def process_nuts1_census_data(df_census_households_raw):


def fill_missing_hh_in_populated_cells(df_census_households_grid):
"""
"""There are cells without household data but a population. A randomly
chosen household distribution is taken from a subgroup of cells with same
population value and applied to all cells with missing household
distribution and the specific population value. If there is no subgroup
with household data for the respective population value, the fallback is
the subgroup with the last smaller population value.
Parameters
----------
df_census_households_grid: pd.DataFrame
cleaned zensus household type x age category data
census household data at 100x100m grid level
Returns
-------
pd.DataFrame
zensus household data at 100x100m grid level"""
substituted census household data at 100x100m grid level"""

df_w_hh = df_census_households_grid.dropna().reset_index(drop=True)
df_wo_hh = df_census_households_grid.loc[
df_census_households_grid.isna().any(axis=1)
].reset_index(drop=True)

# iterate over unique population values
for i in df_wo_hh["population"].sort_values().unique():
for population in df_wo_hh["population"].sort_values().unique():

# create fallback if no cell with specific population available
if i in df_w_hh["population"].unique():
last_i = i
if population in df_w_hh["population"].unique():
fallback_value = population
population_value = population
# use fallback of last possible household distribution
else:
# print(i)
i = last_i
population_value = fallback_value

# get cells with specific population value from cells with household distribution
df_w_hh_population_i = df_w_hh.loc[df_w_hh["population"] == i]
df_w_hh_population_i = df_w_hh.loc[
df_w_hh["population"] == population_value
]
# choose random cell within this group
rnd_cell_id_population_i = np.random.choice(
df_w_hh_population_i["cell_id"].unique()
Expand All @@ -808,7 +738,9 @@ def fill_missing_hh_in_populated_cells(df_census_households_grid):
df_w_hh_population_i["cell_id"] == rnd_cell_id_population_i
]
# get cells with specific population value from cells without household distribution
df_wo_hh_population_i = df_wo_hh.loc[df_wo_hh["population"] == i]
df_wo_hh_population_i = df_wo_hh.loc[
df_wo_hh["population"] == population
]

# all cells will get the same random household distribution

Expand All @@ -832,7 +764,26 @@ def fill_missing_hh_in_populated_cells(df_census_households_grid):


def get_census_households_grid():
""""""
"""Query census household data at 100x100m grid level from database. As
there is a divergence in the census household data depending on which
attribute is used, and as there are cells with population but without
household data, the missing data in these cases are substituted. First,
census household data with attribute 'HHTYP_FAM' is missing for some cells
with a small number of households. This data is generated using the average
share of household
types for cells with similar household number. For some cells the summed
amount of households per type deviates from the total number with attribute
'INSGESAMT'. As the profiles are scaled with demand-regio data at
nuts3-level the impact at a higher aggregation level is negligible. For sake
of simplicity, the data is not corrected. Secondly, cells without household
data but population value are covered. A randomly chosen household
distribution is taken from a group of cells with same population value and
applied to all cells with missing household distribution and the specific
population value.
Returns
-------
pd.DataFrame
zensus household data at 100x100m grid level"""

# Retrieve information about households for each census cell
# Only use cell-data which quality (quantity_q<2) is acceptable
Expand Down Expand Up @@ -926,9 +877,13 @@ def get_census_households_grid():
right_on="grid_id",
how="right",
)
df_census_households_grid = df_census_households_grid.sort_values(
["cell_id", "characteristics_code"]
)

# fill cells with missing household distribution data but population
# by distribution of random cell with same population value

df_census_households_grid = fill_missing_hh_in_populated_cells(
df_census_households_grid
)
Expand All @@ -941,11 +896,11 @@ def refine_census_data_at_cell_level(
):
"""The zensus data is processed to define the number and type of households
per zensus cell. Two subsets of the zensus data are merged to fit the
IEE profiles specifications. For this, the dataset 'HHTYP_FAM' is
converted from people living in households to number of households of
specific size using the category 'HHGROESS_KLASS' wherever the amount
of people is not trivial (OR, OO). Kids are not counted. Missing data
in 'HHTYP_FAM' is substituted in :func:`create_missing_zensus_data`.
IEE profiles specifications. For this, the dataset of people living in
households at NUTS-1 is converted to number of households of
specific size. The data of category 'HHGROESS_KLASS' in census households
at grid level is used to determine an average wherever the amount
of people is not trivial (OR, OO). Kids are not counted.
Parameters
----------
Expand All @@ -959,10 +914,107 @@ def refine_census_data_at_cell_level(
pd.DataFrame
Number of hh types per census cell and scaling factors
"""
# Define mapping of census household family types to Eurostat household types
# - Adults living in households type
# - number of kids are not included even if mentioned in household type name
# **! The Eurostat data only counts adults/seniors, excluding kids <15**
# Eurostat household types are used for demand-profile-generator
# @iee-fraunhofer
hh_types_eurostat = {
"SR": [
("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"),
("Alleinerziehende Elternteile", "Insgesamt", "Seniors"),
],
# Single Seniors Single Parents Seniors
"SO": [
("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults")
], # Single Adults
"SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")],
# Single Parents Adult
"PR": [
("Paare ohne Kind(er)", "2 Personen", "Seniors"),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"2 Personen",
"Seniors",
),
],
# Couples without Kids Senior & same sex couples & shared flat seniors
"PO": [
("Paare ohne Kind(er)", "2 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"),
],
# Couples without Kids adults & same sex couples & shared flat adults
"P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")],
"P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")],
"P3": [
("Paare mit Kind(ern)", "5 Personen", "Adults"),
("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"),
],
"OR": [
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"3 Personen",
"Seniors",
),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"4 Personen",
"Seniors",
),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"5 Personen",
"Seniors",
),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"6 und mehr Personen",
"Seniors",
),
("Paare mit Kind(ern)", "3 Personen", "Seniors"),
("Paare ohne Kind(er)", "3 Personen", "Seniors"),
("Paare mit Kind(ern)", "4 Personen", "Seniors"),
("Paare ohne Kind(er)", "4 Personen", "Seniors"),
("Paare mit Kind(ern)", "5 Personen", "Seniors"),
("Paare ohne Kind(er)", "5 Personen", "Seniors"),
("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"),
("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"),
],
# no info about share of kids
# OO, O1, O2 have the same amount, as no information about the share of
# kids within census data set. If needed the total amount can be estimated
# in the :func:`get_hh_dist` function using multi_adjust=True option.
# The Eurostat share is then applied.
"OO": [
("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"),
("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"),
(
"Mehrpersonenhaushalte ohne Kernfamilie",
"6 und mehr Personen",
"Adults",
),
("Paare ohne Kind(er)", "3 Personen", "Adults"),
("Paare ohne Kind(er)", "4 Personen", "Adults"),
("Paare ohne Kind(er)", "5 Personen", "Adults"),
("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"),
],
# no info about share of kids
}

mapping_zensus_hh_subgroups = {
1: ["SR", "SO"],
2: ["PR", "PO"],
3: ["SK"],
4: ["P1", "P2", "P3"],
5: ["OR", "OO"],
}
# :func:`get_hh_dist` without eurostat adjustment for O1-03 Groups in
# absolute values
df_hh_types_nad_abs = get_hh_dist(df_census_households_nuts1, HH_TYPES)
df_hh_types_nad_abs = get_hh_dist(
df_census_households_nuts1, hh_types_eurostat
)

# Get household size for each census cell grouped by
# As this is only used to estimate size of households for OR, OO
Expand Down Expand Up @@ -1002,20 +1054,20 @@ def refine_census_data_at_cell_level(

# Calculate fraction of fine household types within subgroup of
# rough household types
for value in MAPPING_ZENSUS_HH_SUBGROUPS.values():
for value in mapping_zensus_hh_subgroups.values():
df_dist_households.loc[value] = df_dist_households.loc[value].div(
df_dist_households.loc[value].sum()
)

# Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m
# by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS
# by refining hh-groups with mapping_zensus_hh_subgroups
df_census_households_grid_refined = pd.DataFrame()
for (country, code), df_country_type in df_census_households_grid.groupby(
["gen", "characteristics_code"]
):

# iterate over zensus_country subgroups
for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]:
for typ in mapping_zensus_hh_subgroups[code]:
df_country_type["hh_type"] = typ
df_country_type["factor"] = df_dist_households.loc[typ, country]
df_country_type["hh_10types"] = (
Expand Down

0 comments on commit 83d1257

Please sign in to comment.