Merge fix/#431 into CI

Merge branch 'fix/#431-fill-missing-hh-for-populated-cells' into continuous-integration/run-everything-over-the-weekend-v2
openego · Jan 17, 2022 · 83d1257 · 83d1257
2 parents 632214e + 1116bc1
commit 83d1257
Show file tree

Hide file tree

Showing 2 changed files with 156 additions and 102 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -308,6 +308,8 @@ Changed
 * H2 demand is met from the H2_grid buses. In Addtion, it can be met from the
   H2_saltcavern buses if a proximity criterion is fulfilled
   `#620 <https://github.com/openego/eGon-data/issues/620>`_
+* Fill missing household data for populated cells
+  `#431 <https://github.com/openego/eGon-data/issues/431>`_
 
 
 Bug fixes

diff --git a/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py b/src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py
@@ -101,6 +101,10 @@
  attribute 'INSGESAMT'. As the profiles are scaled with demand-regio data at
  nuts3-level the impact at a higher aggregation level is negligible.
  For sake of simplicity, the data is not corrected.
+* There are cells without household data but with a population. A randomly
+ chosen household distribution is taken from a subgroup of cells with the same
+ population value and applied to all cells with missing household distribution
+ and that specific population value.
 
 Notes
 -----
@@ -133,87 +137,6 @@
 # Get random seed from config
 RANDOM_SEED = egon.data.config.settings()["egon-data"]["--random-seed"]
 
-# Define mapping of census household family types to Eurostat household types
-# - Adults living in households type
-# - number of kids are  not included even if mentioned in household type name
-# **! The Eurostat data only counts adults/seniors, excluding kids <15**
-# Eurostat household types are used for demand-profile-generator
-# @iee-fraunhofer
-HH_TYPES = {
-    "SR": [
-        ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"),
-        ("Alleinerziehende Elternteile", "Insgesamt", "Seniors"),
-    ],
-    # Single Seniors Single Parents Seniors
-    "SO": [
-        ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults")
-    ],  # Single Adults
-    "SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")],
-    # Single Parents Adult
-    "PR": [
-        ("Paare ohne Kind(er)", "2 Personen", "Seniors"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Seniors"),
-    ],
-    # Couples without Kids Senior & same sex couples & shared flat seniors
-    "PO": [
-        ("Paare ohne Kind(er)", "2 Personen", "Adults"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"),
-    ],
-    # Couples without Kids adults & same sex couples & shared flat adults
-    "P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")],
-    "P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")],
-    "P3": [
-        ("Paare mit Kind(ern)", "5 Personen", "Adults"),
-        ("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"),
-    ],
-    "OR": [
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Seniors"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Seniors"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Seniors"),
-        (
-            "Mehrpersonenhaushalte ohne Kernfamilie",
-            "6 und mehr Personen",
-            "Seniors",
-        ),
-        ("Paare mit Kind(ern)", "3 Personen", "Seniors"),
-        ("Paare ohne Kind(er)", "3 Personen", "Seniors"),
-        ("Paare mit Kind(ern)", "4 Personen", "Seniors"),
-        ("Paare ohne Kind(er)", "4 Personen", "Seniors"),
-        ("Paare mit Kind(ern)", "5 Personen", "Seniors"),
-        ("Paare ohne Kind(er)", "5 Personen", "Seniors"),
-        ("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"),
-        ("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"),
-    ],
-    # no info about share of kids
-    # OO, O1, O2 have the same amount, as no information about the share of
-    # kids within census data set. If needed the total amount can be estimated
-    # in the :func:`get_hh_dist` function using multi_adjust=True option.
-    # The Eurostat share is then applied.
-    "OO": [
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"),
-        ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"),
-        (
-            "Mehrpersonenhaushalte ohne Kernfamilie",
-            "6 und mehr Personen",
-            "Adults",
-        ),
-        ("Paare ohne Kind(er)", "3 Personen", "Adults"),
-        ("Paare ohne Kind(er)", "4 Personen", "Adults"),
-        ("Paare ohne Kind(er)", "5 Personen", "Adults"),
-        ("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"),
-    ],
-    # no info about share of kids
-}
-
-MAPPING_ZENSUS_HH_SUBGROUPS = {
-    1: ["SR", "SO"],
-    2: ["PR", "PO"],
-    3: ["SK"],
-    4: ["P1", "P2", "P3"],
-    5: ["OR", "OO"],
-}
-
 
 class IeeHouseholdLoadProfiles(Base):
     __tablename__ = "iee_household_load_profiles"
@@ -250,7 +173,7 @@ class EgonEtragoElectricityHouseholds(Base):
 setup = partial(
     Dataset,
     name="HH Demand",
-    version="0.0.4",
+    version="0.0.5",
     dependencies=[],
     # Tasks are declared in pipeline as function is used multiple times with
     # different args.
@@ -769,36 +692,43 @@ def process_nuts1_census_data(df_census_households_raw):
 
 
 def fill_missing_hh_in_populated_cells(df_census_households_grid):
-    """
+    """There are cells without household data but a population. A randomly
+    chosen household distribution is taken from a subgroup of cells with same
+    population value and applied to all cells with missing household
+    distribution and the specific population value. If there is no subgroup
+    with household data for the respective population value, the fallback is
+    the subgroup with the last smaller population value.
 
     Parameters
     ----------
     df_census_households_grid: pd.DataFrame
-        cleaned zensus household type x age category data
+        census household data at 100x100m grid level
 
     Returns
     -------
     pd.DataFrame
-        zensus household data at 100x100m grid level"""
+        substituted census household data at 100x100m grid level"""
 
     df_w_hh = df_census_households_grid.dropna().reset_index(drop=True)
     df_wo_hh = df_census_households_grid.loc[
         df_census_households_grid.isna().any(axis=1)
     ].reset_index(drop=True)
 
     # iterate over unique population values
-    for i in df_wo_hh["population"].sort_values().unique():
+    for population in df_wo_hh["population"].sort_values().unique():
 
         # create fallback if no cell with specific population available
-        if i in df_w_hh["population"].unique():
-            last_i = i
+        if population in df_w_hh["population"].unique():
+            fallback_value = population
+            population_value = population
         # use fallback of last possible household distribution
         else:
-            #         print(i)
-            i = last_i
+            population_value = fallback_value
 
         # get cells with specific population value from cells with household distribution
-        df_w_hh_population_i = df_w_hh.loc[df_w_hh["population"] == i]
+        df_w_hh_population_i = df_w_hh.loc[
+            df_w_hh["population"] == population_value
+        ]
         # choose random cell within this group
         rnd_cell_id_population_i = np.random.choice(
             df_w_hh_population_i["cell_id"].unique()
@@ -808,7 +738,9 @@ def fill_missing_hh_in_populated_cells(df_census_households_grid):
             df_w_hh_population_i["cell_id"] == rnd_cell_id_population_i
         ]
         # get cells with specific population value from cells without household distribution
-        df_wo_hh_population_i = df_wo_hh.loc[df_wo_hh["population"] == i]
+        df_wo_hh_population_i = df_wo_hh.loc[
+            df_wo_hh["population"] == population
+        ]
 
         # all cells will get the same random household distribution
 
@@ -832,7 +764,26 @@ def fill_missing_hh_in_populated_cells(df_census_households_grid):
 
 
 def get_census_households_grid():
-    """"""
+    """Query census household data at 100x100m grid level from database. The
+    census household data diverges depending on which attribute is used.
+    There also exist cells without household data but with population data.
+    The missing data in these cases is substituted. First, census household data
+    with attribute 'HHTYP_FAM' is missing for some cells with small amount
+    of households. This data is generated using the average share of household
+    types for cells with similar household number. For some cells the summed
+    amount of households per type deviates from the total number with attribute
+    'INSGESAMT'. As the profiles are scaled with demand-regio data at
+    nuts3-level the impact at a higher aggregation level is negligible. For sake
+    of simplicity, the data is not corrected. Secondly, cells without household
+    data but population value are covered. A randomly chosen household
+    distribution is taken from a group of cells with same population value and
+    applied to all cells with missing household distribution and the specific
+    population value.
+
+    Returns
+    -------
+    pd.DataFrame
+        zensus household data at 100x100m grid level"""
 
     # Retrieve information about households for each census cell
     # Only use cell-data which quality (quantity_q<2) is acceptable
@@ -926,9 +877,13 @@ def get_census_households_grid():
         right_on="grid_id",
         how="right",
     )
+    df_census_households_grid = df_census_households_grid.sort_values(
+        ["cell_id", "characteristics_code"]
+    )
 
     # fill cells with missing household distribution data but population
     # by distribution of random cell with same population value
+
     df_census_households_grid = fill_missing_hh_in_populated_cells(
         df_census_households_grid
     )
@@ -941,11 +896,11 @@ def refine_census_data_at_cell_level(
 ):
     """The zensus data is processed to define the number and type of households
     per zensus cell. Two subsets of the zensus data are merged to fit the
-    IEE profiles specifications. For this, the dataset 'HHTYP_FAM' is
-    converted from people living in households to number of households of
-    specific size using the category 'HHGROESS_KLASS' wherever the amount
-    of people is not trivial (OR, OO). Kids are not counted. Missing data
-    in 'HHTYP_FAM' is substituted in :func:`create_missing_zensus_data`.
+    IEE profiles specifications. For this, the dataset of people living in
+    households at NUTS-1 is converted to number of households of
+    specific size. The data of category 'HHGROESS_KLASS' in census households
+    at grid level is used to determine an average wherever the amount
+    of people is not trivial (OR, OO). Kids are not counted.
 
     Parameters
     ----------
@@ -959,10 +914,107 @@ def refine_census_data_at_cell_level(
     pd.DataFrame
         Number of hh types per census cell and scaling factors
     """
+    # Define mapping of census household family types to Eurostat household types
+    # - Adults living in households type
+    # - number of kids is not included even if mentioned in household type name
+    # **! The Eurostat data only counts adults/seniors, excluding kids <15**
+    # Eurostat household types are used for demand-profile-generator
+    # @iee-fraunhofer
+    hh_types_eurostat = {
+        "SR": [
+            ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Seniors"),
+            ("Alleinerziehende Elternteile", "Insgesamt", "Seniors"),
+        ],
+        # Single Seniors Single Parents Seniors
+        "SO": [
+            ("Einpersonenhaushalte (Singlehaushalte)", "Insgesamt", "Adults")
+        ],  # Single Adults
+        "SK": [("Alleinerziehende Elternteile", "Insgesamt", "Adults")],
+        # Single Parents Adult
+        "PR": [
+            ("Paare ohne Kind(er)", "2 Personen", "Seniors"),
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "2 Personen",
+                "Seniors",
+            ),
+        ],
+        # Couples without Kids Senior & same sex couples & shared flat seniors
+        "PO": [
+            ("Paare ohne Kind(er)", "2 Personen", "Adults"),
+            ("Mehrpersonenhaushalte ohne Kernfamilie", "2 Personen", "Adults"),
+        ],
+        # Couples without Kids adults & same sex couples & shared flat adults
+        "P1": [("Paare mit Kind(ern)", "3 Personen", "Adults")],
+        "P2": [("Paare mit Kind(ern)", "4 Personen", "Adults")],
+        "P3": [
+            ("Paare mit Kind(ern)", "5 Personen", "Adults"),
+            ("Paare mit Kind(ern)", "6 und mehr Personen", "Adults"),
+        ],
+        "OR": [
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "3 Personen",
+                "Seniors",
+            ),
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "4 Personen",
+                "Seniors",
+            ),
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "5 Personen",
+                "Seniors",
+            ),
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "6 und mehr Personen",
+                "Seniors",
+            ),
+            ("Paare mit Kind(ern)", "3 Personen", "Seniors"),
+            ("Paare ohne Kind(er)", "3 Personen", "Seniors"),
+            ("Paare mit Kind(ern)", "4 Personen", "Seniors"),
+            ("Paare ohne Kind(er)", "4 Personen", "Seniors"),
+            ("Paare mit Kind(ern)", "5 Personen", "Seniors"),
+            ("Paare ohne Kind(er)", "5 Personen", "Seniors"),
+            ("Paare mit Kind(ern)", "6 und mehr Personen", "Seniors"),
+            ("Paare ohne Kind(er)", "6 und mehr Personen", "Seniors"),
+        ],
+        # no info about share of kids
+        # OO, O1, O2 have the same amount, as no information about the share of
+        # kids within census data set. If needed the total amount can be estimated
+        # in the :func:`get_hh_dist` function using multi_adjust=True option.
+        # The Eurostat share is then applied.
+        "OO": [
+            ("Mehrpersonenhaushalte ohne Kernfamilie", "3 Personen", "Adults"),
+            ("Mehrpersonenhaushalte ohne Kernfamilie", "4 Personen", "Adults"),
+            ("Mehrpersonenhaushalte ohne Kernfamilie", "5 Personen", "Adults"),
+            (
+                "Mehrpersonenhaushalte ohne Kernfamilie",
+                "6 und mehr Personen",
+                "Adults",
+            ),
+            ("Paare ohne Kind(er)", "3 Personen", "Adults"),
+            ("Paare ohne Kind(er)", "4 Personen", "Adults"),
+            ("Paare ohne Kind(er)", "5 Personen", "Adults"),
+            ("Paare ohne Kind(er)", "6 und mehr Personen", "Adults"),
+        ],
+        # no info about share of kids
+    }
 
+    mapping_zensus_hh_subgroups = {
+        1: ["SR", "SO"],
+        2: ["PR", "PO"],
+        3: ["SK"],
+        4: ["P1", "P2", "P3"],
+        5: ["OR", "OO"],
+    }
     # :func:`get_hh_dist` without eurostat adjustment for O1-03 Groups in
     # absolute values
-    df_hh_types_nad_abs = get_hh_dist(df_census_households_nuts1, HH_TYPES)
+    df_hh_types_nad_abs = get_hh_dist(
+        df_census_households_nuts1, hh_types_eurostat
+    )
 
     # Get household size for each census cell grouped by
     # As this is only used to estimate size of households for OR, OO
@@ -1002,20 +1054,20 @@ def refine_census_data_at_cell_level(
 
     # Calculate fraction of fine household types within subgroup of
     # rough household types
-    for value in MAPPING_ZENSUS_HH_SUBGROUPS.values():
+    for value in mapping_zensus_hh_subgroups.values():
         df_dist_households.loc[value] = df_dist_households.loc[value].div(
             df_dist_households.loc[value].sum()
         )
 
     # Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m
-    # by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS
+    # by refining hh-groups with mapping_zensus_hh_subgroups
     df_census_households_grid_refined = pd.DataFrame()
     for (country, code), df_country_type in df_census_households_grid.groupby(
         ["gen", "characteristics_code"]
     ):
 
         # iterate over zenus_country subgroups
-        for typ in MAPPING_ZENSUS_HH_SUBGROUPS[code]:
+        for typ in mapping_zensus_hh_subgroups[code]:
             df_country_type["hh_type"] = typ
             df_country_type["factor"] = df_dist_households.loc[typ, country]
             df_country_type["hh_10types"] = (