Skip to content

Commit

Permalink
Merge fix/#431 into CI
Browse files Browse the repository at this point in the history
Merge branch 'fix/#431-fill-missing-hh-for-populated-cells' into continuous-integration/run-everything-over-the-weekend-v2
  • Loading branch information
nailend committed Jan 14, 2022
2 parents 80aef6a + 62dfd7c commit e2f4d97
Showing 1 changed file with 202 additions and 116 deletions.
318 changes: 202 additions & 116 deletions src/egon/data/datasets/electricity_demand_timeseries/hh_profiles.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,9 +62,9 @@
* calculate fraction of fine household types (10) within subgroup of rough
household types (5) :var:`df_dist_households`
* Spatial information about number of households per ha
:var:`df_households_typ` is mapped to NUTS1 and NUTS3 level.
Data is enriched with refined household subgroups via
:var:`df_dist_households` in :var:`df_zensus_cells`.
:var:`df_census_households_nuts3` is mapped to NUTS1 and NUTS3 level.
Data is refined with household subgroups via
:var:`df_dist_households` to :var:`df_census_households_grid_refined`.
* Enriched 100 x 100 m household dataset is used to sample and aggregate
household profiles. A table including individual profile id's for each cell
and scaling factor to match Demand-Regio annual sum projections for 2035
Expand Down Expand Up @@ -432,7 +432,7 @@ def set_multiindex_to_profiles(hh_profiles):
return hh_profiles


def get_zensus_households_raw():
def get_census_households_nuts1_raw():
"""Get zensus age x household type data from egon-data-bundle
Dataset about household size with information about the categories:
Expand All @@ -441,7 +441,7 @@ def get_zensus_households_raw():
* age class
* household size
for Germany in spatial resolution of federal states.
for Germany in spatial resolution of federal states NUTS-1.
Data manually selected and retrieved from:
https://ergebnisse2011.zensus2022.de/datenbank/online
Expand Down Expand Up @@ -768,7 +768,177 @@ def process_nuts1_census_data(df_census_households_raw):
return df_census_households


def refine_census_data_at_cell_level(df_zensus):
def fill_missing_hh_in_populated_cells(df_census_households_grid):
"""
Parameters
----------
df_census_households_grid: pd.DataFrame
cleaned zensus household type x age category data
Returns
-------
pd.DataFrame
zensus household data at 100x100m grid level"""

df_w_hh = df_census_households_grid.dropna().reset_index(drop=True)
df_wo_hh = df_census_households_grid.loc[
df_census_households_grid.isna().any(axis=1)
].reset_index(drop=True)

# iterate over unique population values
for i in df_wo_hh["population"].sort_values().unique():

# create fallback if no cell with specific population available
if i in df_w_hh["population"].unique():
last_i = i
# use fallback of last possible household distribution
else:
# print(i)
i = last_i

# get cells with specific population value from cells with household distribution
df_w_hh_population_i = df_w_hh.loc[df_w_hh["population"] == i]
# choose random cell within this group
rnd_cell_id_population_i = np.random.choice(
df_w_hh_population_i["cell_id"].unique()
)
# get household distribution of this cell
df_rand_hh_distribution = df_w_hh_population_i.loc[
df_w_hh_population_i["cell_id"] == rnd_cell_id_population_i
]
# get cells with specific population value from cells without household distribution
df_wo_hh_population_i = df_wo_hh.loc[df_wo_hh["population"] == i]

# all cells will get the same random household distribution

# prepare size of dataframe by number of household types
df_repeated = pd.concat(
[df_wo_hh_population_i] * df_rand_hh_distribution.shape[0],
ignore_index=True,
)
df_repeated = df_repeated.sort_values("cell_id").reset_index(drop=True)

# insert random household distribution
columns = ["characteristics_code", "hh_5types"]
df_repeated.loc[:, columns] = pd.concat(
[df_rand_hh_distribution.loc[:, columns]]
* df_wo_hh_population_i.shape[0]
).values
# append new cells
df_w_hh = df_w_hh.append(df_repeated, ignore_index=True)

return df_w_hh


def get_census_households_grid():
""""""

# Retrieve information about households for each census cell
# Only use cell-data which quality (quantity_q<2) is acceptable
df_census_households_grid = db.select_dataframe(
sql="""
SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM' AND quantity_q <2"""
)
df_census_households_grid = df_census_households_grid.drop(
columns=["attribute", "characteristics_text"]
)

# Missing data is detected
df_missing_data = db.select_dataframe(
sql="""
SELECT count(joined.quantity_gesamt) as amount, joined.quantity_gesamt as households
FROM(
SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam,
(quantity_gesamt-(case when quantity_sum_fam isnull then 0 else quantity_sum_fam end))
as insgesamt_minus_fam
FROM (
SELECT grid_id, SUM(quantity) as quantity_sum_fam
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM'
GROUP BY grid_id) as t1
Full JOIN (
SELECT grid_id, sum(quantity) as quantity_gesamt
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'INSGESAMT'
GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
) as joined
WHERE quantity_sum_fam isnull
Group by quantity_gesamt """
)
missing_cells = db.select_dataframe(
sql="""
SELECT t12.grid_id, t12.quantity
FROM (
SELECT t2.grid_id, (case when quantity_sum_fam isnull then quantity_gesamt end) as quantity
FROM (
SELECT grid_id, SUM(quantity) as quantity_sum_fam
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM'
GROUP BY grid_id) as t1
Full JOIN (
SELECT grid_id, sum(quantity) as quantity_gesamt
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'INSGESAMT'
GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
) as t12
WHERE quantity is not null"""
)

# Missing cells are substituted by average share of cells with same amount
# of households.
df_average_split = create_missing_zensus_data(
df_census_households_grid, df_missing_data, missing_cells
)

df_census_households_grid = df_census_households_grid.rename(
columns={"quantity": "hh_5types"}
)

df_census_households_grid = pd.concat(
[df_census_households_grid, df_average_split], ignore_index=True
)

# Census cells with nuts3 and nuts1 information
df_grid_id = db.select_dataframe(
sql="""
SELECT pop.grid_id, pop.id as cell_id, pop.population, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen
FROM society.destatis_zensus_population_per_ha_inside_germany as pop
LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250
ON (pop.id=vg250.zensus_population_id)
LEFT JOIN boundaries.vg250_lan as lan
ON (LEFT(vg250.vg250_nuts3, 3) = lan.nuts)
WHERE lan.gf = 4 """
)
df_grid_id = df_grid_id.drop_duplicates()
df_grid_id = df_grid_id.reset_index(drop=True)

# Merge household type and size data with considered (populated) census
# cells how='right' is used as ids of unpopulated areas are removed
# by df_grid_id or ancestors. See here:
# https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449
df_census_households_grid = pd.merge(
df_census_households_grid,
df_grid_id,
left_on="grid_id",
right_on="grid_id",
how="right",
)

# fill cells with missing household distribution data but population
# by distribution of random cell with same population value
df_census_households_grid = fill_missing_hh_in_populated_cells(
df_census_households_grid
)

return df_census_households_grid


def refine_census_data_at_cell_level(
df_census_households_nuts1, df_census_households_grid
):
"""The zensus data is processed to define the number and type of households
per zensus cell. Two subsets of the zensus data are merged to fit the
IEE profiles specifications. For this, the dataset 'HHTYP_FAM' is
Expand All @@ -779,8 +949,10 @@ def refine_census_data_at_cell_level(df_zensus):
Parameters
----------
df_zensus: pd.DataFrame
df_census_households_nuts1: pd.DataFrame
Aggregated zensus household data on NUTS-1 level
df_census_households_grid: pd.DataFrame
Aggregated zensus household data on 100x100m grid level
Returns
-------
Expand All @@ -790,7 +962,7 @@ def refine_census_data_at_cell_level(df_zensus):

# :func:`get_hh_dist` without eurostat adjustment for O1-03 Groups in
# absolute values
df_hh_types_nad_abs = get_hh_dist(df_zensus, HH_TYPES)
df_hh_types_nad_abs = get_hh_dist(df_census_households_nuts1, HH_TYPES)

# Get household size for each census cell grouped by
# As this is only used to estimate size of households for OR, OO
Expand Down Expand Up @@ -835,103 +1007,10 @@ def refine_census_data_at_cell_level(df_zensus):
df_dist_households.loc[value].sum()
)

# Retrieve information about households for each census cell
# Only use cell-data which quality (quantity_q<2) is acceptable
df_households_typ = db.select_dataframe(
sql="""
SELECT grid_id, attribute, characteristics_code, characteristics_text, quantity
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM' AND quantity_q <2"""
)
df_households_typ = df_households_typ.drop(
columns=["attribute", "characteristics_text"]
)

# Missing data is detected
df_missing_data = db.select_dataframe(
sql="""
SELECT count(joined.quantity_gesamt) as amount, joined.quantity_gesamt as households
FROM(
SELECT t2.grid_id, quantity_gesamt, quantity_sum_fam,
(quantity_gesamt-(case when quantity_sum_fam isnull then 0 else quantity_sum_fam end))
as insgesamt_minus_fam
FROM (
SELECT grid_id, SUM(quantity) as quantity_sum_fam
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM'
GROUP BY grid_id) as t1
Full JOIN (
SELECT grid_id, sum(quantity) as quantity_gesamt
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'INSGESAMT'
GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
) as joined
WHERE quantity_sum_fam isnull
Group by quantity_gesamt """
)
missing_cells = db.select_dataframe(
sql="""
SELECT t12.grid_id, t12.quantity
FROM (
SELECT t2.grid_id, (case when quantity_sum_fam isnull then quantity_gesamt end) as quantity
FROM (
SELECT grid_id, SUM(quantity) as quantity_sum_fam
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'HHTYP_FAM'
GROUP BY grid_id) as t1
Full JOIN (
SELECT grid_id, sum(quantity) as quantity_gesamt
FROM society.egon_destatis_zensus_household_per_ha
WHERE attribute = 'INSGESAMT'
GROUP BY grid_id) as t2 ON t1.grid_id = t2.grid_id
) as t12
WHERE quantity is not null"""
)

# Missing cells are substituted by average share of cells with same amount
# of households.
df_average_split = create_missing_zensus_data(
df_households_typ, df_missing_data, missing_cells
)

df_households_typ = df_households_typ.rename(
columns={"quantity": "hh_5types"}
)

df_households_typ = pd.concat(
[df_households_typ, df_average_split], ignore_index=True
)

# Census cells with nuts3 and nuts1 information
df_grid_id = db.select_dataframe(
sql="""
SELECT pop.grid_id, pop.id as cell_id, vg250.vg250_nuts3 as nuts3, lan.nuts as nuts1, lan.gen
FROM society.destatis_zensus_population_per_ha_inside_germany as pop
LEFT JOIN boundaries.egon_map_zensus_vg250 as vg250
ON (pop.id=vg250.zensus_population_id)
LEFT JOIN boundaries.vg250_lan as lan
ON (LEFT(vg250.vg250_nuts3, 3)=lan.nuts)
WHERE lan.gf = 4 """
)
df_grid_id = df_grid_id.drop_duplicates()
df_grid_id = df_grid_id.reset_index(drop=True)

# Merge household type and size data with considered (populated) census
# cells how='inner' is used as ids of unpopulated areas are removed
# df_grid_id or earliers tables. See here:
# https://github.com/openego/eGon-data/blob/59195926e41c8bd6d1ca8426957b97f33ef27bcc/src/egon/data/importing/zensus/__init__.py#L418-L449
df_households_typ = pd.merge(
df_households_typ,
df_grid_id,
left_on="grid_id",
right_on="grid_id",
how="inner",
)

# Merge Zensus nuts1 level household data with zensus cell level 100 x 100 m
# by refining hh-groups with MAPPING_ZENSUS_HH_SUBGROUPS
df_zensus_cells = pd.DataFrame()
for (country, code), df_country_type in df_households_typ.groupby(
df_census_households_grid_refined = pd.DataFrame()
for (country, code), df_country_type in df_census_households_grid.groupby(
["gen", "characteristics_code"]
):

Expand All @@ -943,15 +1022,19 @@ def refine_census_data_at_cell_level(df_zensus):
df_country_type["hh_5types"]
* df_dist_households.loc[typ, country]
)
df_zensus_cells = df_zensus_cells.append(
df_country_type, ignore_index=True
df_census_households_grid_refined = (
df_census_households_grid_refined.append(
df_country_type, ignore_index=True
)
)

df_zensus_cells = df_zensus_cells.sort_values(
by=["grid_id", "characteristics_code"]
).reset_index(drop=True)
df_census_households_grid_refined = (
df_census_households_grid_refined.sort_values(
by=["grid_id", "characteristics_code"]
).reset_index(drop=True)
)

return df_zensus_cells
return df_census_households_grid_refined


def get_cell_demand_profile_ids(df_cell, pool_size):
Expand Down Expand Up @@ -1230,23 +1313,26 @@ def gen_profile_names(n):
# Process profiles for further use
df_iee_profiles = set_multiindex_to_profiles(df_iee_profiles)

# Download zensus household data with family type and age categories
df_census_households_raw = get_zensus_households_raw()
# Download zensus household NUTS-1 data with family type and age categories
df_census_households_nuts1_raw = get_census_households_nuts1_raw()

# Restructure data to be compatible with categories from demand profile
# generator. Reduce age intervals and aggregate data to NUTS-1 level.
df_census_households_nuts1 = process_nuts1_census_data(
df_census_households_raw
df_census_households_nuts1_raw
)

# Refine census cell data with additional nuts1 level attributes
df_census_households_cells = refine_census_data_at_cell_level(
df_census_households_nuts1
# Query census household grid data with family type
df_census_households_grid = get_census_households_grid()

# Refine census household grid data with additional NUTS-1 level attributes
df_census_households_grid_refined = refine_census_data_at_cell_level(
df_census_households_nuts1, df_census_households_grid
)

# Allocate profile ids to each cell by census data
df_hh_profiles_in_census_cells = allocate_hh_demand_profiles_to_cells(
df_census_households_cells, df_iee_profiles
df_census_households_grid_refined, df_iee_profiles
)

# Annual household electricity demand on NUTS-3 level (demand regio)
Expand Down

0 comments on commit e2f4d97

Please sign in to comment.