[FEAT] Probabilistic Forecasting Util Functions #195

Merged: 4 commits, Jun 6, 2023
Changes from all commits
8 changes: 7 additions & 1 deletion hierarchicalforecast/_modidx.py
@@ -181,5 +181,11 @@
'hierarchicalforecast.utils.cov2corr': ('utils.html#cov2corr', 'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.is_strictly_hierarchical': ( 'utils.html#is_strictly_hierarchical',
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.level_to_outputs': ( 'utils.html#level_to_outputs',
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.numpy_balance': ( 'utils.html#numpy_balance',
'hierarchicalforecast/utils.py')}}}
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.quantiles_to_outputs': ( 'utils.html#quantiles_to_outputs',
'hierarchicalforecast/utils.py'),
'hierarchicalforecast.utils.samples_to_quantiles_df': ( 'utils.html#samples_to_quantiles_df',
'hierarchicalforecast/utils.py')}}}
108 changes: 103 additions & 5 deletions hierarchicalforecast/utils.py
@@ -7,7 +7,7 @@
import sys
import timeit
from itertools import chain
from typing import Callable, Dict, List, Optional
from typing import Callable, Dict, List, Optional, Iterable

import matplotlib.pyplot as plt
import numpy as np
@@ -68,7 +68,105 @@ def cov2corr(cov, return_std=False):
    else:
        return corr

# %% ../nbs/utils.ipynb 7
# convert levels to output quantile names
def level_to_outputs(level:Iterable[int]):
    """ Converts a list of levels into output names matching StatsForecast and NeuralForecast methods.

    **Parameters:**<br>
    `level`: int list [0,100]. Probability levels for prediction intervals.<br>

    **Returns:**<br>
    `quantiles`: float list [0., 1.]. Quantiles corresponding to each output name.<br>
    `output_names`: str list. String list with output column names.
    """
    qs = sum([[50-l/2, 50+l/2] for l in level], [])
    output_names = sum([[f'-lo-{l}', f'-hi-{l}'] for l in level], [])

    sort_idx = np.argsort(qs)
    quantiles = np.array(qs)[sort_idx]

    # Add default median
    quantiles = np.concatenate([np.array([50]), quantiles]) / 100
    output_names = list(np.array(output_names)[sort_idx])
    output_names.insert(0, '-median')

    return quantiles, output_names

# convert quantiles to output quantile names
def quantiles_to_outputs(quantiles:Iterable[float]):
    """Converts a list of quantiles into output names matching StatsForecast and NeuralForecast methods.

    **Parameters:**<br>
    `quantiles`: float list [0., 1.]. Alternative to level, quantiles to estimate from y distribution.<br>

    **Returns:**<br>
    `quantiles`: float list [0., 1.]. The input quantiles, passed through unchanged.<br>
    `output_names`: str list. String list with output column names.
    """
    output_names = []
    for q in quantiles:
        if q<.50:
            output_names.append(f'-lo-{np.round(100-200*q,2)}')
        elif q>.50:
            output_names.append(f'-hi-{np.round(100-200*(1-q),2)}')
        else:
            output_names.append('-median')
    return quantiles, output_names
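
# A quick usage sketch (editor's illustration, not part of this diff; the
# expected values mirror the unit tests added to nbs/utils.ipynb further down):
#
# >>> level_to_outputs([80, 90])
# (array([0.5 , 0.05, 0.1 , 0.9 , 0.95]), ['-median', '-lo-90', '-lo-80', '-hi-80', '-hi-90'])
# >>> quantiles_to_outputs([0.2, 0.5, 0.8])
# ([0.2, 0.5, 0.8], ['-lo-60.0', '-median', '-hi-60.0'])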

# %% ../nbs/utils.ipynb 8
# given an input array of sample forecasts and input quantiles/levels,
# output a pandas DataFrame with columns of quantile predictions
def samples_to_quantiles_df(samples:np.ndarray,
                            unique_ids:Iterable[str],
                            dates:Iterable,
                            quantiles:Optional[Iterable[float]] = None,
                            level:Optional[Iterable[int]] = None,
                            model_name:Optional[str] = "model"):
    """ Transform Samples into HierarchicalForecast input.
    Auxiliary function to create a compatible HierarchicalForecast input `Y_hat_df` dataframe.

    **Parameters:**<br>
    `samples`: numpy array. Samples from forecast distribution of shape [n_series, n_samples, horizon].<br>
    `unique_ids`: string list. Unique identifiers for each time series.<br>
    `dates`: datetime list. List of forecast dates.<br>
    `quantiles`: float list in [0., 1.]. Alternative to level, quantiles to estimate from y distribution.<br>
    `level`: int list in [0,100]. Probability levels for prediction intervals.<br>
    `model_name`: string. Name of forecasting model.<br>

    **Returns:**<br>
    `quantiles`: float list in [0., 1.]. Quantiles estimated from the y distribution.<br>
    `Y_hat_df`: pd.DataFrame. Base quantile forecasts with columns `ds` and model columns to reconcile, indexed by `unique_id`.
    """

    # Get the shape of the array
    n_series, n_samples, horizon = samples.shape

    assert n_series == len(unique_ids)
    assert horizon == len(dates)
    assert (quantiles is not None) ^ (level is not None)  # check exactly one of quantiles/level was provided

    # create initial DataFrame with the mean forecasts
    forecasts_mean = np.mean(samples, axis=1).flatten()
    unique_ids = np.repeat(unique_ids, horizon)
    ds = np.tile(dates, n_series)
    data = pd.DataFrame({"unique_id":unique_ids, "ds":ds, model_name:forecasts_mean})

    # create quantiles and quantile names
    quantiles, quantile_names = level_to_outputs(level) if level is not None else quantiles_to_outputs(quantiles)
    percentiles = [quantile * 100 for quantile in quantiles]
    col_names = np.array([model_name + quantile_name for quantile_name in quantile_names])

    # add quantiles to the dataframe
    forecasts_quantiles = np.percentile(samples, percentiles, axis=1)

    forecasts_quantiles = np.transpose(forecasts_quantiles, (1,2,0))  # [Q,H,N] -> [N,H,Q]
    forecasts_quantiles = forecasts_quantiles.reshape(-1,len(quantiles))

    df = pd.DataFrame(data=forecasts_quantiles,
                      columns=col_names)

    return quantiles, pd.concat([data,df], axis=1).set_index('unique_id')
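
# A usage sketch (editor's illustration, not part of this diff; the shapes and
# resulting columns mirror the unit tests added to nbs/utils.ipynb further down):
#
# dates = pd.date_range("2023-06-01", periods=10, freq="D")   # horizon = 10
# samples = np.random.rand(3, 200, 10)                        # [n_series, n_samples, horizon]
# quantiles, Y_hat_df = samples_to_quantiles_df(
#     samples, unique_ids=['id1', 'id2', 'id3'], dates=dates, level=[10, 50, 90])
# # Y_hat_df is indexed by unique_id, with columns
# # ['ds', 'model', 'model-median', 'model-lo-90', 'model-lo-50', 'model-lo-10',
# #  'model-hi-10', 'model-hi-50', 'model-hi-90']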

# %% ../nbs/utils.ipynb 11
def _to_summing_matrix(S_df: pd.DataFrame):
    """Transforms the DataFrame `df` of hierarchies to a summing matrix S."""
    categories = [S_df[col].unique() for col in S_df.columns]
@@ -81,14 +179,14 @@ def _to_summing_matrix(S_df: pd.DataFrame):
    tags = dict(zip(S_df.columns, categories))
    return S, tags

# %% ../nbs/utils.ipynb 9
# %% ../nbs/utils.ipynb 12
def aggregate_before(df: pd.DataFrame,
                     spec: List[List[str]],
                     agg_fn: Callable = np.sum):
    """Utils Aggregation Function.

    Aggregates bottom level series contained in the pd.DataFrame `df` according
    to levels defined in the `spec` list applying the `agg_fn` (sum, mean).
    to levels defined in the `spec` list applying the `agg_fn` (sum, mean).<br>

    **Parameters:**<br>
    `df`: pd.DataFrame with columns `['ds', 'y']` and columns to aggregate.<br>
@@ -123,7 +221,7 @@ def aggregate_before(df: pd.DataFrame,
    S, tags = _to_summing_matrix(S_df.loc[bottom_hier, hiers_cols])
    return Y_df, S, tags

# %% ../nbs/utils.ipynb 10
# %% ../nbs/utils.ipynb 13
def numpy_balance(*arrs):
    """
    Fast NumPy implementation of balance function.
@@ -248,7 +346,7 @@ def aggregate(df: pd.DataFrame,
    Y_df = Y_df.set_index('unique_id')
    return Y_df, S_df, tags

# %% ../nbs/utils.ipynb 16
# %% ../nbs/utils.ipynb 21
class HierarchicalPlot:
    """ Hierarchical Plot

204 changes: 202 additions & 2 deletions nbs/utils.ipynb
@@ -34,7 +34,7 @@
"import sys\n",
"import timeit\n",
"from itertools import chain\n",
"from typing import Callable, Dict, List, Optional\n",
"from typing import Callable, Dict, List, Optional, Iterable\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
@@ -132,6 +132,132 @@
" return corr"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0665290c",
"metadata": {},
"outputs": [],
"source": [
"#| exporti\n",
"\n",
"# convert levels to output quantile names\n",
"def level_to_outputs(level:Iterable[int]):\n",
" \"\"\" Converts list of levels into output names matching StatsForecast and NeuralForecast methods.\n",
"\n",
" **Parameters:**<br>\n",
" `level`: int list [0,100]. Probability levels for prediction intervals.<br>\n",
"\n",
" **Returns:**<br>\n",
" `output_names`: str list. String list with output column names.\n",
" \"\"\"\n",
" qs = sum([[50-l/2, 50+l/2] for l in level], [])\n",
" output_names = sum([[f'-lo-{l}', f'-hi-{l}'] for l in level], [])\n",
"\n",
" sort_idx = np.argsort(qs)\n",
" quantiles = np.array(qs)[sort_idx]\n",
"\n",
" # Add default median\n",
" quantiles = np.concatenate([np.array([50]), quantiles]) / 100\n",
" output_names = list(np.array(output_names)[sort_idx])\n",
" output_names.insert(0, '-median')\n",
" \n",
" return quantiles, output_names\n",
"\n",
"# convert quantiles to output quantile names\n",
"def quantiles_to_outputs(quantiles:Iterable[float]):\n",
" \"\"\"Converts list of quantiles into output names matching StatsForecast and NeuralForecast methods.\n",
"\n",
" **Parameters:**<br>\n",
" `quantiles`: float list [0., 1.]. Alternative to level, quantiles to estimate from y distribution.<br>\n",
"\n",
" **Returns:**<br>\n",
" `output_names`: str list. String list with output column names.\n",
" \"\"\"\n",
" output_names = []\n",
" for q in quantiles:\n",
" if q<.50:\n",
" output_names.append(f'-lo-{np.round(100-200*q,2)}')\n",
" elif q>.50:\n",
" output_names.append(f'-hi-{np.round(100-200*(1-q),2)}')\n",
" else:\n",
" output_names.append('-median')\n",
" return quantiles, output_names"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4ffbe55",
"metadata": {},
"outputs": [],
"source": [
"#| exporti\n",
"\n",
"# given input array of sample forecasts and inptut quantiles/levels, \n",
"# output a Pandas Dataframe with columns of quantile predictions\n",
"def samples_to_quantiles_df(samples:np.ndarray, \n",
" unique_ids:Iterable[str], \n",
" dates:Iterable, \n",
" quantiles:Optional[Iterable[float]] = None,\n",
" level:Optional[Iterable[int]] = None, \n",
" model_name:Optional[str] = \"model\"):\n",
" \"\"\" Transform Samples into HierarchicalForecast input.\n",
" Auxiliary function to create compatible HierarchicalForecast input Y_hat_df dataframe.\n",
"\n",
" **Parameters:**<br>\n",
" `samples`: numpy array. Samples from forecast distribution of shape [n_series, n_samples, horizon].<br>\n",
" `unique_ids`: string list. Unique identifiers for each time series.<br>\n",
" `dates`: datetime list. List of forecast dates.<br>\n",
" `quantiles`: float list in [0., 1.]. Alternative to level, quantiles to estimate from y distribution.<br>\n",
" `level`: int list in [0,100]. Probability levels for prediction intervals.<br>\n",
" `model_name`: string. Name of forecasting model.<br>\n",
"\n",
" **Returns:**<br>\n",
" `quantiles`: float list in [0., 1.]. quantiles to estimate from y distribution .<br>\n",
" `Y_hat_df`: pd.DataFrame. With base quantile forecasts with columns ds and models to reconcile indexed by unique_id.\n",
" \"\"\"\n",
" \n",
" # Get the shape of the array\n",
" n_series, n_samples, horizon = samples.shape\n",
"\n",
" assert n_series == len(unique_ids)\n",
" assert horizon == len(dates)\n",
" assert (quantiles is not None) ^ (level is not None) #check exactly one of quantiles/levels has been input\n",
"\n",
" #create initial dictionary\n",
" forecasts_mean = np.mean(samples, axis=1).flatten()\n",
" unique_ids = np.repeat(unique_ids, horizon)\n",
" ds = np.tile(dates, n_series)\n",
" data = pd.DataFrame({\"unique_id\":unique_ids, \"ds\":ds, model_name:forecasts_mean})\n",
"\n",
" #create quantiles and quantile names\n",
" quantiles, quantile_names = level_to_outputs(level) if level is not None else quantiles_to_outputs(quantiles)\n",
" percentiles = [quantile * 100 for quantile in quantiles]\n",
" col_names = np.array([model_name + quantile_name for quantile_name in quantile_names])\n",
" \n",
" #add quantiles to dataframe\n",
" forecasts_quantiles = np.percentile(samples, percentiles, axis=1)\n",
"\n",
" forecasts_quantiles = np.transpose(forecasts_quantiles, (1,2,0)) # [Q,H,N] -> [N,H,Q]\n",
" forecasts_quantiles = forecasts_quantiles.reshape(-1,len(quantiles))\n",
"\n",
" df = pd.DataFrame(data=forecasts_quantiles, \n",
" columns=col_names)\n",
" \n",
" return quantiles, pd.concat([data,df], axis=1).set_index('unique_id')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "940693d0",
"metadata": {},
"outputs": [],
"source": [
"show_doc(samples_to_quantiles_df, title_level=3)"
]
},
{
"cell_type": "markdown",
"id": "3a1f4267",
@@ -175,7 +301,7 @@
" \"\"\"Utils Aggregation Function.\n",
"\n",
" Aggregates bottom level series contained in the pd.DataFrame `df` according \n",
" to levels defined in the `spec` list applying the `agg_fn` (sum, mean).\n",
" to levels defined in the `spec` list applying the `agg_fn` (sum, mean).<br>\n",
"\n",
" **Parameters:**<br>\n",
" `df`: pd.DataFrame with columns `['ds', 'y']` and columns to aggregate.<br>\n",
@@ -354,6 +480,80 @@
"show_doc(aggregate, title_level=3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d15e9834",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"\n",
"#level_to_outputs unit tests\n",
"test_eq(\n",
" level_to_outputs([80, 90]),\n",
" ([0.5 , 0.05, 0.1 , 0.9 , 0.95], ['-median', '-lo-90', '-lo-80', '-hi-80', '-hi-90'])\n",
")\n",
"test_eq(\n",
" level_to_outputs([30]),\n",
" ([0.5 , 0.35, 0.65], ['-median', '-lo-30', '-hi-30'])\n",
")\n",
"#quantiles_to_outputs unit tests\n",
"test_eq(\n",
" quantiles_to_outputs([0.2, 0.4, 0.6, 0.8]),\n",
" ([0.2,0.4,0.6, 0.8], ['-lo-60.0', '-lo-20.0', '-hi-20.0', '-hi-60.0'])\n",
")\n",
"test_eq(\n",
" quantiles_to_outputs([0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]),\n",
" ([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], \n",
" ['-lo-80.0', '-lo-60.0', '-lo-40.0', '-lo-20.0', '-median', '-hi-20.0', '-hi-40.0', '-hi-60.0', '-hi-80.0'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3c833f6a",
"metadata": {},
"outputs": [],
"source": [
"#| hide\n",
"\n",
"#samples_to_quantiles_df unit tests\n",
"start_date = pd.Timestamp(\"2023-06-01\")\n",
"end_date = pd.Timestamp(\"2023-06-10\")\n",
"frequency = \"D\" # Daily frequency\n",
"dates = pd.date_range(start=start_date, end=end_date, freq=frequency).tolist()\n",
"samples = np.random.rand(3, 200, 10)\n",
"unique_ids = ['id1', 'id2', 'id3']\n",
"level = np.array([10, 50, 90])\n",
"quantiles=np.array([0.5, 0.05, 0.25, 0.45, 0.55, 0.75, 0.95])\n",
"\n",
"ret_quantiles_1, ret_df_1 = samples_to_quantiles_df(samples, unique_ids, dates, level=level)\n",
"ret_quantiles_2, ret_df_2 = samples_to_quantiles_df(samples, unique_ids, dates, quantiles=quantiles)\n",
"\n",
"test_eq(\n",
" ret_quantiles_1,\n",
" quantiles\n",
")\n",
"test_eq(\n",
" ret_df_1.columns,\n",
" ['ds', 'model', 'model-median', 'model-lo-90', 'model-lo-50', 'model-lo-10', 'model-hi-10', 'model-hi-50', 'model-hi-90']\n",
")\n",
"test_eq(\n",
" ret_df_1.index,\n",
" ['id1', 'id1', 'id1', 'id1', 'id1', 'id1', 'id1', 'id1', 'id1', 'id1',\n",
" 'id2', 'id2', 'id2', 'id2', 'id2', 'id2', 'id2', 'id2', 'id2', 'id2',\n",
" 'id3', 'id3', 'id3', 'id3', 'id3', 'id3', 'id3', 'id3', 'id3', 'id3']\n",
")\n",
"test_eq(\n",
" ret_quantiles_1, ret_quantiles_2\n",
")\n",
"test_eq(\n",
" ret_df_1.index, ret_df_2.index\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,