From 5e50468fc8ed51c024d05b6bcca1053c002a84c0 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 16:03:50 +0200 Subject: [PATCH 01/13] Create Speed tests DataFrame creation.ipynb --- dev/Speed tests DataFrame creation.ipynb | 335 +++++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 dev/Speed tests DataFrame creation.ipynb diff --git a/dev/Speed tests DataFrame creation.ipynb b/dev/Speed tests DataFrame creation.ipynb new file mode 100644 index 00000000..62ae3277 --- /dev/null +++ b/dev/Speed tests DataFrame creation.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 11, + "id": "5043e9fd-0df3-494f-8104-feea29aa1264", + "metadata": {}, + "outputs": [], + "source": [ + "import wurst\n", + "import bw2data as bd\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0722faf1-a608-4031-b8c0-adff2558d8da", + "metadata": {}, + "outputs": [], + "source": [ + "bd.projects.set_current(\"ei 3.8 cutoff\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "d4ffb5e5-09f5-4b13-b0ba-d5ba59dd4012", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Getting activity data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████████████████████████████████████| 19565/19565 [00:00<00:00, 122014.46it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Adding exchange data to activities\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|█████████████████████████████████████████| 629959/629959 [00:46<00:00, 13465.05it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filling out exchange data\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|████████████████████████████████████████████| 19565/19565 [00:02<00:00, 6933.41it/s]\n" + ] + } + ], + "source": [ + "data = wurst.extract_brightway2_databases([\"ecoinvent 3.8 cutoff\"], add_identifiers=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "63016466-d913-4648-bed4-84fcc41301ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'classifications': [('ISIC rev.4 ecoinvent',\n", + " '3510:Electric power generation, transmission and distribution'),\n", + " ('CPC', '17100: Electrical energy')],\n", + " 'comment': 'This dataset changes the names of the (internally used) electricity product of waste incineration and connects it with the external grid and the respective average energy markets (grid electricity).\\nTime period: The Annual Production volume is valid for the year 2012.',\n", + " 'location': 'GR',\n", + " 'database': 'ecoinvent 3.8 cutoff',\n", + " 'code': '00014e7e2dd160027166b7274d58b7cc',\n", + " 'name': 'electricity, from municipal waste incineration to generic market for electricity, medium voltage',\n", + " 'reference product': 'electricity, medium voltage',\n", + " 'unit': 'kilowatt hour',\n", + " 'exchanges': [{'uncertainty type': 0,\n", + " 'loc': 1.0,\n", + " 'amount': 1.0,\n", + " 'type': 'production',\n", + " 'production volume': 60000000.0,\n", + " 'product': 'electricity, medium voltage',\n", + " 'name': 'electricity, from municipal waste incineration to generic market for electricity, medium voltage',\n", + " 'unit': 'kilowatt hour',\n", + " 'location': 'GR',\n", + " 'database': 'ecoinvent 3.8 cutoff',\n", + " 'id': 23785,\n", + " 'code': '00014e7e2dd160027166b7274d58b7cc'},\n", + " {'uncertainty type': 2,\n", + " 'loc': 0.0,\n", + " 'scale': 0.044721359549995794,\n", + " 'amount': 1.0,\n", + " 'pedigree': {'reliability': 1,\n", + " 'completeness': 1,\n", + " 'temporal correlation': 3,\n", + " 'geographical correlation': 1,\n", + " 'further technological correlation': 1},\n", + " 'type': 'technosphere',\n", + " 'production volume': 0.0,\n", + " 'product': 'electricity, for reuse in municipal waste incineration only',\n", + " 'name': 'market for electricity, for reuse in municipal waste incineration only',\n", + " 'unit': 'kilowatt hour',\n", + " 'location': 'RoW',\n", + " 'database': 'ecoinvent 3.8 cutoff',\n", + " 'id': 18374,\n", + " 'code': '358b14803d0148cddc4196e2c2454000'}],\n", + " 'parameters': {},\n", + " 'parameters full': [],\n", + " 'id': 23785}" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d23e2644-1dc5-45dd-aa68-9021456dcf18", + "metadata": {}, + "outputs": [], + "source": [ + "def as_naive_df(data, categorical=True):\n", + " result = []\n", + " \n", + " for target in data:\n", + " for edge in target['exchanges']:\n", + " result.append({\n", + " \"target_id\": target['id'],\n", + " \"target_database\": target['database'],\n", + " \"target_code\": target['code'],\n", + " \"target_activity\": target.get('name'),\n", + " \"target_reference_product\": target.get('reference product'),\n", + " \"target_location\": target.get('location'),\n", + " \"target_unit\": target.get('unit'),\n", + " \"target_type\": target.get('type', 'process'),\n", + " \"source_id\": edge['id'],\n", + " \"source_database\": edge['database'],\n", + " \"source_code\": edge['code'],\n", + " \"source_activity\": edge.get('name'),\n", + " \"source_product\": edge.get('product'),\n", + " \"source_location\": edge.get('location'),\n", + " \"source_unit\": edge.get('unit'),\n", + " \"source_categories\": \"::\".join(edge.get('categories', ('',))),\n", + " \"edge_amount\": edge['amount'],\n", + " \"edge_type\": edge['type'],\n", + " })\n", + " \n", + " df = pd.DataFrame(result)\n", + " \n", + " if categorical:\n", + " categorical_columns = [\n", + " \"target_database\",\n", + " \"target_activity\",\n", + " \"target_reference_product\",\n", + " \"target_location\",\n", + " \"target_unit\",\n", + " \"target_type\",\n", + " \"source_database\",\n", + " \"source_code\",\n", + " \"source_activity\",\n", + " \"source_product\",\n", + " \"source_location\",\n", + " \"source_unit\",\n", + " \"source_categories\",\n", + " ]\n", + " for column in categorical_columns:\n", + " df[column] = df[column].astype(\"category\") \n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "4f4906a9-db5f-42aa-bd26-d6140f97fe48", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "5.85 s ± 79.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit as_naive_df(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "8d17c3cc-c709-456b-aaf3-3232f10eacfc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "3.71 s ± 69.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%timeit as_naive_df(data, False)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e6f04ac6-c34f-4362-86ed-9f35b827b1da", + "metadata": {}, + "outputs": [], + "source": [ + "df_compressed = as_naive_df(data)\n", + "df_full = as_naive_df(data, False)" + ] + }, + { + "cell_type": "markdown", + "id": "d3e79afd-0851-4512-ae01-7dd367cb99ca", + "metadata": {}, + "source": [ + "Memory in MB" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "6e740f9a-4848-4b95-982d-5b0fc43b32f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(37.66990280151367, 86.51182556152344)" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_compressed.memory_usage().sum() / 1024 ** 2, df_full.memory_usage().sum() / 1024 ** 2" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6ab7ca99-3e55-44c3-a7f1-9e4bfb1aaba6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "target_id int64\n", + "target_database category\n", + "target_code object\n", + "target_activity category\n", + "target_reference_product category\n", + "target_location category\n", + "target_unit category\n", + "target_type category\n", + "source_id int64\n", + "source_database category\n", + "source_code category\n", + "source_activity category\n", + "source_product category\n", + "source_location category\n", + "source_unit category\n", + "source_categories category\n", + "edge_amount float64\n", + "edge_type object\n", + "dtype: object" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_compressed.dtypes" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 602bed1ee3061fc60b452e144d092199b65611f8 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 16:31:59 +0200 Subject: [PATCH 02/13] Add draft to_dataframe function --- bw2data/backends/base.py | 99 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index 63497da1..5f348660 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -13,6 +13,7 @@ from bw_processing import clean_datapackage_name, create_datapackage from fs.zipfs import ZipFS from peewee import DoesNotExist, fn +from tqdm import tqdm from .. import config, databases, geomapping from ..data_store import ProcessedDataStore @@ -886,3 +887,101 @@ def get_uniqueness_key(exchange, fields): for exc in lst[-1:0:-1]: print("Deleting exchange:", exc) exc.delete() + + def to_dataframe(self, categorical=True, formatters=None): + """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are: + + target_id: int, + target_database: str, + target_code: str, + target_activity: Optional[str], + target_reference_product: Optional[str], + target_location: Optional[str], + target_unit: Optional[str], + target_type: Optional[str] + source_id: int, + source_database: str, + source_code: str, + source_activity: Optional[str], + source_product: Optional[str], # Note different label + source_location: Optional[str], + source_unit: Optional[str], + source_type: Optional[str] + source_categories: Optional[str] # Tuple concatenated with "::" as in `bw2io` + edge_amount: float, + edge_type: str, + + Target is the node consuming the edge, source is the node or flow being consumed. The terms target and source were chosen because they also work well for biosphere edges. + + Args: + + ``categorical`` will turn each string column in a `pandas Categorical Series `__. This takes 1-2 extra seconds, but saves around 50% of the memory consumption. + + ``formatters`` is a list of callables that modify each row. These functions must take the following keyword arguments, and use the `Wurst internal data format `__: + + * ``node``: The target node, as a dict + * ``edge``: The edge, including attributes of the source node + * ``row``: The current row dict being modified. ``row`` must be returned as well. + + Returns a pandas ``DataFrame``. + + """ + try: + from wurst import extract_brightway2_databases + except ImportError: + raise ImportError("This method requires the `wurst` library.") + + result = [] + + for target in extract_brightway2_databases(self.name, add_identifiers=True): + for edge in target["exchanges"]: + row = { + "target_id": target["id"], + "target_database": target["database"], + "target_code": target["code"], + "target_activity": target.get("name"), + "target_reference_product": target.get("reference product"), + "target_location": target.get("location"), + "target_unit": target.get("unit"), + "target_type": target.get("type", "process"), + "source_id": edge["id"], + "source_database": edge["database"], + "source_code": edge["code"], + "source_activity": edge.get("name"), + "source_product": edge.get("product"), + "source_location": edge.get("location"), + "source_unit": edge.get("unit"), + "source_categories": "::".join(edge.get("categories", ("",))), + "edge_amount": edge["amount"], + "edge_type": edge["type"], + } + if formatters is not None: + for func in formatters: + row = func(node=target, edge=edge, row=row) + result.append(row) + + print("Creating DataFrame") + df = pandas.DataFrame(result) + + if categorical: + categorical_columns = [ + "target_database", + "target_activity", + "target_reference_product", + "target_location", + "target_unit", + "target_type", + "source_database", + "source_code", + "source_activity", + "source_product", + "source_location", + "source_unit", + "source_categories", + ] + print("Compressing DataFrame") + for column in categorical_columns: + if column in df.columns: + df[column] = df[column].astype("category") + + return df From 6aa64739795af2e8c7ddc3226716e400457e1ef2 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 21:06:15 +0200 Subject: [PATCH 03/13] Add wurst as dependency for CI testing --- azure-pipelines.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 85347919..22676b3c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -29,7 +29,7 @@ jobs: - bash: | source activate bw2 - mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip + mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee wurst brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip displayName: Install Anaconda packages - bash: | @@ -90,7 +90,7 @@ jobs: - bash: | source activate bw2 - mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip + mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip displayName: Install Anaconda packages - bash: | @@ -144,7 +144,7 @@ jobs: - bash: | source activate bw2 - mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip + mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip displayName: Install Anaconda packages - bash: | @@ -198,7 +198,7 @@ jobs: - script: | call activate bw2 - conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pywin32 pip + conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pywin32 pip displayName: Install Anaconda packages - script: | From ad0489c6afa67a46c41a30bc9d498c5f92a45ebb Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 21:07:06 +0200 Subject: [PATCH 04/13] Add typing for to_dataframe --- bw2data/backends/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index 5f348660..5c53e4b1 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -7,13 +7,13 @@ import sqlite3 import warnings from collections import defaultdict +from typing import List, Callable, Optional import pandas import pyprind from bw_processing import clean_datapackage_name, create_datapackage from fs.zipfs import ZipFS from peewee import DoesNotExist, fn -from tqdm import tqdm from .. import config, databases, geomapping from ..data_store import ProcessedDataStore @@ -888,7 +888,7 @@ def get_uniqueness_key(exchange, fields): print("Deleting exchange:", exc) exc.delete() - def to_dataframe(self, categorical=True, formatters=None): + def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame: """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are: target_id: int, From 993a91e11caf47ceaa37cb432a15125aff153729 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 21:07:20 +0200 Subject: [PATCH 05/13] Export `edge_type` as well --- bw2data/backends/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index 5c53e4b1..2b1f83d0 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -978,6 +978,7 @@ def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Calla "source_location", "source_unit", "source_categories", + "edge_type", ] print("Compressing DataFrame") for column in categorical_columns: From fd2975b88d9f94053d5fd7fbeb4365e0e8131492 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 21:08:33 +0200 Subject: [PATCH 06/13] Rename to prep for node dataframe --- bw2data/backends/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index 2b1f83d0..f9150052 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -888,7 +888,7 @@ def get_uniqueness_key(exchange, fields): print("Deleting exchange:", exc) exc.delete() - def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame: + def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame: """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are: target_id: int, From 430f50439db08e6f3c8e71783e50444987747345 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Tue, 16 Aug 2022 21:23:58 +0200 Subject: [PATCH 07/13] Add nodes_to_dataframe method --- bw2data/backends/base.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index f9150052..a10b96e8 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -888,6 +888,26 @@ def get_uniqueness_key(exchange, fields): print("Deleting exchange:", exc) exc.delete() + def nodes_to_dataframe(self, columns: Optional[List[str]] = None, return_sorted: bool = True) -> pandas.DataFrame: + """Return a pandas DataFrame with all database nodes. Uses the provided node attributes by default, such as name, unit, location. + + By default, returns a DataFrame sorted by name, reference product, location, and unit. Set ``return_sorted`` to ``False`` to skip sorting. + + Specify ``columns`` to get custom columns. You will need to write your own function to get more customization, there are endless possibilities here. + + Returns a pandas ``DataFrame``. + + """ + if columns is None: + # Feels like magic + df = pandas.DataFrame(self) + else: + df = pandas.DataFrame([{field: obj.get(field) for field in columns} for obj in self]) + if return_sorted: + sort_columns = ['name', 'reference product', 'location', 'unit'] + df = df.sort_values(by=[column for column in sort_columns if column in df.columns]) + return df + def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame: """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are: From 54b85b6b9e73200e7031490f13e8c0a7515d8725 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:04:21 +0200 Subject: [PATCH 08/13] Don't need to return row in DF formatters --- bw2data/backends/base.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index a10b96e8..fe3124c5 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -941,7 +941,9 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List * ``node``: The target node, as a dict * ``edge``: The edge, including attributes of the source node - * ``row``: The current row dict being modified. ``row`` must be returned as well. + * ``row``: The current row dict being modified. + + The functions in ``formatters`` don't need to return anything, they modify ``row`` in place. Returns a pandas ``DataFrame``. @@ -977,7 +979,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List } if formatters is not None: for func in formatters: - row = func(node=target, edge=edge, row=row) + func(node=target, edge=edge, row=row) result.append(row) print("Creating DataFrame") From 79b3abf9de9f14e400b67b61ee788493302d2457 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:04:43 +0200 Subject: [PATCH 09/13] Return `None` if `categories` key not present --- bw2data/backends/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index fe3124c5..25b43975 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -973,7 +973,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List "source_product": edge.get("product"), "source_location": edge.get("location"), "source_unit": edge.get("unit"), - "source_categories": "::".join(edge.get("categories", ("",))), + "source_categories": "::".join(edge["categories"]) if edge.get("categories") else None, "edge_amount": edge["amount"], "edge_type": edge["type"], } From 55820231c9595d7e7bca4108827cdf7060e5e1af Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:04:55 +0200 Subject: [PATCH 10/13] source_type not extracted by wurst --- bw2data/backends/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index 25b43975..df8f631b 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -926,7 +926,6 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List source_product: Optional[str], # Note different label source_location: Optional[str], source_unit: Optional[str], - source_type: Optional[str] source_categories: Optional[str] # Tuple concatenated with "::" as in `bw2io` edge_amount: float, edge_type: str, From 994c67d87e4d07ddad0c08d4939ac47c95dea309 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:05:10 +0200 Subject: [PATCH 11/13] Add edges_to_dataframe tests --- tests/database.py | 119 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) diff --git a/tests/database.py b/tests/database.py index a12d69e8..cb9a3305 100644 --- a/tests/database.py +++ b/tests/database.py @@ -4,9 +4,14 @@ import numpy as np import pandas as pd -from pandas.testing import assert_frame_equal +from pandas.testing import assert_frame_equal, assert_series_equal import pytest +try: + import wurst +except ImportError: + wurst = None + from bw2data import geomapping, get_id, databases, Database, get_activity from bw2data.backends import Activity as PWActivity from bw2data.backends import sqlite3_lci_db @@ -910,3 +915,115 @@ def test_add_geocollections_no_unable_for_product(capsys): ) assert db.metadata["geocollections"] == ["foo"] assert "Not able" not in capsys.readouterr().out + +@pytest.fixture +@bw2test +def df_fixture(): + Database("biosphere").write(biosphere) + Database("food").write(food_data) + + +@pytest.mark.skipif(not wurst, reason="wurst not installed") +def test_to_dataframe_simple(df_fixture): + df = Database("food").edges_to_dataframe(categorical=False) + id_map = {obj['code']: obj.id for obj in Database("food")} + + expected = pd.DataFrame([{ + "target_id": id_map['1'], + "target_database": "food", + "target_code": "1", + "target_activity": "lunch", + "target_reference_product": None, + "target_location": "CA", + "target_unit": "kg", + "target_type": "process", + "source_id": id_map['2'], + "source_database": 'food', + "source_code": '2', + "source_activity": 'dinner', + "source_product": None, + "source_location": 'CH', + "source_unit": 'kg', + "source_categories": None, + "edge_amount": 0.5, + "edge_type": 'technosphere', + }, { + "target_id": id_map['1'], + "target_database": "food", + "target_code": "1", + "target_activity": "lunch", + "target_reference_product": None, + "target_location": "CA", + "target_unit": "kg", + "target_type": "process", + "source_id": get_id(("biosphere", "1")), + "source_database": 'biosphere', + "source_code": '1', + "source_activity": 'an emission', + "source_product": None, + "source_location": None, + "source_unit": 'kg', + "source_categories": "things", + "edge_amount": 0.05, + "edge_type": 'biosphere', + }, { + "target_id": id_map['2'], + "target_database": "food", + "target_code": "2", + "target_activity": "dinner", + "target_reference_product": None, + "target_location": "CH", + "target_unit": "kg", + "target_type": "process", + "source_id": get_id(("biosphere", "2")), + "source_database": 'biosphere', + "source_code": '2', + "source_activity": 'another emission', + "source_product": None, + "source_location": None, + "source_unit": 'kg', + "source_categories": "things", + "edge_amount": 0.15, + "edge_type": 'biosphere', + }, { + "target_id": id_map['2'], + "target_database": "food", + "target_code": "2", + "target_activity": "dinner", + "target_reference_product": None, + "target_location": "CH", + "target_unit": "kg", + "target_type": "process", + "source_id": id_map['1'], + "source_database": 'food', + "source_code": '1', + "source_activity": 'lunch', + "source_product": None, + "source_location": 'CA', + "source_unit": 'kg', + "source_categories": None, + "edge_amount": 0.25, + "edge_type": 'technosphere', + }]) + assert_frame_equal( + df.sort_values(['target_id', 'source_id']).reset_index(drop=True), + expected.sort_values(['target_id', 'source_id']).reset_index(drop=True), + check_dtype=False + ) + + +@pytest.mark.skipif(not wurst, reason="wurst not installed") +def test_to_dataframe_categorical(df_fixture): + df = Database("food").edges_to_dataframe() + assert df.shape == (4, 18) + assert df['edge_type'].dtype.name == 'category' + + +@pytest.mark.skipif(not wurst, reason="wurst not installed") +def test_to_dataframe_formatters(df_fixture): + def foo(node, edge, row): + row['foo'] = 'bar' + + df = Database("food").edges_to_dataframe(formatters=[foo]) + assert_series_equal(df['foo'], pd.Series(['bar'] * 4, name='foo')) + From cf10469bbd7aa9cb157e673df406129ca26b9935 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:08:25 +0200 Subject: [PATCH 12/13] Change _activity to _name in dataframe construction --- bw2data/backends/base.py | 12 ++++++------ tests/database.py | 23 ++++++++++++----------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py index df8f631b..f060d397 100644 --- a/bw2data/backends/base.py +++ b/bw2data/backends/base.py @@ -914,7 +914,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List target_id: int, target_database: str, target_code: str, - target_activity: Optional[str], + target_name: Optional[str], target_reference_product: Optional[str], target_location: Optional[str], target_unit: Optional[str], @@ -922,7 +922,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List source_id: int, source_database: str, source_code: str, - source_activity: Optional[str], + source_name: Optional[str], source_product: Optional[str], # Note different label source_location: Optional[str], source_unit: Optional[str], @@ -960,7 +960,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List "target_id": target["id"], "target_database": target["database"], "target_code": target["code"], - "target_activity": target.get("name"), + "target_name": target.get("name"), "target_reference_product": target.get("reference product"), "target_location": target.get("location"), "target_unit": target.get("unit"), @@ -968,7 +968,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List "source_id": edge["id"], "source_database": edge["database"], "source_code": edge["code"], - "source_activity": edge.get("name"), + "source_name": edge.get("name"), "source_product": edge.get("product"), "source_location": edge.get("location"), "source_unit": edge.get("unit"), @@ -987,14 +987,14 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List if categorical: categorical_columns = [ "target_database", - "target_activity", + "target_name", "target_reference_product", "target_location", "target_unit", "target_type", "source_database", "source_code", - "source_activity", + "source_name", "source_product", "source_location", "source_unit", diff --git a/tests/database.py b/tests/database.py index cb9a3305..a4941830 100644 --- a/tests/database.py +++ b/tests/database.py @@ -916,6 +916,7 @@ def test_add_geocollections_no_unable_for_product(capsys): assert db.metadata["geocollections"] == ["foo"] assert "Not able" not in capsys.readouterr().out + @pytest.fixture @bw2test def df_fixture(): @@ -924,7 +925,7 @@ def df_fixture(): @pytest.mark.skipif(not wurst, reason="wurst not installed") -def test_to_dataframe_simple(df_fixture): +def test_edges_to_dataframe_simple(df_fixture): df = Database("food").edges_to_dataframe(categorical=False) id_map = {obj['code']: obj.id for obj in Database("food")} @@ -932,7 +933,7 @@ def test_to_dataframe_simple(df_fixture): "target_id": id_map['1'], "target_database": "food", "target_code": "1", - "target_activity": "lunch", + "target_name": "lunch", "target_reference_product": None, "target_location": "CA", "target_unit": "kg", @@ -940,7 +941,7 @@ def test_to_dataframe_simple(df_fixture): "source_id": id_map['2'], "source_database": 'food', "source_code": '2', - "source_activity": 'dinner', + "source_name": 'dinner', "source_product": None, "source_location": 'CH', "source_unit": 'kg', @@ -951,7 +952,7 @@ def test_to_dataframe_simple(df_fixture): "target_id": id_map['1'], "target_database": "food", "target_code": "1", - "target_activity": "lunch", + "target_name": "lunch", "target_reference_product": None, "target_location": "CA", "target_unit": "kg", @@ -959,7 +960,7 @@ def test_to_dataframe_simple(df_fixture): "source_id": get_id(("biosphere", "1")), "source_database": 'biosphere', "source_code": '1', - "source_activity": 'an emission', + "source_name": 'an emission', "source_product": None, "source_location": None, "source_unit": 'kg', @@ -970,7 +971,7 @@ def test_to_dataframe_simple(df_fixture): "target_id": id_map['2'], "target_database": "food", "target_code": "2", - "target_activity": "dinner", + "target_name": "dinner", "target_reference_product": None, "target_location": "CH", "target_unit": "kg", @@ -978,7 +979,7 @@ def test_to_dataframe_simple(df_fixture): "source_id": get_id(("biosphere", "2")), "source_database": 'biosphere', "source_code": '2', - "source_activity": 'another emission', + "source_name": 'another emission', "source_product": None, "source_location": None, "source_unit": 'kg', @@ -989,7 +990,7 @@ def test_to_dataframe_simple(df_fixture): "target_id": id_map['2'], "target_database": "food", "target_code": "2", - "target_activity": "dinner", + "target_name": "dinner", "target_reference_product": None, "target_location": "CH", "target_unit": "kg", @@ -997,7 +998,7 @@ def test_to_dataframe_simple(df_fixture): "source_id": id_map['1'], "source_database": 'food', "source_code": '1', - "source_activity": 'lunch', + "source_name": 'lunch', "source_product": None, "source_location": 'CA', "source_unit": 'kg', @@ -1013,14 +1014,14 @@ def test_to_dataframe_simple(df_fixture): @pytest.mark.skipif(not wurst, reason="wurst not installed") -def test_to_dataframe_categorical(df_fixture): +def test_edges_to_dataframe_categorical(df_fixture): df = Database("food").edges_to_dataframe() assert df.shape == (4, 18) assert df['edge_type'].dtype.name == 'category' @pytest.mark.skipif(not wurst, reason="wurst not installed") -def test_to_dataframe_formatters(df_fixture): +def test_edges_to_dataframe_formatters(df_fixture): def foo(node, edge, row): row['foo'] = 'bar' From 73978ce8355ec0e9bda73fac83c3c00529472f39 Mon Sep 17 00:00:00 2001 From: Chris Mutel Date: Wed, 17 Aug 2022 09:17:21 +0200 Subject: [PATCH 13/13] Add nodes_to_dataframe tests --- tests/database.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/database.py b/tests/database.py index a4941830..fae525ac 100644 --- a/tests/database.py +++ b/tests/database.py @@ -1028,3 +1028,51 @@ def foo(node, edge, row): df = Database("food").edges_to_dataframe(formatters=[foo]) assert_series_equal(df['foo'], pd.Series(['bar'] * 4, name='foo')) + +def test_nodes_to_dataframe_simple(df_fixture): + df = Database("food").nodes_to_dataframe() + expected = pd.DataFrame([{ + "categories": ["stuff", "meals"], + "code": "2", + "database": "food", + "id": get_id(("food", "2")), + "location": "CH", + "name": "dinner", + "type": "process", + "unit": "kg", + }, { + "categories": ("stuff", "meals"), + "code": "1", + "database": "food", + "id": get_id(("food", "1")), + "location": "CA", + "name": "lunch", + "type": "process", + "unit": "kg", + }]) + assert_frame_equal( + df.reset_index(drop=True), + expected.reset_index(drop=True), + ) + + +def test_nodes_to_dataframe_columns(df_fixture): + df = Database("food").nodes_to_dataframe(columns=['id', 'name', 'unit']) + expected = pd.DataFrame([{ + "id": get_id(("food", "2")), + "name": "dinner", + "unit": "kg", + }, { + "id": get_id(("food", "1")), + "name": "lunch", + "unit": "kg", + }]) + assert_frame_equal( + df.reset_index(drop=True), + expected.reset_index(drop=True), + ) + + +def test_nodes_to_dataframe_unsorted(df_fixture): + df = Database("food").nodes_to_dataframe() + assert df.shape == (2, 8)