From 5e50468fc8ed51c024d05b6bcca1053c002a84c0 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 16:03:50 +0200
Subject: [PATCH 01/13] Create Speed tests DataFrame creation.ipynb

---
 dev/Speed tests DataFrame creation.ipynb | 335 +++++++++++++++++++++++
 1 file changed, 335 insertions(+)
 create mode 100644 dev/Speed tests DataFrame creation.ipynb

diff --git a/dev/Speed tests DataFrame creation.ipynb b/dev/Speed tests DataFrame creation.ipynb
new file mode 100644
index 00000000..62ae3277
--- /dev/null
+++ b/dev/Speed tests DataFrame creation.ipynb	
@@ -0,0 +1,335 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "5043e9fd-0df3-494f-8104-feea29aa1264",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import wurst\n",
+    "import bw2data as bd\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "0722faf1-a608-4031-b8c0-adff2558d8da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bd.projects.set_current(\"ei 3.8 cutoff\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "d4ffb5e5-09f5-4b13-b0ba-d5ba59dd4012",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Getting activity data\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████████████████████████████████████| 19565/19565 [00:00<00:00, 122014.46it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Adding exchange data to activities\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|█████████████████████████████████████████| 629959/629959 [00:46<00:00, 13465.05it/s]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Filling out exchange data\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|████████████████████████████████████████████| 19565/19565 [00:02<00:00, 6933.41it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "data = wurst.extract_brightway2_databases([\"ecoinvent 3.8 cutoff\"], add_identifiers=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "63016466-d913-4648-bed4-84fcc41301ac",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'classifications': [('ISIC rev.4 ecoinvent',\n",
+       "   '3510:Electric power generation, transmission and distribution'),\n",
+       "  ('CPC', '17100: Electrical energy')],\n",
+       " 'comment': 'This dataset changes the names of the (internally used) electricity product of waste incineration and connects it with the external grid and the respective average energy markets (grid electricity).\\nTime period:  The Annual Production volume is valid for the year 2012.',\n",
+       " 'location': 'GR',\n",
+       " 'database': 'ecoinvent 3.8 cutoff',\n",
+       " 'code': '00014e7e2dd160027166b7274d58b7cc',\n",
+       " 'name': 'electricity, from municipal waste incineration to generic market for electricity, medium voltage',\n",
+       " 'reference product': 'electricity, medium voltage',\n",
+       " 'unit': 'kilowatt hour',\n",
+       " 'exchanges': [{'uncertainty type': 0,\n",
+       "   'loc': 1.0,\n",
+       "   'amount': 1.0,\n",
+       "   'type': 'production',\n",
+       "   'production volume': 60000000.0,\n",
+       "   'product': 'electricity, medium voltage',\n",
+       "   'name': 'electricity, from municipal waste incineration to generic market for electricity, medium voltage',\n",
+       "   'unit': 'kilowatt hour',\n",
+       "   'location': 'GR',\n",
+       "   'database': 'ecoinvent 3.8 cutoff',\n",
+       "   'id': 23785,\n",
+       "   'code': '00014e7e2dd160027166b7274d58b7cc'},\n",
+       "  {'uncertainty type': 2,\n",
+       "   'loc': 0.0,\n",
+       "   'scale': 0.044721359549995794,\n",
+       "   'amount': 1.0,\n",
+       "   'pedigree': {'reliability': 1,\n",
+       "    'completeness': 1,\n",
+       "    'temporal correlation': 3,\n",
+       "    'geographical correlation': 1,\n",
+       "    'further technological correlation': 1},\n",
+       "   'type': 'technosphere',\n",
+       "   'production volume': 0.0,\n",
+       "   'product': 'electricity, for reuse in municipal waste incineration only',\n",
+       "   'name': 'market for electricity, for reuse in municipal waste incineration only',\n",
+       "   'unit': 'kilowatt hour',\n",
+       "   'location': 'RoW',\n",
+       "   'database': 'ecoinvent 3.8 cutoff',\n",
+       "   'id': 18374,\n",
+       "   'code': '358b14803d0148cddc4196e2c2454000'}],\n",
+       " 'parameters': {},\n",
+       " 'parameters full': [],\n",
+       " 'id': 23785}"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "d23e2644-1dc5-45dd-aa68-9021456dcf18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def as_naive_df(data, categorical=True):\n",
+    "    result = []\n",
+    "    \n",
+    "    for target in data:\n",
+    "        for edge in target['exchanges']:\n",
+    "            result.append({\n",
+    "                \"target_id\": target['id'],\n",
+    "                \"target_database\": target['database'],\n",
+    "                \"target_code\": target['code'],\n",
+    "                \"target_activity\": target.get('name'),\n",
+    "                \"target_reference_product\": target.get('reference product'),\n",
+    "                \"target_location\": target.get('location'),\n",
+    "                \"target_unit\": target.get('unit'),\n",
+    "                \"target_type\": target.get('type', 'process'),\n",
+    "                \"source_id\": edge['id'],\n",
+    "                \"source_database\": edge['database'],\n",
+    "                \"source_code\": edge['code'],\n",
+    "                \"source_activity\": edge.get('name'),\n",
+    "                \"source_product\": edge.get('product'),\n",
+    "                \"source_location\": edge.get('location'),\n",
+    "                \"source_unit\": edge.get('unit'),\n",
+    "                \"source_categories\": \"::\".join(edge.get('categories', ('',))),\n",
+    "                \"edge_amount\": edge['amount'],\n",
+    "                \"edge_type\": edge['type'],\n",
+    "            })\n",
+    "    \n",
+    "    df = pd.DataFrame(result)\n",
+    "    \n",
+    "    if categorical:\n",
+    "        categorical_columns = [\n",
+    "            \"target_database\",\n",
+    "            \"target_activity\",\n",
+    "            \"target_reference_product\",\n",
+    "            \"target_location\",\n",
+    "            \"target_unit\",\n",
+    "            \"target_type\",\n",
+    "            \"source_database\",\n",
+    "            \"source_code\",\n",
+    "            \"source_activity\",\n",
+    "            \"source_product\",\n",
+    "            \"source_location\",\n",
+    "            \"source_unit\",\n",
+    "            \"source_categories\",\n",
+    "        ]\n",
+    "        for column in categorical_columns:\n",
+    "            df[column] = df[column].astype(\"category\")  \n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "4f4906a9-db5f-42aa-bd26-d6140f97fe48",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5.85 s ± 79.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit as_naive_df(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "8d17c3cc-c709-456b-aaf3-3232f10eacfc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "3.71 s ± 69.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%timeit as_naive_df(data, False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "e6f04ac6-c34f-4362-86ed-9f35b827b1da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_compressed = as_naive_df(data)\n",
+    "df_full = as_naive_df(data, False)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d3e79afd-0851-4512-ae01-7dd367cb99ca",
+   "metadata": {},
+   "source": [
+    "Memory in MB"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "id": "6e740f9a-4848-4b95-982d-5b0fc43b32f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(37.66990280151367, 86.51182556152344)"
+      ]
+     },
+     "execution_count": 28,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_compressed.memory_usage().sum() / 1024 ** 2, df_full.memory_usage().sum() / 1024 ** 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "6ab7ca99-3e55-44c3-a7f1-9e4bfb1aaba6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "target_id                      int64\n",
+       "target_database             category\n",
+       "target_code                   object\n",
+       "target_activity             category\n",
+       "target_reference_product    category\n",
+       "target_location             category\n",
+       "target_unit                 category\n",
+       "target_type                 category\n",
+       "source_id                      int64\n",
+       "source_database             category\n",
+       "source_code                 category\n",
+       "source_activity             category\n",
+       "source_product              category\n",
+       "source_location             category\n",
+       "source_unit                 category\n",
+       "source_categories           category\n",
+       "edge_amount                  float64\n",
+       "edge_type                     object\n",
+       "dtype: object"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_compressed.dtypes"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 602bed1ee3061fc60b452e144d092199b65611f8 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 16:31:59 +0200
Subject: [PATCH 02/13] Add draft to_dataframe function

---
 bw2data/backends/base.py | 99 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 99 insertions(+)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index 63497da1..5f348660 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -13,6 +13,7 @@
 from bw_processing import clean_datapackage_name, create_datapackage
 from fs.zipfs import ZipFS
 from peewee import DoesNotExist, fn
+from tqdm import tqdm
 
 from .. import config, databases, geomapping
 from ..data_store import ProcessedDataStore
@@ -886,3 +887,101 @@ def get_uniqueness_key(exchange, fields):
                 for exc in lst[-1:0:-1]:
                     print("Deleting exchange:", exc)
                     exc.delete()
+
+    def to_dataframe(self, categorical=True, formatters=None):
+        """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:
+
+            target_id: int,
+            target_database: str,
+            target_code: str,
+            target_activity: Optional[str],
+            target_reference_product: Optional[str],
+            target_location: Optional[str],
+            target_unit: Optional[str],
+            target_type: Optional[str]
+            source_id: int,
+            source_database: str,
+            source_code: str,
+            source_activity: Optional[str],
+            source_product: Optional[str],  # Note different label
+            source_location: Optional[str],
+            source_unit: Optional[str],
+            source_type: Optional[str]
+            source_categories: Optional[str]  # Tuple concatenated with "::" as in `bw2io`
+            edge_amount: float,
+            edge_type: str,
+
+        Target is the node consuming the edge, source is the node or flow being consumed. The terms target and source were chosen because they also work well for biosphere edges.
+
+        Args:
+
+        ``categorical`` will turn each string column into a `pandas Categorical Series <https://pandas.pydata.org/docs/reference/api/pandas.Categorical.html>`__. This takes 1-2 extra seconds, but saves around 50% of the memory consumption.
+
+        ``formatters`` is a list of callables that modify each row. These functions must take the following keyword arguments, and use the `Wurst internal data format <https://wurst.readthedocs.io/#internal-data-format>`__:
+
+            * ``node``: The target node, as a dict
+            * ``edge``: The edge, including attributes of the source node
+            * ``row``: The current row dict being modified. ``row`` must be returned as well.
+
+        Returns a pandas ``DataFrame``.
+
+        """
+        try:
+            from wurst import extract_brightway2_databases
+        except ImportError:
+            raise ImportError("This method requires the `wurst` library.")
+
+        result = []
+
+        for target in extract_brightway2_databases(self.name, add_identifiers=True):
+            for edge in target["exchanges"]:
+                row = {
+                    "target_id": target["id"],
+                    "target_database": target["database"],
+                    "target_code": target["code"],
+                    "target_activity": target.get("name"),
+                    "target_reference_product": target.get("reference product"),
+                    "target_location": target.get("location"),
+                    "target_unit": target.get("unit"),
+                    "target_type": target.get("type", "process"),
+                    "source_id": edge["id"],
+                    "source_database": edge["database"],
+                    "source_code": edge["code"],
+                    "source_activity": edge.get("name"),
+                    "source_product": edge.get("product"),
+                    "source_location": edge.get("location"),
+                    "source_unit": edge.get("unit"),
+                    "source_categories": "::".join(edge.get("categories", ("",))),
+                    "edge_amount": edge["amount"],
+                    "edge_type": edge["type"],
+                }
+                if formatters is not None:
+                    for func in formatters:
+                        row = func(node=target, edge=edge, row=row)
+                result.append(row)
+
+        print("Creating DataFrame")
+        df = pandas.DataFrame(result)
+
+        if categorical:
+            categorical_columns = [
+                "target_database",
+                "target_activity",
+                "target_reference_product",
+                "target_location",
+                "target_unit",
+                "target_type",
+                "source_database",
+                "source_code",
+                "source_activity",
+                "source_product",
+                "source_location",
+                "source_unit",
+                "source_categories",
+            ]
+            print("Compressing DataFrame")
+            for column in categorical_columns:
+                if column in df.columns:
+                    df[column] = df[column].astype("category")
+
+        return df

From 6aa64739795af2e8c7ddc3226716e400457e1ef2 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 21:06:15 +0200
Subject: [PATCH 03/13] Add wurst as dependency for CI testing

---
 azure-pipelines.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 85347919..22676b3c 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
+      mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee wurst brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -90,7 +90,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -144,7 +144,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -198,7 +198,7 @@ jobs:
 
   - script: |
       call activate bw2
-      conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pywin32 pip
+      conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pywin32 pip
     displayName: Install Anaconda packages
 
   - script: |

From ad0489c6afa67a46c41a30bc9d498c5f92a45ebb Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 21:07:06 +0200
Subject: [PATCH 04/13] Add typing for to_dataframe

---
 bw2data/backends/base.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index 5f348660..5c53e4b1 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -7,13 +7,13 @@
 import sqlite3
 import warnings
 from collections import defaultdict
+from typing import List, Callable, Optional
 
 import pandas
 import pyprind
 from bw_processing import clean_datapackage_name, create_datapackage
 from fs.zipfs import ZipFS
 from peewee import DoesNotExist, fn
-from tqdm import tqdm
 
 from .. import config, databases, geomapping
 from ..data_store import ProcessedDataStore
@@ -888,7 +888,7 @@ def get_uniqueness_key(exchange, fields):
                     print("Deleting exchange:", exc)
                     exc.delete()
 
-    def to_dataframe(self, categorical=True, formatters=None):
+    def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
         """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:
 
             target_id: int,

From 993a91e11caf47ceaa37cb432a15125aff153729 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 21:07:20 +0200
Subject: [PATCH 05/13] Export `edge_type` as well

---
 bw2data/backends/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index 5c53e4b1..2b1f83d0 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -978,6 +978,7 @@ def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Calla
                 "source_location",
                 "source_unit",
                 "source_categories",
+                "edge_type",
             ]
             print("Compressing DataFrame")
             for column in categorical_columns:

From fd2975b88d9f94053d5fd7fbeb4365e0e8131492 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 21:08:33 +0200
Subject: [PATCH 06/13] Rename to prep for node dataframe

---
 bw2data/backends/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index 2b1f83d0..f9150052 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -888,7 +888,7 @@ def get_uniqueness_key(exchange, fields):
                     print("Deleting exchange:", exc)
                     exc.delete()
 
-    def to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
+    def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
         """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:
 
             target_id: int,

From 430f50439db08e6f3c8e71783e50444987747345 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Tue, 16 Aug 2022 21:23:58 +0200
Subject: [PATCH 07/13] Add nodes_to_dataframe method

---
 bw2data/backends/base.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index f9150052..a10b96e8 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -888,6 +888,26 @@ def get_uniqueness_key(exchange, fields):
                     print("Deleting exchange:", exc)
                     exc.delete()
 
+    def nodes_to_dataframe(self, columns: Optional[List[str]] = None, return_sorted: bool = True) -> pandas.DataFrame:
+        """Return a pandas DataFrame with all database nodes. Uses the provided node attributes by default,  such as name, unit, location.
+
+        By default, returns a DataFrame sorted by name, reference product, location, and unit. Set ``return_sorted`` to ``False`` to skip sorting.
+
+        Specify ``columns`` to get custom columns. You will need to write your own function to get more customization; there are endless possibilities here.
+
+        Returns a pandas ``DataFrame``.
+
+        """
+        if columns is None:
+            # Feels like magic
+            df = pandas.DataFrame(self)
+        else:
+            df = pandas.DataFrame([{field: obj.get(field) for field in columns} for obj in self])
+        if return_sorted:
+            sort_columns = ['name', 'reference product', 'location', 'unit']
+            df = df.sort_values(by=[column for column in sort_columns if column in df.columns])
+        return df
+
     def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
         """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:
 

From 54b85b6b9e73200e7031490f13e8c0a7515d8725 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:04:21 +0200
Subject: [PATCH 08/13] Don't need to return row in DF formatters

---
 bw2data/backends/base.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index a10b96e8..fe3124c5 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -941,7 +941,9 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
 
             * ``node``: The target node, as a dict
             * ``edge``: The edge, including attributes of the source node
-            * ``row``: The current row dict being modified. ``row`` must be returned as well.
+            * ``row``: The current row dict being modified.
+
+        The functions in ``formatters`` don't need to return anything; they modify ``row`` in place.
 
         Returns a pandas ``DataFrame``.
 
@@ -977,7 +979,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
                 }
                 if formatters is not None:
                     for func in formatters:
-                        row = func(node=target, edge=edge, row=row)
+                        func(node=target, edge=edge, row=row)
                 result.append(row)
 
         print("Creating DataFrame")

From 79b3abf9de9f14e400b67b61ee788493302d2457 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:04:43 +0200
Subject: [PATCH 09/13] Return `None` if `categories` key not present

---
 bw2data/backends/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index fe3124c5..25b43975 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -973,7 +973,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
                     "source_product": edge.get("product"),
                     "source_location": edge.get("location"),
                     "source_unit": edge.get("unit"),
-                    "source_categories": "::".join(edge.get("categories", ("",))),
+                    "source_categories": "::".join(edge["categories"]) if edge.get("categories") else None,
                     "edge_amount": edge["amount"],
                     "edge_type": edge["type"],
                 }

From 55820231c9595d7e7bca4108827cdf7060e5e1af Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:04:55 +0200
Subject: [PATCH 10/13] source_type not extracted by wurst

---
 bw2data/backends/base.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index 25b43975..df8f631b 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -926,7 +926,6 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
             source_product: Optional[str],  # Note different label
             source_location: Optional[str],
             source_unit: Optional[str],
-            source_type: Optional[str]
             source_categories: Optional[str]  # Tuple concatenated with "::" as in `bw2io`
             edge_amount: float,
             edge_type: str,

From 994c67d87e4d07ddad0c08d4939ac47c95dea309 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:05:10 +0200
Subject: [PATCH 11/13] Add edges_to_dataframe tests

---
 tests/database.py | 119 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 118 insertions(+), 1 deletion(-)

diff --git a/tests/database.py b/tests/database.py
index a12d69e8..cb9a3305 100644
--- a/tests/database.py
+++ b/tests/database.py
@@ -4,9 +4,14 @@
 
 import numpy as np
 import pandas as pd
-from pandas.testing import assert_frame_equal
+from pandas.testing import assert_frame_equal, assert_series_equal
 import pytest
 
+try:
+    import wurst
+except ImportError:
+    wurst = None
+
 from bw2data import geomapping, get_id, databases, Database, get_activity
 from bw2data.backends import Activity as PWActivity
 from bw2data.backends import sqlite3_lci_db
@@ -910,3 +915,115 @@ def test_add_geocollections_no_unable_for_product(capsys):
     )
     assert db.metadata["geocollections"] == ["foo"]
     assert "Not able" not in capsys.readouterr().out
+
+@pytest.fixture
+@bw2test
+def df_fixture():
+    Database("biosphere").write(biosphere)
+    Database("food").write(food_data)
+
+
+@pytest.mark.skipif(not wurst, reason="wurst not installed")
+def test_to_dataframe_simple(df_fixture):
+    df = Database("food").edges_to_dataframe(categorical=False)
+    id_map = {obj['code']: obj.id for obj in Database("food")}
+
+    expected = pd.DataFrame([{
+        "target_id": id_map['1'],
+        "target_database": "food",
+        "target_code": "1",
+        "target_activity": "lunch",
+        "target_reference_product": None,
+        "target_location": "CA",
+        "target_unit": "kg",
+        "target_type": "process",
+        "source_id": id_map['2'],
+        "source_database": 'food',
+        "source_code": '2',
+        "source_activity": 'dinner',
+        "source_product": None,
+        "source_location": 'CH',
+        "source_unit": 'kg',
+        "source_categories": None,
+        "edge_amount": 0.5,
+        "edge_type": 'technosphere',
+    }, {
+        "target_id": id_map['1'],
+        "target_database": "food",
+        "target_code": "1",
+        "target_activity": "lunch",
+        "target_reference_product": None,
+        "target_location": "CA",
+        "target_unit": "kg",
+        "target_type": "process",
+        "source_id": get_id(("biosphere", "1")),
+        "source_database": 'biosphere',
+        "source_code": '1',
+        "source_activity": 'an emission',
+        "source_product": None,
+        "source_location": None,
+        "source_unit": 'kg',
+        "source_categories": "things",
+        "edge_amount": 0.05,
+        "edge_type": 'biosphere',
+    }, {
+        "target_id": id_map['2'],
+        "target_database": "food",
+        "target_code": "2",
+        "target_activity": "dinner",
+        "target_reference_product": None,
+        "target_location": "CH",
+        "target_unit": "kg",
+        "target_type": "process",
+        "source_id": get_id(("biosphere", "2")),
+        "source_database": 'biosphere',
+        "source_code": '2',
+        "source_activity": 'another emission',
+        "source_product": None,
+        "source_location": None,
+        "source_unit": 'kg',
+        "source_categories": "things",
+        "edge_amount": 0.15,
+        "edge_type": 'biosphere',
+    }, {
+        "target_id": id_map['2'],
+        "target_database": "food",
+        "target_code": "2",
+        "target_activity": "dinner",
+        "target_reference_product": None,
+        "target_location": "CH",
+        "target_unit": "kg",
+        "target_type": "process",
+        "source_id": id_map['1'],
+        "source_database": 'food',
+        "source_code": '1',
+        "source_activity": 'lunch',
+        "source_product": None,
+        "source_location": 'CA',
+        "source_unit": 'kg',
+        "source_categories": None,
+        "edge_amount": 0.25,
+        "edge_type": 'technosphere',
+    }])
+    assert_frame_equal(
+        df.sort_values(['target_id', 'source_id']).reset_index(drop=True),
+        expected.sort_values(['target_id', 'source_id']).reset_index(drop=True),
+        check_dtype=False
+    )
+
+
+@pytest.mark.skipif(not wurst, reason="wurst not installed")
+def test_to_dataframe_categorical(df_fixture):
+    df = Database("food").edges_to_dataframe()
+    assert df.shape == (4, 18)
+    assert df['edge_type'].dtype.name == 'category'
+
+
+@pytest.mark.skipif(not wurst, reason="wurst not installed")
+def test_to_dataframe_formatters(df_fixture):
+    def foo(node, edge, row):
+        row['foo'] = 'bar'
+
+    df = Database("food").edges_to_dataframe(formatters=[foo])
+    assert_series_equal(df['foo'], pd.Series(['bar'] * 4, name='foo'))
+

From cf10469bbd7aa9cb157e673df406129ca26b9935 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:08:25 +0200
Subject: [PATCH 12/13] Change _activity to _name in dataframe construction

---
 bw2data/backends/base.py | 12 ++++++------
 tests/database.py        | 23 ++++++++++++-----------
 2 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
index df8f631b..f060d397 100644
--- a/bw2data/backends/base.py
+++ b/bw2data/backends/base.py
@@ -914,7 +914,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
             target_id: int,
             target_database: str,
             target_code: str,
-            target_activity: Optional[str],
+            target_name: Optional[str],
             target_reference_product: Optional[str],
             target_location: Optional[str],
             target_unit: Optional[str],
@@ -922,7 +922,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
             source_id: int,
             source_database: str,
             source_code: str,
-            source_activity: Optional[str],
+            source_name: Optional[str],
             source_product: Optional[str],  # Note different label
             source_location: Optional[str],
             source_unit: Optional[str],
@@ -960,7 +960,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
                     "target_id": target["id"],
                     "target_database": target["database"],
                     "target_code": target["code"],
-                    "target_activity": target.get("name"),
+                    "target_name": target.get("name"),
                     "target_reference_product": target.get("reference product"),
                     "target_location": target.get("location"),
                     "target_unit": target.get("unit"),
@@ -968,7 +968,7 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
                     "source_id": edge["id"],
                     "source_database": edge["database"],
                     "source_code": edge["code"],
-                    "source_activity": edge.get("name"),
+                    "source_name": edge.get("name"),
                     "source_product": edge.get("product"),
                     "source_location": edge.get("location"),
                     "source_unit": edge.get("unit"),
@@ -987,14 +987,14 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
         if categorical:
             categorical_columns = [
                 "target_database",
-                "target_activity",
+                "target_name",
                 "target_reference_product",
                 "target_location",
                 "target_unit",
                 "target_type",
                 "source_database",
                 "source_code",
-                "source_activity",
+                "source_name",
                 "source_product",
                 "source_location",
                 "source_unit",
diff --git a/tests/database.py b/tests/database.py
index cb9a3305..a4941830 100644
--- a/tests/database.py
+++ b/tests/database.py
@@ -916,6 +916,7 @@ def test_add_geocollections_no_unable_for_product(capsys):
     assert db.metadata["geocollections"] == ["foo"]
     assert "Not able" not in capsys.readouterr().out
 
+
 @pytest.fixture
 @bw2test
 def df_fixture():
@@ -924,7 +925,7 @@ def df_fixture():
 
 
 @pytest.mark.skipif(not wurst, reason="wurst not installed")
-def test_to_dataframe_simple(df_fixture):
+def test_edges_to_dataframe_simple(df_fixture):
     df = Database("food").edges_to_dataframe(categorical=False)
     id_map = {obj['code']: obj.id for obj in Database("food")}
 
@@ -932,7 +933,7 @@ def test_to_dataframe_simple(df_fixture):
         "target_id": id_map['1'],
         "target_database": "food",
         "target_code": "1",
-        "target_activity": "lunch",
+        "target_name": "lunch",
         "target_reference_product": None,
         "target_location": "CA",
         "target_unit": "kg",
@@ -940,7 +941,7 @@ def test_to_dataframe_simple(df_fixture):
         "source_id": id_map['2'],
         "source_database": 'food',
         "source_code": '2',
-        "source_activity": 'dinner',
+        "source_name": 'dinner',
         "source_product": None,
         "source_location": 'CH',
         "source_unit": 'kg',
@@ -951,7 +952,7 @@ def test_to_dataframe_simple(df_fixture):
         "target_id": id_map['1'],
         "target_database": "food",
         "target_code": "1",
-        "target_activity": "lunch",
+        "target_name": "lunch",
         "target_reference_product": None,
         "target_location": "CA",
         "target_unit": "kg",
@@ -959,7 +960,7 @@ def test_to_dataframe_simple(df_fixture):
         "source_id": get_id(("biosphere", "1")),
         "source_database": 'biosphere',
         "source_code": '1',
-        "source_activity": 'an emission',
+        "source_name": 'an emission',
         "source_product": None,
         "source_location": None,
         "source_unit": 'kg',
@@ -970,7 +971,7 @@ def test_to_dataframe_simple(df_fixture):
         "target_id": id_map['2'],
         "target_database": "food",
         "target_code": "2",
-        "target_activity": "dinner",
+        "target_name": "dinner",
         "target_reference_product": None,
         "target_location": "CH",
         "target_unit": "kg",
@@ -978,7 +979,7 @@ def test_to_dataframe_simple(df_fixture):
         "source_id": get_id(("biosphere", "2")),
         "source_database": 'biosphere',
         "source_code": '2',
-        "source_activity": 'another emission',
+        "source_name": 'another emission',
         "source_product": None,
         "source_location": None,
         "source_unit": 'kg',
@@ -989,7 +990,7 @@ def test_to_dataframe_simple(df_fixture):
         "target_id": id_map['2'],
         "target_database": "food",
         "target_code": "2",
-        "target_activity": "dinner",
+        "target_name": "dinner",
         "target_reference_product": None,
         "target_location": "CH",
         "target_unit": "kg",
@@ -997,7 +998,7 @@ def test_to_dataframe_simple(df_fixture):
         "source_id": id_map['1'],
         "source_database": 'food',
         "source_code": '1',
-        "source_activity": 'lunch',
+        "source_name": 'lunch',
         "source_product": None,
         "source_location": 'CA',
         "source_unit": 'kg',
@@ -1013,14 +1014,14 @@ def test_to_dataframe_simple(df_fixture):
 
 
 @pytest.mark.skipif(not wurst, reason="wurst not installed")
-def test_to_dataframe_categorical(df_fixture):
+def test_edges_to_dataframe_categorical(df_fixture):
     df = Database("food").edges_to_dataframe()
     assert df.shape == (4, 18)
     assert df['edge_type'].dtype.name == 'category'
 
 
 @pytest.mark.skipif(not wurst, reason="wurst not installed")
-def test_to_dataframe_formatters(df_fixture):
+def test_edges_to_dataframe_formatters(df_fixture):
     def foo(node, edge, row):
         row['foo'] = 'bar'
 

From 73978ce8355ec0e9bda73fac83c3c00529472f39 Mon Sep 17 00:00:00 2001
From: Chris Mutel <cmutel@gmail.com>
Date: Wed, 17 Aug 2022 09:17:21 +0200
Subject: [PATCH 13/13] Add nodes_to_dataframe tests

---
 tests/database.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)

diff --git a/tests/database.py b/tests/database.py
index a4941830..fae525ac 100644
--- a/tests/database.py
+++ b/tests/database.py
@@ -1028,3 +1028,51 @@ def foo(node, edge, row):
     df = Database("food").edges_to_dataframe(formatters=[foo])
     assert_series_equal(df['foo'], pd.Series(['bar'] * 4, name='foo'))
 
+
+def test_nodes_to_dataframe_simple(df_fixture):
+    """Default ``nodes_to_dataframe()`` output equals the two expected node rows."""
+    actual = Database("food").nodes_to_dataframe().reset_index(drop=True)
+    dinner = {
+        "categories": ["stuff", "meals"],
+        "code": "2",
+        "database": "food",
+        "id": get_id(("food", "2")),
+        "location": "CH",
+        "name": "dinner",
+        "type": "process",
+        "unit": "kg",
+    }
+    lunch = {
+        "categories": ("stuff", "meals"),  # NOTE(review): tuple here, list for dinner; presumably mirrors the fixture - confirm
+        "code": "1",
+        "database": "food",
+        "id": get_id(("food", "1")),
+        "location": "CA",
+        "name": "lunch",
+        "type": "process",
+        "unit": "kg",
+    }
+    expected = pd.DataFrame([dinner, lunch]).reset_index(drop=True)
+    assert_frame_equal(actual, expected)
+
+
+def test_nodes_to_dataframe_columns(df_fixture):
+    """Passing ``columns`` yields a frame holding only the requested columns."""
+    wanted = ['id', 'name', 'unit']
+    actual = Database("food").nodes_to_dataframe(columns=wanted)
+    # Only the index is reset before comparing, so the row order below
+    # (dinner first, then lunch) is part of what is asserted.
+    rows = [
+        {"id": get_id(("food", "2")), "name": "dinner", "unit": "kg"},
+        {"id": get_id(("food", "1")), "name": "lunch", "unit": "kg"},
+    ]
+    expected = pd.DataFrame(rows)
+    assert_frame_equal(
+        actual.reset_index(drop=True),
+        expected.reset_index(drop=True),
+    )
+
+
+def test_nodes_to_dataframe_unsorted(df_fixture):
+    n_rows, n_cols = Database("food").nodes_to_dataframe().shape  # NOTE(review): called with defaults despite "unsorted" in the name - confirm intent
+    assert (n_rows, n_cols) == (2, 8)  # two nodes, eight columns