
Commit

Merge remote-tracking branch 'origin/dataframes_again' into IOTable-update
cmutel committed Aug 17, 2022
2 parents 77120b1 + 73978ce commit 00370df
Showing 4 changed files with 627 additions and 5 deletions.
8 changes: 4 additions & 4 deletions azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
+mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee wurst brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
displayName: Install Anaconda packages
- bash: |
@@ -90,7 +90,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
displayName: Install Anaconda packages
- bash: |
@@ -144,7 +144,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
displayName: Install Anaconda packages
- bash: |
@@ -198,7 +198,7 @@ jobs:
- script: |
call activate bw2
-conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pywin32 pip
+conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pywin32 pip
displayName: Install Anaconda packages
- script: |
121 changes: 121 additions & 0 deletions bw2data/backends/base.py
@@ -7,6 +7,7 @@
import sqlite3
import warnings
from collections import defaultdict
+from typing import List, Callable, Optional

import pandas
import pyprind
@@ -886,3 +887,123 @@ def get_uniqueness_key(exchange, fields):
        for exc in lst[-1:0:-1]:
            print("Deleting exchange:", exc)
            exc.delete()

    def nodes_to_dataframe(self, columns: Optional[List[str]] = None, return_sorted: bool = True) -> pandas.DataFrame:
        """Return a pandas DataFrame with all database nodes. Uses the provided node attributes by default, such as name, unit, location.

        By default, returns a DataFrame sorted by name, reference product, location, and unit. Set ``return_sorted`` to ``False`` to skip sorting.

        Specify ``columns`` to get custom columns. You will need to write your own function for more customization; there are endless possibilities here.

        Returns a pandas ``DataFrame``.
        """
        if columns is None:
            # Feels like magic
            df = pandas.DataFrame(self)
        else:
            df = pandas.DataFrame([{field: obj.get(field) for field in columns} for obj in self])
        if return_sorted:
            sort_columns = ["name", "reference product", "location", "unit"]
            df = df.sort_values(by=[column for column in sort_columns if column in df.columns])
        return df

    def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
        """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:

            target_id: int,
            target_database: str,
            target_code: str,
            target_name: Optional[str],
            target_reference_product: Optional[str],
            target_location: Optional[str],
            target_unit: Optional[str],
            target_type: Optional[str],
            source_id: int,
            source_database: str,
            source_code: str,
            source_name: Optional[str],
            source_product: Optional[str],  # Note different label
            source_location: Optional[str],
            source_unit: Optional[str],
            source_categories: Optional[str],  # Tuple concatenated with "::" as in `bw2io`
            edge_amount: float,
            edge_type: str,

        Target is the node consuming the edge; source is the node or flow being consumed. The terms target and source were chosen because they also work well for biosphere edges.

        Args:

        ``categorical`` will turn each string column into a `pandas Categorical Series <https://pandas.pydata.org/docs/reference/api/pandas.Categorical.html>`__. This takes 1-2 extra seconds, but saves around 50% of the memory consumption.

        ``formatters`` is a list of callables that modify each row. These functions must take the following keyword arguments, and use the `Wurst internal data format <https://wurst.readthedocs.io/#internal-data-format>`__:

            * ``node``: The target node, as a dict
            * ``edge``: The edge, including attributes of the source node
            * ``row``: The current row dict being modified

        The functions in ``formatters`` don't need to return anything; they modify ``row`` in place.

        Returns a pandas ``DataFrame``.
        """
        try:
            from wurst import extract_brightway2_databases
        except ImportError:
            raise ImportError("This method requires the `wurst` library.")

        result = []

        for target in extract_brightway2_databases(self.name, add_identifiers=True):
            for edge in target["exchanges"]:
                row = {
                    "target_id": target["id"],
                    "target_database": target["database"],
                    "target_code": target["code"],
                    "target_name": target.get("name"),
                    "target_reference_product": target.get("reference product"),
                    "target_location": target.get("location"),
                    "target_unit": target.get("unit"),
                    "target_type": target.get("type", "process"),
                    "source_id": edge["id"],
                    "source_database": edge["database"],
                    "source_code": edge["code"],
                    "source_name": edge.get("name"),
                    "source_product": edge.get("product"),
                    "source_location": edge.get("location"),
                    "source_unit": edge.get("unit"),
                    "source_categories": "::".join(edge["categories"]) if edge.get("categories") else None,
                    "edge_amount": edge["amount"],
                    "edge_type": edge["type"],
                }
                if formatters is not None:
                    for func in formatters:
                        func(node=target, edge=edge, row=row)
                result.append(row)

        print("Creating DataFrame")
        df = pandas.DataFrame(result)

        if categorical:
            categorical_columns = [
                "target_database",
                "target_name",
                "target_reference_product",
                "target_location",
                "target_unit",
                "target_type",
                "source_database",
                "source_code",
                "source_name",
                "source_product",
                "source_location",
                "source_unit",
                "source_categories",
                "edge_type",
            ]
            print("Compressing DataFrame")
            for column in categorical_columns:
                if column in df.columns:
                    df[column] = df[column].astype("category")

        return df
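
The new ``nodes_to_dataframe`` method can be called on any registered database instance. A minimal usage sketch, not part of this commit; the project and database names below are placeholders:

import bw2data as bd

bd.projects.set_current("my project")  # placeholder project name
db = bd.Database("ecoinvent 3.8")  # placeholder database name

# Default behaviour: all node attributes, sorted by name, reference product, location, and unit
nodes = db.nodes_to_dataframe()

# Custom columns, unsorted
nodes_small = db.nodes_to_dataframe(columns=["name", "location", "unit"], return_sorted=False)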
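``edges_to_dataframe`` needs ``wurst`` installed, hence the pipeline changes above. A hedged sketch of the ``formatters`` hook; ``add_target_key`` and the database name are illustrative, not part of the commit:

import bw2data as bd

db = bd.Database("ecoinvent 3.8")  # placeholder database name

def add_target_key(node, edge, row):
    # Formatters receive the target node, the edge, and the row dict as
    # keyword arguments, and modify ``row`` in place; no return value is needed.
    row["target_key"] = (node["database"], node["code"])

edges = db.edges_to_dataframe(categorical=True, formatters=[add_target_key])
print(edges["edge_type"].value_counts())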

0 comments on commit 00370df
