
Commit

Merge remote-tracking branch 'origin/dataframes_again' into IOTable-update
cmutel committed Aug 17, 2022
2 parents 77120b1 + 73978ce commit 00370df
Showing 4 changed files with 627 additions and 5 deletions.
8 changes: 4 additions & 4 deletions azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
+mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee wurst brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
displayName: Install Anaconda packages
- bash: |
@@ -90,7 +90,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
displayName: Install Anaconda packages
- bash: |
@@ -144,7 +144,7 @@ jobs:
- bash: |
source activate bw2
-mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pip
+mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
displayName: Install Anaconda packages
- bash: |
@@ -198,7 +198,7 @@ jobs:
- script: |
call activate bw2
-conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee brightway25 pytest-azurepipelines">=1.0" pywin32 pip
+conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pywin32 pip
displayName: Install Anaconda packages
- script: |
121 changes: 121 additions & 0 deletions bw2data/backends/base.py
@@ -7,6 +7,7 @@
import sqlite3
import warnings
from collections import defaultdict
+from typing import List, Callable, Optional

import pandas
import pyprind
@@ -886,3 +887,123 @@ def get_uniqueness_key(exchange, fields):
        for exc in lst[-1:0:-1]:
            print("Deleting exchange:", exc)
            exc.delete()

    def nodes_to_dataframe(self, columns: Optional[List[str]] = None, return_sorted: bool = True) -> pandas.DataFrame:
        """Return a pandas DataFrame with all database nodes. Uses the provided node attributes by default, such as name, unit, location.

        By default, returns a DataFrame sorted by name, reference product, location, and unit. Set ``return_sorted`` to ``False`` to skip sorting.

        Specify ``columns`` to get custom columns. You will need to write your own function for more customization; there are endless possibilities here.

        Returns a pandas ``DataFrame``.
        """
        if columns is None:
            # Feels like magic
            df = pandas.DataFrame(self)
        else:
            df = pandas.DataFrame([{field: obj.get(field) for field in columns} for obj in self])
        if return_sorted:
            sort_columns = ["name", "reference product", "location", "unit"]
            df = df.sort_values(by=[column for column in sort_columns if column in df.columns])
        return df

    def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List[Callable]] = None) -> pandas.DataFrame:
        """Return a pandas DataFrame with all database exchanges. Standard DataFrame columns are:

            target_id: int,
            target_database: str,
            target_code: str,
            target_name: Optional[str],
            target_reference_product: Optional[str],
            target_location: Optional[str],
            target_unit: Optional[str],
            target_type: Optional[str],
            source_id: int,
            source_database: str,
            source_code: str,
            source_name: Optional[str],
            source_product: Optional[str],  # Note different label
            source_location: Optional[str],
            source_unit: Optional[str],
            source_categories: Optional[str],  # Tuple concatenated with "::" as in `bw2io`
            edge_amount: float,
            edge_type: str,

        Target is the node consuming the edge; source is the node or flow being consumed. The terms target and source were chosen because they also work well for biosphere edges.

        Args:

        ``categorical`` will turn each string column into a `pandas Categorical Series <https://pandas.pydata.org/docs/reference/api/pandas.Categorical.html>`__. This takes 1-2 extra seconds, but saves around 50% of the memory consumption.

        ``formatters`` is a list of callables that modify each row. These functions must take the following keyword arguments, and use the `Wurst internal data format <https://wurst.readthedocs.io/#internal-data-format>`__:

            * ``node``: The target node, as a dict
            * ``edge``: The edge, including attributes of the source node
            * ``row``: The current row dict being modified

        The functions in ``formatters`` don't need to return anything; they modify ``row`` in place.

        Returns a pandas ``DataFrame``.
        """
        try:
            from wurst import extract_brightway2_databases
        except ImportError:
            raise ImportError("This method requires the `wurst` library.")

        result = []

        for target in extract_brightway2_databases(self.name, add_identifiers=True):
            for edge in target["exchanges"]:
                row = {
                    "target_id": target["id"],
                    "target_database": target["database"],
                    "target_code": target["code"],
                    "target_name": target.get("name"),
                    "target_reference_product": target.get("reference product"),
                    "target_location": target.get("location"),
                    "target_unit": target.get("unit"),
                    "target_type": target.get("type", "process"),
                    "source_id": edge["id"],
                    "source_database": edge["database"],
                    "source_code": edge["code"],
                    "source_name": edge.get("name"),
                    "source_product": edge.get("product"),
                    "source_location": edge.get("location"),
                    "source_unit": edge.get("unit"),
                    "source_categories": "::".join(edge["categories"]) if edge.get("categories") else None,
                    "edge_amount": edge["amount"],
                    "edge_type": edge["type"],
                }
                if formatters is not None:
                    for func in formatters:
                        func(node=target, edge=edge, row=row)
                result.append(row)

        print("Creating DataFrame")
        df = pandas.DataFrame(result)

        if categorical:
            categorical_columns = [
                "target_database",
                "target_name",
                "target_reference_product",
                "target_location",
                "target_unit",
                "target_type",
                "source_database",
                "source_code",
                "source_name",
                "source_product",
                "source_location",
                "source_unit",
                "source_categories",
                "edge_type",
            ]
            print("Compressing DataFrame")
            for column in categorical_columns:
                if column in df.columns:
                    df[column] = df[column].astype("category")

        return df
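
The new ``nodes_to_dataframe`` method can be called on any registered database instance. A minimal usage sketch, not part of this commit; the project and database names below are placeholders:

import bw2data as bd

bd.projects.set_current("my project")  # placeholder project name
db = bd.Database("ecoinvent 3.8")  # placeholder database name

# Default behaviour: all node attributes, sorted by name, reference product, location, and unit
nodes = db.nodes_to_dataframe()

# Custom columns, unsorted
nodes_small = db.nodes_to_dataframe(columns=["name", "location", "unit"], return_sorted=False)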
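``edges_to_dataframe`` needs ``wurst`` installed, hence the pipeline changes above. A hedged sketch of the ``formatters`` hook; ``add_target_key`` and the database name are illustrative, not part of the commit:

import bw2data as bd

db = bd.Database("ecoinvent 3.8")  # placeholder database name

def add_target_key(node, edge, row):
    # Formatters receive the target node, the edge, and the row dict as
    # keyword arguments, and modify ``row`` in place; no return value is needed.
    row["target_key"] = (node["database"], node["code"])

edges = db.edges_to_dataframe(categorical=True, formatters=[add_target_key])
print(edges["edge_type"].value_counts())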

0 comments on commit 00370df
