Move wurst extraction functions to bw2data

brightway-lca · Aug 17, 2022 · 55f1071 · 55f1071
1 parent 00370df
commit 55f1071
Show file tree

Hide file tree

Showing 9 changed files with 388 additions and 9 deletions.
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
@@ -29,7 +29,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee wurst brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
+      mamba install --yes --quiet -c defaults -c conda-forge -c cmutel --name bw2 bw_processing python=$PYTHON_VERSION peewee tqdm brightway25 pytest pytest-azurepipelines">=1.0" pytest-cov pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -90,7 +90,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
+      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee tqdm brightway25 pytest-azurepipelines">=1.0" pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -144,7 +144,7 @@ jobs:
 
   - bash: |
       source activate bw2
-      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pip
+      mamba install --yes -c defaults -c conda-forge -c cmutel --name bw2 python=$PYTHON_VERSION bw_processing pytest peewee tqdm brightway25 pytest-azurepipelines">=1.0" pip
     displayName: Install Anaconda packages
 
   - bash: |
@@ -198,7 +198,7 @@ jobs:
 
   - script: |
       call activate bw2
-      conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee wurst brightway25 pytest-azurepipelines">=1.0" pywin32 pip
+      conda install --yes -c defaults -c conda-forge -c cmutel -c haasad --name bw2 python=%PYTHON_VERSION% bw_processing pytest peewee tqdm brightway25 pytest-azurepipelines">=1.0" pywin32 pip
     displayName: Install Anaconda packages
 
   - script: |

diff --git a/bw2data/__init__.py b/bw2data/__init__.py
@@ -7,6 +7,7 @@
     "databases",
     "DataStore",
     "Edge",
+    "extract_brightway_databases",
     "get_activity",
     "get_node",
     "get_id",
@@ -72,6 +73,7 @@
 from .weighting_normalization import Weighting, Normalization
 from .backends import convert_backend, get_id, Node, Edge
 from .compat import prepare_lca_inputs, Mapping
+from .backends.wurst_extraction import extract_brightway_databases
 
 mapping = Mapping()
 

diff --git a/bw2data/backends/base.py b/bw2data/backends/base.py
@@ -947,14 +947,11 @@ def edges_to_dataframe(self, categorical: bool = True, formatters: Optional[List
         Returns a pandas ``DataFrame``.
 
         """
-        try:
-            from wurst import extract_brightway2_databases
-        except ImportError:
-            raise ImportError("This method requires the `wurst` library.")
+        from .wurst_extraction import extract_brightway_databases
 
         result = []
 
-        for target in extract_brightway2_databases(self.name, add_identifiers=True):
+        for target in extract_brightway_databases(self.name, add_identifiers=True):
             for edge in target["exchanges"]:
                 row = {
                     "target_id": target["id"],

diff --git a/bw2data/backends/wurst_extraction.py b/bw2data/backends/wurst_extraction.py
@@ -0,0 +1,171 @@
+from tqdm import tqdm
+import copy
+
+from ..database import DatabaseChooser
+from . import SQLiteBackend, ActivityDataset, ExchangeDataset
+
+
+def _list_or_dict(obj):
+    """Iterate over records stored either as a mapping or as a sequence.
+
+    For a ``dict``, every value is deep-copied, the corresponding key is
+    stored on the copy under ``"name"``, and the copy is yielded, so the
+    source mapping is left untouched. Any other iterable has its items
+    yielded unchanged (no copy is made).
+    """
+    if not isinstance(obj, dict):
+        yield from obj
+        return
+    for label, record in obj.items():
+        named = copy.deepcopy(record)
+        named["name"] = label
+        yield named
+
+
+def extract_activity(proxy, add_identifiers=False):
+    """Get data in Wurst internal format for an ``ActivityDataset``.
+
+    ``proxy`` must be an ``ActivityDataset`` instance (checked with
+    ``assert``). Its ``location``, ``database``, ``code``, ``name`` and
+    ``product`` columns are copied directly; ``classifications``, ``comment``,
+    ``unit`` and ``parameters`` are read from ``proxy.data`` with empty
+    defaults.
+
+    Parameters are exposed twice: ``"parameters"`` maps each parameter name to
+    its ``amount`` and ``"parameters full"`` lists the complete parameter
+    records. ``"exchanges"`` starts out as an empty list.
+
+    If ``add_identifiers`` is true, the row ``id`` is included as ``"id"``.
+
+    Raises ``KeyError`` if a parameter record has no ``name`` or ``amount``.
+    """
+    assert isinstance(proxy, ActivityDataset)
+
+    # Normalise the parameters once and derive both views from the same
+    # records, instead of walking (and deep-copying) the raw data twice.
+    parameters = list(_list_or_dict(proxy.data.get("parameters", [])))
+
+    obj = {
+        "classifications": proxy.data.get("classifications", []),
+        "comment": proxy.data.get("comment", ""),
+        "location": proxy.location,
+        "database": proxy.database,
+        "code": proxy.code,
+        "name": proxy.name,
+        "reference product": proxy.product,
+        "unit": proxy.data.get("unit", ""),
+        "exchanges": [],
+        "parameters": {param["name"]: param["amount"] for param in parameters},
+        "parameters full": parameters,
+    }
+    if add_identifiers:
+        obj["id"] = proxy.id
+    return obj
+
+
+def extract_exchange(proxy, add_properties=False):
+    """Get data in Wurst internal format for an ``ExchangeDataset``.
+
+    Copies the uncertainty-related fields present in ``proxy.data`` and adds
+    ``type``, ``production volume``, ``input`` and ``output``; the last two are
+    ``(database, code)`` tuples built from the row's columns. When the stored
+    data has no ``uncertainty type``, it is set to ``0`` and ``loc`` is set to
+    the exchange ``amount``.
+
+    If ``add_properties`` is true, ``proxy.data["properties"]`` (default
+    ``{}``) is included under ``"properties"``.
+
+    Raises ``AssertionError`` if ``proxy`` is not an ``ExchangeDataset`` or if
+    its data has no ``amount``.
+    """
+    assert isinstance(proxy, ExchangeDataset)
+
+    raw = proxy.data
+    data = {}
+    for field in (
+        "uncertainty type",
+        "loc",
+        "scale",
+        "shape",
+        "minimum",
+        "maximum",
+        "amount",
+        "pedigree",
+    ):
+        if field in raw:
+            data[field] = raw[field]
+
+    assert "amount" in data, "Exchange has no `amount` field"
+    if "uncertainty type" not in data:
+        data.update({"uncertainty type": 0, "loc": data["amount"]})
+
+    data.update(
+        {
+            "type": proxy.type,
+            "production volume": raw.get("production volume"),
+            "input": (proxy.input_database, proxy.input_code),
+            "output": (proxy.output_database, proxy.output_code),
+        }
+    )
+    if add_properties:
+        data["properties"] = raw.get("properties", {})
+    return data
+
+
+def add_exchanges_to_consumers(activities, exchange_qs, add_properties=False, add_identifiers=False):
+    """Retrieve exchanges from database, and add to activities.
+
+    Every row of ``exchange_qs`` is converted with ``extract_exchange`` and
+    appended to the ``"exchanges"`` list of the activity whose
+    ``(database, code)`` equals the exchange's ``output``. The ``output`` key
+    is removed from the exchange in the process. A ``KeyError`` is raised if
+    no such activity is present in ``activities``.
+
+    Assumes that activities are single output, and that the exchange code is
+    the same as the activity code. This assumption is valid for ecoinvent 3.3
+    cutoff imported into Brightway2.
+
+    ``add_properties`` is forwarded to ``extract_exchange``.
+    ``add_identifiers`` is accepted but currently unused by this function.
+
+    Returns ``activities``, whose items are modified in place."""
+    lookup = {(o["database"], o["code"]): o for o in activities}
+
+    with tqdm(total=exchange_qs.count()) as pbar:
+        for row in exchange_qs:
+            exc = extract_exchange(row, add_properties=add_properties)
+            output = tuple(exc.pop("output"))
+            lookup[output]["exchanges"].append(exc)
+            pbar.update(1)
+    return activities
+
+
+def add_input_info_for_indigenous_exchanges(activities, names, add_identifiers=False):
+    """Add details on exchange inputs if these activities are already available.
+
+    For every exchange whose ``input`` database is in ``names``, the
+    ``product``, ``name``, ``unit``, ``location`` and ``database`` of the
+    referenced activity are copied onto the exchange (plus ``categories`` for
+    biosphere exchanges), and the ``input`` key is then removed. Exchanges
+    without an ``input`` key, or pointing to other databases, are skipped.
+
+    If ``add_identifiers`` is true, the referenced activity's ``id`` and
+    ``code`` are copied as well; the activities must then carry an ``"id"``.
+
+    Modifies ``activities`` in place and returns ``None``. Raises ``KeyError``
+    if an exchange references an activity that is not in ``activities``.
+    """
+    names = set(names)
+    lookup = {(o["database"], o["code"]): o for o in activities}
+
+    for ds in activities:
+        for exc in ds["exchanges"]:
+            if "input" not in exc or exc["input"][0] not in names:
+                continue
+            # Coerce to a tuple so that list-style keys can be looked up too,
+            # as is already done for ``output`` in add_exchanges_to_consumers.
+            obj = lookup[tuple(exc["input"])]
+            exc["product"] = obj.get("reference product")
+            exc["name"] = obj.get("name")
+            exc["unit"] = obj.get("unit")
+            exc["location"] = obj.get("location")
+            exc["database"] = obj.get("database")
+            if add_identifiers:
+                exc["id"] = obj["id"]
+                exc["code"] = obj["code"]
+            if exc["type"] == "biosphere":
+                exc["categories"] = obj.get("categories")
+            exc.pop("input")
+
+
+def add_input_info_for_external_exchanges(activities, names, add_identifiers=False):
+    """Add details on exchange inputs from other databases.
+
+    For every exchange that still has an ``input`` key whose database is not
+    in ``names``, the referenced ``ActivityDataset`` row is fetched (each row
+    is looked up once and then cached) and its ``name``, ``product``,
+    ``unit``, ``location`` and ``database`` are copied onto the exchange, plus
+    ``categories`` for biosphere exchanges. The ``input`` key is kept.
+
+    If ``add_identifiers`` is true, the row's ``id`` and ``code`` are copied
+    as well. Modifies ``activities`` in place and returns ``None``.
+    """
+    names = set(names)
+    cache = {}
+
+    for ds in tqdm(activities):
+        for exc in ds["exchanges"]:
+            if "input" not in exc:
+                continue
+            key = exc["input"]
+            if key[0] in names:
+                continue
+
+            source = cache.get(key)
+            if source is None:
+                source = cache[key] = ActivityDataset.get(
+                    ActivityDataset.database == key[0],
+                    ActivityDataset.code == key[1],
+                )
+
+            exc.update(
+                {
+                    "name": source.name,
+                    "product": source.product,
+                    "unit": source.data.get("unit"),
+                    "location": source.location,
+                    "database": source.database,
+                }
+            )
+            if add_identifiers:
+                exc.update({"id": source.id, "code": source.code})
+            if exc["type"] == "biosphere":
+                exc["categories"] = source.data.get("categories")
+
+
+def extract_brightway_databases(database_names, add_properties=False, add_identifiers=False):
+    """Extract Brightway2 SQLiteBackend databases to the Wurst internal format.
+
+    ``database_names`` is a single database name, or a list, tuple or set of
+    database names. You should already be in the correct project.
+
+    ``add_properties`` adds the stored exchange properties to each exchange;
+    ``add_identifiers`` adds the ``id`` of each dataset, and the ``id`` and
+    ``code`` of each exchange input.
+
+    Raises ``AssertionError`` if ``database_names`` has another type, or if
+    one of the named databases is not a ``SQLiteBackend``.
+
+    Returns a list of dataset documents."""
+    if isinstance(database_names, str):
+        database_names = [database_names]
+    assert isinstance(
+        database_names, (list, tuple, set)
+    ), "Must pass list of database names"
+
+    backends = [DatabaseChooser(name) for name in database_names]
+    assert all(
+        isinstance(backend, SQLiteBackend) for backend in backends
+    ), "Wrong type of database object (must be SQLiteBackend)"
+
+    # Build the activity and exchange queries for the selected databases
+    activity_qs = ActivityDataset.select().where(
+        ActivityDataset.database << database_names
+    )
+    exchange_qs = ExchangeDataset.select().where(
+        ExchangeDataset.output_database << database_names
+    )
+
+    # Step 1: one document per activity
+    print("Getting activity data")
+    activities = []
+    for row in tqdm(activity_qs):
+        activities.append(extract_activity(row, add_identifiers=add_identifiers))
+
+    # Step 2: attach every exchange to the activity that consumes it
+    print("Adding exchange data to activities")
+    add_exchanges_to_consumers(activities, exchange_qs, add_properties)
+
+    # Step 3: describe exchange inputs, first from the extracted databases,
+    # then from any other database
+    print("Filling out exchange data")
+    add_input_info_for_indigenous_exchanges(
+        activities, database_names, add_identifiers=add_identifiers
+    )
+    add_input_info_for_external_exchanges(
+        activities, database_names, add_identifiers=add_identifiers
+    )
+    return activities
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -15,6 +15,7 @@ python-coveralls
 requests>=1.1.0
 scipy
 stats_arrays
+tqdm
 voluptuous
 whoosh
 wrapt
diff --git a/requirements.txt b/requirements.txt
@@ -11,6 +11,7 @@ pyprind
 requests>=1.1.0
 scipy
 stats_arrays
+tqdm
 voluptuous
 whoosh
 wrapt
diff --git a/setup.py b/setup.py
@@ -41,6 +41,7 @@
         "requests>=1.1.0",
         "scipy",
         "stats_arrays",
+        "tqdm",
         "voluptuous",
         "whoosh",
         "wrapt",

diff --git a/tests/wurst_extraction/extraction.py b/tests/wurst_extraction/extraction.py
@@ -0,0 +1,124 @@
+from fixtures import test_bw2_database
+
+from bw2data import extract_brightway_databases
+from bw2data.tests import bw2test
+import pytest
+
+
+def test_extraction(test_bw2_database):
+    """The ``food`` database is extracted to the expected dataset documents."""
+    expected = [
+        {
+            "classifications": [42],
+            "code": "1",
+            "comment": "Yep",
+            "database": "food",
+            "exchanges": [
+                {
+                    "name": "dinner",
+                    "amount": 0.5,
+                    "database": "food",
+                    "loc": 0.5,
+                    "location": "CH",
+                    "product": None,
+                    "production volume": 13,
+                    "type": "technosphere",
+                    "uncertainty type": 0,
+                    "unit": "kg",
+                },
+                {
+                    "name": "an emission",
+                    "amount": 0.05,
+                    "categories": ["things"],
+                    "input": ("biosphere", "1"),
+                    "database": "biosphere",
+                    "location": None,
+                    "product": "find me!",
+                    "production volume": None,
+                    "type": "biosphere",
+                    "uncertainty type": 4,
+                    "unit": "kg",
+                },
+            ],
+            "location": "CA",
+            "name": "lunch",
+            "reference product": "stuff",
+            "unit": "kg",
+            "parameters": {"losses_gross_net": 0.01},
+            "parameters full": [{"amount": 0.01, "name": "losses_gross_net"}],
+        },
+        {
+            "classifications": [],
+            "code": "2",
+            "comment": "",
+            "database": "food",
+            "exchanges": [
+                {
+                    "name": "lunch",
+                    "amount": 0.25,
+                    "location": "CA",
+                    "product": "stuff",
+                    "database": "food",
+                    "production volume": None,
+                    "type": "technosphere",
+                    "uncertainty type": 0,
+                    "unit": "kg",
+                },
+                {
+                    "name": "another emission",
+                    "amount": 0.15,
+                    "categories": ["things"],
+                    "input": ("biosphere", "2"),
+                    "database": "biosphere",
+                    "location": None,
+                    "product": None,
+                    "production volume": None,
+                    "type": "biosphere",
+                    "uncertainty type": 0,
+                    "unit": "kg",
+                },
+            ],
+            "location": "CH",
+            "name": "dinner",
+            "reference product": None,
+            "unit": "kg",
+            "parameters": {"rara": 13},
+            "parameters full": [
+                {"name": "rara", "amount": 13, "something": "else"}
+            ],
+        },
+    ]
+
+    def by_code(dataset):
+        return dataset["code"]
+
+    # Compare independently of the order in which datasets are returned
+    extracted = extract_brightway_databases("food")
+    assert sorted(extracted, key=by_code) == sorted(expected, key=by_code)
+
+
+@bw2test
+def test_extraction_missing_database():
+    """Extracting ``biosphere3`` in this test's project must raise ``AssertionError``."""
+    # Call the function bare: wrapping it in ``assert`` would also raise
+    # ``AssertionError`` for an empty (falsy) result, so the test would pass
+    # even if the extraction itself never complained.
+    with pytest.raises(AssertionError):
+        extract_brightway_databases("biosphere3")
+
+
+def test_extraction_input_formats(test_bw2_database):
+    """A name, list, tuple or set of names is accepted; a dict is rejected."""
+    assert extract_brightway_databases("food")
+    assert extract_brightway_databases(["food"])
+    assert extract_brightway_databases(("food",))
+    assert extract_brightway_databases({"food"})
+    # No ``assert`` around the call here: a falsy return value must not be
+    # able to satisfy ``pytest.raises(AssertionError)`` on its own.
+    with pytest.raises(AssertionError):
+        extract_brightway_databases({"food": None})
+
+
+def test_extraction_with_properties(test_bw2_database):
+    """Exchange ``properties`` are only included when ``add_properties`` is set."""
+    # The fixture is requested explicitly so that the ``food`` database used
+    # below does not depend on state left behind by another test.
+    data = extract_brightway_databases("food")
+    assert all("properties" not in exc for ds in data for exc in ds["exchanges"])
+    data = extract_brightway_databases("food", add_properties=True)
+    assert all("properties" in exc for ds in data for exc in ds["exchanges"])
+
+
+def test_extraction_with_identifiers(test_bw2_database):
+    """Identifiers are only included when ``add_identifiers`` is set."""
+    # The fixture is requested explicitly so that the ``food`` database used
+    # below does not depend on state left behind by another test.
+    data = extract_brightway_databases("food")
+    # By default neither datasets nor exchanges carry an ``id``
+    assert all("id" not in ds for ds in data)
+    assert all("id" not in exc for ds in data for exc in ds["exchanges"])
+    data = extract_brightway_databases("food", add_identifiers=True)
+    assert all("id" in ds for ds in data)
+    assert all("id" in exc for ds in data for exc in ds["exchanges"])
+    assert all("code" in exc for ds in data for exc in ds["exchanges"])