Skip to content

Commit d1ceaf9

Browse files
committed
Pass unit test
1 parent 95f53d1 commit d1ceaf9

File tree

5 files changed

+149
-16
lines changed

5 files changed

+149
-16
lines changed

src/pandas_openscm/db/loading.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ def load_data( # noqa: PLR0913
3333
backend_data: OpenSCMDBDataBackend,
3434
db_index: pd.DataFrame,
3535
db_file_map: pd.Series[Path], # type: ignore # pandas type hints confused about what they support
36+
db_dir: Path,
3637
selector: pd.Index[Any] | pd.MultiIndex | pix.selectors.Selector | None = None,
3738
out_columns_type: type | None = None,
3839
parallel_op_config: ParallelOpConfig | None = None,
@@ -53,6 +54,9 @@ def load_data( # noqa: PLR0913
5354
db_file_map
5455
File map of the database from which to load
5556
57+
db_dir
58+
The directory in which the database lives
59+
5660
selector
5761
Selector to use to choose the data to load
5862
@@ -97,7 +101,7 @@ def load_data( # noqa: PLR0913
97101
else:
98102
index_to_load = mi_loc(db_index, selector)
99103

100-
files_to_load = (Path(v) for v in db_file_map[index_to_load["file_id"].unique()])
104+
files_to_load = (db_dir / v for v in db_file_map[index_to_load["file_id"].unique()])
101105
loaded_l = load_data_files(
102106
files_to_load=files_to_load,
103107
backend_data=backend_data,

src/pandas_openscm/db/openscm_db.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
load_db_index,
2222
load_db_metadata,
2323
)
24+
from pandas_openscm.db.path_handling import DBPath
2425
from pandas_openscm.db.reader import OpenSCMDBReader
2526
from pandas_openscm.db.rewriting import make_move_plan, rewrite_files
2627
from pandas_openscm.db.saving import save_data
@@ -297,6 +298,30 @@ def from_gzipped_tar_archive(
297298
backend_data: OpenSCMDBDataBackend | None = None,
298299
backend_index: OpenSCMDBIndexBackend | None = None,
299300
) -> OpenSCMDB:
301+
"""
302+
Initialise from a gzipped tar archive
303+
304+
This also unpacks the files to disk
305+
306+
Parameters
307+
----------
308+
tar_archive
309+
Tar archive from which to initialise
310+
311+
db_dir
312+
Directory in which to unpack the database
313+
314+
backend_data
315+
Backend to use for handling the data
316+
317+
backend_index
318+
Backend to use for handling the index
319+
320+
Returns
321+
-------
322+
:
323+
Initialised database
324+
"""
300325
with tarfile.open(tar_archive, "r") as tar:
301326
for member in tar.getmembers():
302327
if not member.isreg():
@@ -317,7 +342,7 @@ def from_gzipped_tar_archive(
317342

318343
return res
319344

320-
def get_new_data_file_path(self, file_id: int) -> Path:
345+
def get_new_data_file_path(self, file_id: int) -> DBPath:
321346
"""
322347
Get the path in which to write a new data file
323348
@@ -329,7 +354,7 @@ def get_new_data_file_path(self, file_id: int) -> Path:
329354
Returns
330355
-------
331356
:
332-
File in which to write the new data
357+
Information about the path in which to write the new data
333358
334359
Raises
335360
------
@@ -341,7 +366,7 @@ def get_new_data_file_path(self, file_id: int) -> Path:
341366
if file_path.exists():
342367
raise FileExistsError(file_path)
343368

344-
return file_path
369+
return DBPath.from_abs_path_and_db_dir(abs=file_path, db_dir=self.db_dir)
345370

346371
def load( # noqa: PLR0913
347372
self,
@@ -421,6 +446,7 @@ def load( # noqa: PLR0913
421446
backend_data=self.backend_data,
422447
db_index=index,
423448
db_file_map=file_map,
449+
db_dir=self.db_dir,
424450
selector=selector,
425451
out_columns_type=out_columns_type,
426452
parallel_op_config=parallel_op_config,
@@ -738,6 +764,24 @@ def save( # noqa: PLR0913
738764
)
739765

740766
def to_gzipped_tar_archive(self, out_file: Path, mode: str = "w:gz") -> Path:
767+
"""
768+
Convert to a gzipped tar archive
769+
770+
Parameters
771+
----------
772+
out_file
773+
File in which to write the output
774+
775+
mode
776+
Mode to use to open `out_file`
777+
778+
Returns
779+
-------
780+
:
781+
Path to the gzipped tar archive
782+
783+
This is the same as `out_file`, but is returned for convenience.
784+
"""
741785
with tarfile.open(out_file, mode) as tar:
742786
tar.add(self.db_dir, arcname="db")
743787

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
"""
2+
Functionality for handling paths
3+
4+
In order to make our databases portable,
5+
we need to be a bit smarter than just using raw paths.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from pathlib import Path
11+
from typing import Any
12+
13+
import attr
14+
from attrs import define, field
15+
16+
17+
@define
class DBPath:
    """
    Database-related path

    Carries the information required to write paths with certainty
    and keep the database portable.
    """

    abs: Path
    """The absolute path for the file"""

    rel_db: Path = field()
    """The path relative to the database's directory"""

    @rel_db.validator
    def rel_db_validator(self, attribute: attr.Attribute[Any], value: Path) -> None:
        """
        Validate the value of `rel_db`

        Parameters
        ----------
        attribute
            Attribute being set

        value
            Value to use

        Raises
        ------
        AssertionError
            `value` is not within `self.abs`
        """
        # Compare path components rather than raw strings:
        # a plain string-suffix check (`str(self.abs).endswith(str(value))`)
        # would wrongly accept partial-segment matches,
        # e.g. abs=/data/abc.txt and value=bc.txt.
        n_parts = len(value.parts)
        if n_parts == 0 or self.abs.parts[-n_parts:] != value.parts:
            msg = f"{value} for {attribute.name} is not within {self.abs=}"
            raise AssertionError(msg)

    @classmethod
    def from_abs_path_and_db_dir(cls, abs: Path, db_dir: Path) -> DBPath:
        """
        Initialise from an absolute path and a database directory

        Parameters
        ----------
        abs
            Absolute path

        db_dir
            Database directory

        Returns
        -------
        :
            Initialised `DBPath`

        Raises
        ------
        ValueError
            `abs` is not inside `db_dir`
            (raised by :meth:`pathlib.PurePath.relative_to`)
        """
        # relative_to raises ValueError if abs is not under db_dir,
        # so the resulting rel_db always satisfies the validator.
        return cls(abs=abs, rel_db=abs.relative_to(db_dir))

src/pandas_openscm/db/saving.py

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from attrs import define
1616

1717
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
18+
from pandas_openscm.db.path_handling import DBPath
1819
from pandas_openscm.index_manipulation import (
1920
unify_index_levels_check_index_types,
2021
)
@@ -63,7 +64,7 @@ def save_data( # noqa: PLR0913
6364
data: pd.DataFrame,
6465
*,
6566
backend_data: OpenSCMDBDataBackend,
66-
get_new_data_file_path: Callable[[int], Path],
67+
get_new_data_file_path: Callable[[int], DBPath],
6768
backend_index: OpenSCMDBIndexBackend,
6869
index_file: Path,
6970
file_map_file: Path,
@@ -84,8 +85,20 @@ def save_data( # noqa: PLR0913
8485
data
8586
Data to save
8687
87-
db
88-
Database in which to save the data
88+
backend_data
89+
Backend to use to save the data
90+
91+
get_new_data_file_path
92+
Callable which, given an integer, returns the path info for the new data file
93+
94+
backend_index
95+
Backend to use to save the index
96+
97+
index_file
98+
File in which to save the index
99+
100+
file_map_file
101+
File in which to save the file map
89102
90103
index_non_data
91104
Index that is already in the database but isn't related to data.
@@ -94,7 +107,7 @@ def save_data( # noqa: PLR0913
94107
before we write the database's index.
95108
96109
file_map_non_data
97-
File map that is already in the database but isn't related to data.
110+
File map that is already in the database but isn't related to `data`.
98111
99112
If supplied, this is combined with the file map generated for `data`
100113
before we write the database's file map.
@@ -179,9 +192,9 @@ def save_data( # noqa: PLR0913
179192
for increment, (_, df) in enumerate(grouper):
180193
file_id = min_file_id + increment
181194

182-
new_file_path = get_new_data_file_path(file_id)
195+
new_db_path = get_new_data_file_path(file_id)
183196

184-
file_map_out.loc[file_id] = new_file_path # type: ignore # pandas types confused about what they support
197+
file_map_out.loc[file_id] = new_db_path.rel_db # type: ignore # pandas types confused about what they support
185198
if index_non_data_unified_index is None:
186199
df_index_unified = df.index
187200
else:
@@ -202,7 +215,7 @@ def save_data( # noqa: PLR0913
202215
info=df,
203216
info_kind=DBFileType.DATA,
204217
backend=backend_data,
205-
save_path=new_file_path,
218+
save_path=new_db_path.abs,
206219
)
207220
)
208221

tests/integration/database/test_integration_database_portability.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88
import pandas as pd
99
import pytest
1010

11-
from pandas_openscm.db import CSVDataBackend, CSVIndexBackend, OpenSCMDB
11+
from pandas_openscm.db import FeatherDataBackend, FeatherIndexBackend, OpenSCMDB
1212
from pandas_openscm.testing import assert_frame_alike
1313

1414

1515
@pytest.mark.parametrize(
1616
"backend_data_for_class_method, backend_index_for_class_method",
1717
(
1818
pytest.param(
19-
CSVDataBackend(),
20-
CSVIndexBackend(),
19+
FeatherDataBackend(),
20+
FeatherIndexBackend(),
2121
id="provided",
2222
),
2323
pytest.param(
@@ -39,8 +39,8 @@ def test_move_db(
3939

4040
db = OpenSCMDB(
4141
db_dir=initial_db_dir,
42-
backend_data=CSVDataBackend(),
43-
backend_index=CSVIndexBackend(),
42+
backend_data=FeatherDataBackend(),
43+
backend_index=FeatherIndexBackend(),
4444
)
4545

4646
df_timeseries_like = pd.DataFrame(

0 commit comments

Comments
 (0)