Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/19.breaking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Required `db_dir` to be passed when initialising [pandas_openscm.db.reader.OpenSCMDBReader][]. This is required to support portable databases
- Renamed `out_column_type` to `out_columns_type` in [pandas_openscm.io.load_timeseries_csv][] for consistency with the rest of the API
- Bumped the minimum supported version of [filelock](https://py-filelock.readthedocs.io/) to 3.12.3, as this is the first version that handles automatic creation of directories for the lock
3 changes: 3 additions & 0 deletions changelog/19.feature.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
- Made the database portable by only storing relative paths in the file map. This allows the database to be converted to an archive with [pandas_openscm.db.OpenSCMDB.to_gzipped_tar_archive][] and then unpacked elsewhere with [pandas_openscm.db.OpenSCMDB.from_gzipped_tar_archive][]
- Added [pandas_openscm.db.path_handling][] to clarify how we handle paths internally to support portability
- Added support for specifying the name of the output columns via [pandas_openscm.db.OpenSCMDB.load][], [pandas_openscm.db.reader.OpenSCMDBReader.load][] and [pandas_openscm.io.load_timeseries_csv][]
2 changes: 2 additions & 0 deletions changelog/19.improvement.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
- Added the explicit [pandas_openscm.db.backends][] module to handle the backends we support more clearly
- Added [pandas_openscm.db.backends.DataBackendOptions.guess_backend][] and [pandas_openscm.db.backends.IndexBackendOptions.guess_backend][] to allow for more convenient inference of the backend to use with different files
1 change: 1 addition & 0 deletions changelog/19.trivial.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Moved `DATA_BACKENDS` and `INDEX_BACKENDS` to [pandas_openscm.db.backends][], out of the top-level [pandas_openscm.db][] module
77 changes: 77 additions & 0 deletions docs/how-to-guides/how-to-use-openscmdb.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
import concurrent.futures
import contextlib
import itertools
import tarfile
import tempfile
import traceback
from functools import partial
Expand Down Expand Up @@ -200,6 +201,82 @@
# %% [markdown]
# ## Advanced topics

# %% [markdown]
# ### Sharing the database
#
# If you need to share a database,
# you can pack it into a gzipped tar archive
# and pass that archive to someone else.

# %% [markdown]
# We start by putting some data in a database.

# %%
top_level_dir = Path(tempfile.mkdtemp())

# %%
db_start = OpenSCMDB(
    db_dir=top_level_dir / "start",
    backend_data=DATA_BACKENDS.get_instance("csv"),
    backend_index=INDEX_BACKENDS.get_instance("csv"),
)
db_start.save(df_timeseries_like)

# %% [markdown]
# Then we create a gzipped tar archive of our database.

# %%
gzipped = top_level_dir / "db_archive.tar.gz"
db_start.to_gzipped_tar_archive(gzipped)

# %% [markdown]
# To demonstrate that the archive does not rely on the original data,
# we delete the original database.

# %%
db_start.delete()

# %% [markdown]
# We can inspect the tar file's contents.

# %%
with tarfile.open(gzipped) as tar:
    print(f"{tar.getmembers()=}")

# %% [markdown]
# A new database can be initialised from the gzipped tar archive.

# %%
db_moved = OpenSCMDB.from_gzipped_tar_archive(
    gzipped,
    db_dir=top_level_dir / "moved",
)
db_moved

# %% [markdown]
# As above, we remove the archive
# to demonstrate that there is no reliance on it
# for the following operations.

# %%
gzipped.unlink()

# %% [markdown]
# You can then use this database like normal,
# but now from the new location
# (whether on your machine or someone else's).

# %%
db_moved.load()

# %%
db_moved.load(pix.isin(unit="J"))

# %% [markdown]
# We clean up the files before moving on to the next demonstration.

# %%
db_moved.delete()

# %% [markdown]
# ### Grouping data
#
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ Issues = "https://github.com/openscm/pandas-openscm/issues"

[project.optional-dependencies]
db = [
"filelock>=3.0.0",
"filelock>=3.12.3",
]
db-full = [
"netcdf4>=1.7.2",
Expand Down
122 changes: 1 addition & 121 deletions src/pandas_openscm/db/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,134 +4,14 @@

from __future__ import annotations

from attrs import frozen

from pandas_openscm.db.backends import DATA_BACKENDS, INDEX_BACKENDS
from pandas_openscm.db.csv import CSVDataBackend, CSVIndexBackend
from pandas_openscm.db.feather import FeatherDataBackend, FeatherIndexBackend
from pandas_openscm.db.in_memory import InMemoryDataBackend, InMemoryIndexBackend
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
from pandas_openscm.db.netcdf import netCDFDataBackend, netCDFIndexBackend
from pandas_openscm.db.openscm_db import AlreadyInDBError, EmptyDBError, OpenSCMDB


@frozen
class DataBackendOptions:
    """
    A collection of data back-end options

    Each option pairs a short name with the data back-end class
    that is instantiated when that short name is requested.
    """

    options: tuple[  # type hint doesn't work properly, but ok
        tuple[str, type[OpenSCMDBDataBackend]], ...
    ]
    """
    Options

    The first element of each option is the option's short name.
    The second element is the class that matches that option.
    """

    def get_instance(self, option: str) -> OpenSCMDBDataBackend:
        """
        Get an instance of one of the options

        Parameters
        ----------
        option
            Short name of the option for which to get a data back-end instance

        Returns
        -------
        :
            Initialised instance, created by calling the matching class
            with no arguments

        Raises
        ------
        KeyError
            The option is not supported.
            The error message lists the supported short names.
        """
        for short_name, option_cls in self.options:
            if short_name == option:
                return option_cls()

        # Report the short names rather than the classes:
        # the short names are what callers have to pass as `option`.
        msg = (
            f"{option=} is not supported. "
            f"Available options: {tuple(v[0] for v in self.options)}"
        )
        raise KeyError(msg)


# Registry of the inbuilt data back-ends.
# Each entry is a (short name, back-end class) pair.
DATA_BACKENDS = DataBackendOptions(
    (  # type: ignore # using class with protocol doesn't work properly
        ("csv", CSVDataBackend),
        ("feather", FeatherDataBackend),
        ("in_memory", InMemoryDataBackend),
        ("netCDF", netCDFDataBackend),
        # Other options to consider:
        #
        # - pretty netCDF, where we try and save the data with dimensions where possible
        #
        # - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
        # - SQLite
    )
)
"""Inbuilt data back-ends"""


@frozen
class IndexBackendOptions:
    """
    A collection of index back-end options

    Each option pairs a short name with the index back-end class
    that is instantiated when that short name is requested.
    """

    options: tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...]
    """
    Options

    The first element of each option is the option's short name.
    The second element is the class that matches that option.
    """

    def get_instance(self, option: str) -> OpenSCMDBIndexBackend:
        """
        Get an instance of one of the options

        Parameters
        ----------
        option
            Short name of the option for which to get an index back-end instance

        Returns
        -------
        :
            Initialised instance, created by calling the matching class
            with no arguments

        Raises
        ------
        KeyError
            The option is not supported.
            The error message lists the supported short names.
        """
        for short_name, option_cls in self.options:
            if short_name == option:
                return option_cls()

        # Report the short names rather than the classes:
        # the short names are what callers have to pass as `option`.
        msg = (
            f"{option=} is not supported. "
            f"Available options: {tuple(v[0] for v in self.options)}"
        )
        raise KeyError(msg)


# Registry of the inbuilt index back-ends.
# Each entry is a (short name, back-end class) pair.
INDEX_BACKENDS = IndexBackendOptions(
    (  # type: ignore # using class with protocol doesn't work properly
        ("csv", CSVIndexBackend),
        ("feather", FeatherIndexBackend),
        ("in_memory", InMemoryIndexBackend),
        ("netCDF", netCDFIndexBackend),
        # Other options to consider:
        #
        # - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
        # - SQLite
    )
)
"""Inbuilt index back-ends"""


__all__ = [
"DATA_BACKENDS",
"INDEX_BACKENDS",
Expand Down
Loading