Skip to content

Commit d467417

Browse files
authored
Merge pull request #19 from openscm/portable-db
Make database portable
2 parents bf65fb5 + 7bd7aaa commit d467417

23 files changed

+826
-169
lines changed

changelog/19.breaking.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- Required `db_dir` to be passed when initialising [pandas_openscm.db.reader.OpenSCMDBReader][]. This is required to support portable databases
2+
- Renamed `out_column_type` to `out_columns_type` in [pandas_openscm.io.load_timeseries_csv][] for consistency with the rest of the API
3+
- Bumped the minimum supported version of [filelock](https://py-filelock.readthedocs.io/) to 3.12.3, as only this version handles automatic creation of directories for the lock

changelog/19.feature.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
- Made the database portable by only storing relative paths in the file map. This allows the database to be converted to an archive with [pandas_openscm.db.OpenSCMDB.to_gzipped_tar_archive][] and then unpacked elsewhere with [pandas_openscm.db.OpenSCMDB.from_gzipped_tar_archive][]
2+
- Added [pandas_openscm.db.path_handling][] to clarify how we handle paths internally to support portability
3+
- Added support for specifying the name of the output columns via [pandas_openscm.db.OpenSCMDB.load][], [pandas_openscm.db.reader.OpenSCMDBReader.load][] and [pandas_openscm.io.load_timeseries_csv][]

changelog/19.improvement.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
- Added the explicit [pandas_openscm.db.backends][] module to handle the backends we support more clearly
2+
- Added [pandas_openscm.db.backends.DataBackendOptions.guess_backend][] and [pandas_openscm.db.backends.IndexBackendOptions.guess_backend][] to allow for more convenient inference of the backend to use with different files

changelog/19.trivial.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Moved DATA_BACKENDS and INDEX_BACKENDS to [pandas_openscm.db.backends][], out of the top level [pandas_openscm.db][] module

docs/how-to-guides/how-to-use-openscmdb.py

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
import concurrent.futures
2727
import contextlib
2828
import itertools
29+
import tarfile
2930
import tempfile
3031
import traceback
3132
from functools import partial
@@ -200,6 +201,82 @@
200201
# %% [markdown]
201202
# ## Advanced topics
202203

204+
# %% [markdown]
205+
# ### Sharing the database
206+
#
207+
# If you need to share a database,
208+
# you can zip it and pass it to someone else.
209+
210+
# %% [markdown]
211+
# We start by putting some data in a database.
212+
213+
# %%
214+
top_level_dir = Path(tempfile.mkdtemp())
215+
216+
# %%
217+
db_start = OpenSCMDB(
218+
db_dir=top_level_dir / "start",
219+
backend_data=DATA_BACKENDS.get_instance("csv"),
220+
backend_index=INDEX_BACKENDS.get_instance("csv"),
221+
)
222+
db_start.save(df_timeseries_like)
223+
224+
# %% [markdown]
225+
# Then we create a gzipped tar archive of our database.
226+
227+
# %%
228+
gzipped = top_level_dir / "db_archive.tar.gz"
229+
db_start.to_gzipped_tar_archive(gzipped)
230+
231+
# %% [markdown]
232+
# To demonstrate that this does not rely on the original data,
233+
# we delete the original database.
234+
235+
# %%
236+
db_start.delete()
237+
238+
# %% [markdown]
239+
# We can inspect the tar file's contents.
240+
241+
# %%
242+
with tarfile.open(gzipped) as tar:
243+
print(f"{tar.getmembers()=}")
244+
245+
# %% [markdown]
246+
# A new database can be initialised from the gzipped tar archive.
247+
248+
# %%
249+
db_moved = OpenSCMDB.from_gzipped_tar_archive(
250+
gzipped,
251+
db_dir=top_level_dir / "moved",
252+
)
253+
db_moved
254+
255+
# %% [markdown]
256+
# As above, we remove the archive
257+
# to demonstrate that there is no reliance on it
258+
# for the following operations.
259+
260+
# %%
261+
gzipped.unlink()
262+
263+
# %% [markdown]
264+
# You can then use this database like normal,
265+
# but now from the new location
266+
# (whether on your machine or someone else's).
267+
268+
# %%
269+
db_moved.load()
270+
271+
# %%
272+
db_moved.load(pix.isin(unit="J"))
273+
274+
# %% [markdown]
275+
# We clean up the files before moving on to the next demonstration.
276+
277+
# %%
278+
db_moved.delete()
279+
203280
# %% [markdown]
204281
# ### Grouping data
205282
#

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ Issues = "https://github.com/openscm/pandas-openscm/issues"
4141

4242
[project.optional-dependencies]
4343
db = [
44-
"filelock>=3.0.0",
44+
"filelock>=3.12.3",
4545
]
4646
db-full = [
4747
"netcdf4>=1.7.2",

src/pandas_openscm/db/__init__.py

Lines changed: 1 addition & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -4,134 +4,14 @@
44

55
from __future__ import annotations
66

7-
from attrs import frozen
8-
7+
from pandas_openscm.db.backends import DATA_BACKENDS, INDEX_BACKENDS
98
from pandas_openscm.db.csv import CSVDataBackend, CSVIndexBackend
109
from pandas_openscm.db.feather import FeatherDataBackend, FeatherIndexBackend
1110
from pandas_openscm.db.in_memory import InMemoryDataBackend, InMemoryIndexBackend
1211
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
1312
from pandas_openscm.db.netcdf import netCDFDataBackend, netCDFIndexBackend
1413
from pandas_openscm.db.openscm_db import AlreadyInDBError, EmptyDBError, OpenSCMDB
1514

16-
17-
@frozen
18-
class DataBackendOptions:
19-
"""A collection of data back-end options"""
20-
21-
options: tuple[ # type hint doesn't work properly, but ok
22-
tuple[str, type[OpenSCMDBDataBackend]], ...
23-
]
24-
"""
25-
Options
26-
27-
The first element of each option is the option's short name.
28-
The second element is the class that matches that option.
29-
"""
30-
31-
def get_instance(self, option: str) -> OpenSCMDBDataBackend:
32-
"""
33-
Get an instance of one of the options
34-
35-
Parameters
36-
----------
37-
option
38-
Option for which to get a data back-end instance
39-
40-
Returns
41-
-------
42-
:
43-
Initialised instance
44-
45-
Raises
46-
------
47-
KeyError
48-
The option is not supported
49-
"""
50-
for short_name, option_cls in self.options:
51-
if short_name == option:
52-
return option_cls()
53-
54-
msg = (
55-
f"{option=} is not supported. "
56-
f"Available options: {tuple(v[1] for v in self.options)}"
57-
)
58-
raise KeyError(msg)
59-
60-
61-
DATA_BACKENDS = DataBackendOptions(
62-
( # type: ignore # using class with protocol doesn't work properly
63-
("csv", CSVDataBackend),
64-
("feather", FeatherDataBackend),
65-
("in_memory", InMemoryDataBackend),
66-
("netCDF", netCDFDataBackend),
67-
# Other options to consider:
68-
#
69-
# - pretty netCDF, where we try and save the data with dimensions where possible
70-
#
71-
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
72-
# - sqllite
73-
)
74-
)
75-
"""Inbuilt data back-ends"""
76-
77-
78-
@frozen
79-
class IndexBackendOptions:
80-
"""A collection of index back-end options"""
81-
82-
options: tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...]
83-
"""
84-
Options
85-
86-
The first element of each option is the option's short name.
87-
The second element is the class that matches that option.
88-
"""
89-
90-
def get_instance(self, option: str) -> OpenSCMDBIndexBackend:
91-
"""
92-
Get an instance of one of the options
93-
94-
Parameters
95-
----------
96-
option
97-
Option for which to get a index back-end instance
98-
99-
Returns
100-
-------
101-
:
102-
Initialised instance
103-
104-
Raises
105-
------
106-
KeyError
107-
The option is not supported
108-
"""
109-
for short_name, option_cls in self.options:
110-
if short_name == option:
111-
return option_cls()
112-
113-
msg = (
114-
f"{option=} is not supported. "
115-
f"Available options: {tuple(v[1] for v in self.options)}"
116-
)
117-
raise KeyError(msg)
118-
119-
120-
INDEX_BACKENDS = IndexBackendOptions(
121-
( # type: ignore # using class with protocol doesn't work properly
122-
("csv", CSVIndexBackend),
123-
("feather", FeatherIndexBackend),
124-
("in_memory", InMemoryIndexBackend),
125-
("netCDF", netCDFIndexBackend),
126-
# Other options to consider:
127-
#
128-
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
129-
# - sqllite
130-
)
131-
)
132-
"""Inbuilt index back-ends"""
133-
134-
13515
__all__ = [
13616
"DATA_BACKENDS",
13717
"INDEX_BACKENDS",

0 commit comments

Comments
 (0)