Skip to content

Commit 95f53d1

Browse files
committed
WIP: Implement
1 parent aea1e89 commit 95f53d1

File tree

3 files changed

+230
-121
lines changed

3 files changed

+230
-121
lines changed

src/pandas_openscm/db/__init__.py

Lines changed: 1 addition & 121 deletions
Original file line numberDiff line numberDiff line change
@@ -4,134 +4,14 @@
44

55
from __future__ import annotations
66

7-
from attrs import frozen
8-
7+
from pandas_openscm.db.backends import DATA_BACKENDS, INDEX_BACKENDS
98
from pandas_openscm.db.csv import CSVDataBackend, CSVIndexBackend
109
from pandas_openscm.db.feather import FeatherDataBackend, FeatherIndexBackend
1110
from pandas_openscm.db.in_memory import InMemoryDataBackend, InMemoryIndexBackend
1211
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
1312
from pandas_openscm.db.netcdf import netCDFDataBackend, netCDFIndexBackend
1413
from pandas_openscm.db.openscm_db import AlreadyInDBError, EmptyDBError, OpenSCMDB
1514

16-
17-
@frozen
18-
class DataBackendOptions:
19-
"""A collection of data back-end options"""
20-
21-
options: tuple[ # type hint doesn't work properly, but ok
22-
tuple[str, type[OpenSCMDBDataBackend]], ...
23-
]
24-
"""
25-
Options
26-
27-
The first element of each option is the option's short name.
28-
The second element is the class that matches that option.
29-
"""
30-
31-
def get_instance(self, option: str) -> OpenSCMDBDataBackend:
32-
"""
33-
Get an instance of one of the options
34-
35-
Parameters
36-
----------
37-
option
38-
Option for which to get a data back-end instance
39-
40-
Returns
41-
-------
42-
:
43-
Initialised instance
44-
45-
Raises
46-
------
47-
KeyError
48-
The option is not supported
49-
"""
50-
for short_name, option_cls in self.options:
51-
if short_name == option:
52-
return option_cls()
53-
54-
msg = (
55-
f"{option=} is not supported. "
56-
f"Available options: {tuple(v[1] for v in self.options)}"
57-
)
58-
raise KeyError(msg)
59-
60-
61-
DATA_BACKENDS = DataBackendOptions(
62-
( # type: ignore # using class with protocol doesn't work properly
63-
("csv", CSVDataBackend),
64-
("feather", FeatherDataBackend),
65-
("in_memory", InMemoryDataBackend),
66-
("netCDF", netCDFDataBackend),
67-
# Other options to consider:
68-
#
69-
# - pretty netCDF, where we try and save the data with dimensions where possible
70-
#
71-
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
72-
# - sqllite
73-
)
74-
)
75-
"""Inbuilt data back-ends"""
76-
77-
78-
@frozen
79-
class IndexBackendOptions:
80-
"""A collection of index back-end options"""
81-
82-
options: tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...]
83-
"""
84-
Options
85-
86-
The first element of each option is the option's short name.
87-
The second element is the class that matches that option.
88-
"""
89-
90-
def get_instance(self, option: str) -> OpenSCMDBIndexBackend:
91-
"""
92-
Get an instance of one of the options
93-
94-
Parameters
95-
----------
96-
option
97-
Option for which to get a index back-end instance
98-
99-
Returns
100-
-------
101-
:
102-
Initialised instance
103-
104-
Raises
105-
------
106-
KeyError
107-
The option is not supported
108-
"""
109-
for short_name, option_cls in self.options:
110-
if short_name == option:
111-
return option_cls()
112-
113-
msg = (
114-
f"{option=} is not supported. "
115-
f"Available options: {tuple(v[1] for v in self.options)}"
116-
)
117-
raise KeyError(msg)
118-
119-
120-
INDEX_BACKENDS = IndexBackendOptions(
121-
( # type: ignore # using class with protocol doesn't work properly
122-
("csv", CSVIndexBackend),
123-
("feather", FeatherIndexBackend),
124-
("in_memory", InMemoryIndexBackend),
125-
("netCDF", netCDFIndexBackend),
126-
# Other options to consider:
127-
#
128-
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
129-
# - sqllite
130-
)
131-
)
132-
"""Inbuilt index back-ends"""
133-
134-
13515
__all__ = [
13616
"DATA_BACKENDS",
13717
"INDEX_BACKENDS",

src/pandas_openscm/db/backends.py

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,193 @@
1+
"""
2+
Available back-ends
3+
4+
This is just a shortcut/convenience module
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from pathlib import Path
10+
11+
from attrs import frozen
12+
13+
from pandas_openscm.db.csv import CSVDataBackend, CSVIndexBackend
14+
from pandas_openscm.db.feather import FeatherDataBackend, FeatherIndexBackend
15+
from pandas_openscm.db.in_memory import InMemoryDataBackend, InMemoryIndexBackend
16+
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
17+
from pandas_openscm.db.netcdf import netCDFDataBackend, netCDFIndexBackend
18+
19+
20+
@frozen
21+
class DataBackendOptions:
22+
"""A collection of data back-end options"""
23+
24+
options: tuple[ # type hint doesn't work properly, but ok
25+
tuple[str, type[OpenSCMDBDataBackend]], ...
26+
]
27+
"""
28+
Options
29+
30+
The first element of each option is the option's short name.
31+
The second element is the class that matches that option.
32+
"""
33+
34+
def get_instance(self, option: str) -> OpenSCMDBDataBackend:
35+
"""
36+
Get an instance of one of the options
37+
38+
Parameters
39+
----------
40+
option
41+
Option for which to get a data back-end instance
42+
43+
Returns
44+
-------
45+
:
46+
Initialised instance
47+
48+
Raises
49+
------
50+
KeyError
51+
The option is not supported
52+
"""
53+
for short_name, option_cls in self.options:
54+
if short_name == option:
55+
return option_cls()
56+
57+
msg = (
58+
f"{option=} is not supported. "
59+
f"Available options: {tuple(v[1] for v in self.options)}"
60+
)
61+
raise KeyError(msg)
62+
63+
def guess_backend(self, data_file_name: str) -> OpenSCMDBDataBackend:
64+
"""
65+
Guess backend from a file name
66+
67+
Parameters
68+
----------
69+
data_file_name
70+
Name of the data file from which to guess the backend
71+
72+
Returns
73+
-------
74+
:
75+
Guessed backend
76+
77+
Raises
78+
------
79+
ValueError
80+
The backend could not be guessed from `data_file_name`
81+
"""
82+
ext = Path(data_file_name).suffix.strip(".")
83+
try:
84+
return self.get_instance(ext)
85+
except KeyError as exc:
86+
msg = (
87+
f"Could not guess backend from filename {data_file_name}. "
88+
f"we assumed the extension was {ext}."
89+
)
90+
raise ValueError(msg) from exc
91+
92+
93+
DATA_BACKENDS = DataBackendOptions(
94+
( # type: ignore # using class with protocol doesn't work properly
95+
("csv", CSVDataBackend),
96+
("feather", FeatherDataBackend),
97+
("in_memory", InMemoryDataBackend),
98+
("netCDF", netCDFDataBackend),
99+
# Other options to consider:
100+
#
101+
# - pretty netCDF, where we try and save the data with dimensions where possible
102+
#
103+
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
104+
# - sqlite
105+
)
106+
)
107+
"""Inbuilt data back-ends"""
108+
109+
110+
@frozen
111+
class IndexBackendOptions:
112+
"""A collection of index back-end options"""
113+
114+
options: tuple[tuple[str, type[OpenSCMDBIndexBackend]], ...]
115+
"""
116+
Options
117+
118+
The first element of each option is the option's short name.
119+
The second element is the class that matches that option.
120+
"""
121+
122+
def get_instance(self, option: str) -> OpenSCMDBIndexBackend:
123+
"""
124+
Get an instance of one of the options
125+
126+
Parameters
127+
----------
128+
option
129+
Option for which to get an index back-end instance
130+
131+
Returns
132+
-------
133+
:
134+
Initialised instance
135+
136+
Raises
137+
------
138+
KeyError
139+
The option is not supported
140+
"""
141+
for short_name, option_cls in self.options:
142+
if short_name == option:
143+
return option_cls()
144+
145+
msg = (
146+
f"{option=} is not supported. "
147+
f"Available options: {tuple(v[1] for v in self.options)}"
148+
)
149+
raise KeyError(msg)
150+
151+
def guess_backend(self, index_file_name: str) -> OpenSCMDBIndexBackend:
152+
"""
153+
Guess backend from a file name
154+
155+
Parameters
156+
----------
157+
index_file_name
158+
Name of the index file from which to guess the backend
159+
160+
Returns
161+
-------
162+
:
163+
Guessed backend
164+
165+
Raises
166+
------
167+
ValueError
168+
The backend could not be guessed from `index_file_name`
169+
"""
170+
ext = Path(index_file_name).suffix.strip(".")
171+
try:
172+
return self.get_instance(ext)
173+
except KeyError as exc:
174+
msg = (
175+
f"Could not guess backend from filename {index_file_name}. "
176+
f"we assumed the extension was {ext}."
177+
)
178+
raise ValueError(msg) from exc
179+
180+
181+
INDEX_BACKENDS = IndexBackendOptions(
182+
( # type: ignore # using class with protocol doesn't work properly
183+
("csv", CSVIndexBackend),
184+
("feather", FeatherIndexBackend),
185+
("in_memory", InMemoryIndexBackend),
186+
("netCDF", netCDFIndexBackend),
187+
# Other options to consider:
188+
#
189+
# - HDF5: https://pandas.pydata.org/docs/user_guide/io.html#hdf5-pytables
190+
# - sqlite
191+
)
192+
)
193+
"""Inbuilt index back-ends"""

src/pandas_openscm/db/openscm_db.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,15 @@
44

55
from __future__ import annotations
66

7+
import tarfile
78
import warnings
89
from pathlib import Path
910
from typing import TYPE_CHECKING, Any
1011

1112
import pandas as pd
1213
from attrs import define, field
1314

15+
from pandas_openscm.db.backends import DATA_BACKENDS, INDEX_BACKENDS
1416
from pandas_openscm.db.deleting import delete_files
1517
from pandas_openscm.db.interfaces import OpenSCMDBDataBackend, OpenSCMDBIndexBackend
1618
from pandas_openscm.db.loading import (
@@ -287,6 +289,34 @@ def delete(
287289
max_workers=max_workers,
288290
)
289291

292+
@classmethod
293+
def from_gzipped_tar_archive(
294+
cls,
295+
tar_archive: Path,
296+
db_dir: Path,
297+
backend_data: OpenSCMDBDataBackend | None = None,
298+
backend_index: OpenSCMDBIndexBackend | None = None,
299+
) -> OpenSCMDB:
300+
with tarfile.open(tar_archive, "r") as tar:
301+
for member in tar.getmembers():
302+
if not member.isreg():
303+
# Only extract files
304+
continue
305+
# Extract to the db_dir
306+
member.name = Path(member.name).name
307+
tar.extract(member, db_dir)
308+
if backend_index is None and member.name.startswith("index"):
309+
backend_index = INDEX_BACKENDS.guess_backend(member.name)
310+
311+
if backend_data is None and not any(
312+
member.name.startswith(v) for v in ["index", "filemap"]
313+
):
314+
backend_data = DATA_BACKENDS.guess_backend(member.name)
315+
316+
res = cls(backend_data=backend_data, backend_index=backend_index, db_dir=db_dir)
317+
318+
return res
319+
290320
def get_new_data_file_path(self, file_id: int) -> Path:
291321
"""
292322
Get the path in which to write a new data file
@@ -706,3 +736,9 @@ def save( # noqa: PLR0913
706736
progress=progress,
707737
max_workers=max_workers,
708738
)
739+
740+
def to_gzipped_tar_archive(self, out_file: Path, mode: str = "w:gz") -> Path:
741+
with tarfile.open(out_file, mode) as tar:
742+
tar.add(self.db_dir, arcname="db")
743+
744+
return out_file

0 commit comments

Comments
 (0)