Commit 174b8fa
Sketch out new interfaces for querying multiple dataset types.
1 parent 201ee96 commit 174b8fa

7 files changed: +645 additions, -10 deletions

python/lsst/daf/butler/__init__.py (7 additions, 3 deletions)
@@ -32,12 +32,16 @@
 # Some components are not auto-imported since they can have additional runtime
 # dependencies.

-from . import logging  # most symbols are helpers only
-from . import progress  # most symbols are only used by handler implementors
-from . import ddl, time_utils
+from . import (
+    ddl,
+    logging,  # most symbols are helpers only
+    progress,  # most symbols are only used by handler implementors
+    time_utils,
+)
 from ._butler import *
 from ._butler_collections import *
 from ._butler_config import *
+from ._butler_dataset_types import *
 from ._butler_repo_index import *
 from ._collection_type import CollectionType
 from ._column_categorization import *

python/lsst/daf/butler/_butler.py (98 additions, 5 deletions)
@@ -41,6 +41,7 @@
 from ._butler_collections import ButlerCollections
 from ._butler_config import ButlerConfig, ButlerType
+from ._butler_dataset_types import ButlerDatasetTypes
 from ._butler_instance_options import ButlerInstanceOptions
 from ._butler_repo_index import ButlerRepoIndex
 from ._config import Config, ConfigSubset
@@ -840,6 +841,7 @@ def getURI(
         )
         return primary

+    # TODO: RFC deprecating this in favor of butler.dataset_types.get.
     @abstractmethod
     def get_dataset_type(self, name: str) -> DatasetType:
         """Get the `DatasetType`.
@@ -1448,6 +1450,16 @@ def run(self) -> str | None:
         """
         raise NotImplementedError()

+    # TODO: make this abstract and implement in derived classes.
+    @property
+    def dataset_types(self) -> ButlerDatasetTypes:
+        """Object with methods for modifying and querying dataset types
+        (`~lsst.daf.butler.ButlerDatasetTypes`).
+
+        Use of this object is preferred over `registry` wherever possible.
+        """
+        raise NotImplementedError()
+
     @property
     @abstractmethod
     def registry(self) -> Registry:
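
Because `ButlerDatasetTypes` derives from `collections.abc.Sequence` (see the new module below), the accessor could also support standard container operations; a speculative sketch under that assumption, not behavior this commit implements::

    n_types = len(butler.dataset_types)        # number of known dataset types
    for dataset_type in butler.dataset_types:  # iterate over DatasetType objects
        print(dataset_type.name)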
@@ -1572,22 +1584,20 @@ def _query_datasets(
         explain: bool = True,
         **kwargs: Any,
     ) -> list[DatasetRef]:
-        """Query for dataset references matching user-provided criteria.
+        """Query for dataset references of a single dataset type.

         Parameters
         ----------
         dataset_type : `str` or `DatasetType`
             Dataset type object or name to search for.
         collections : collection expression, optional
             A collection name or iterable of collection names to search. If not
-            provided, the default collections are used. See
-            :ref:`daf_butler_collection_expressions` for more information.
+            provided, the default collections are used.
         find_first : `bool`, optional
             If `True` (default), for each result data ID, only yield one
             `DatasetRef` of each `DatasetType`, from the first collection in
             which a dataset of that dataset type appears (according to the
-            order of ``collections`` passed in). If `True`, ``collections``
-            must not contain regular expressions and may not be ``...``.
+            order of ``collections`` passed in).
         data_id : `dict` or `DataCoordinate`, optional
             A data ID whose key-value pairs are used as equality constraints in
             the query.
@@ -1739,6 +1749,89 @@ def _query_dimension_records(
             raise EmptyQueryResultError(list(result.explain_no_results()))
         return dimension_records

+    def _query_all_datasets(
+        self,
+        collections: str | Iterable[str] | None = None,
+        *,
+        name: str | Iterable[str] = "*",
+        at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
+        exact_dimensions: Iterable[str] | DimensionGroup | None = None,
+        storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
+        is_calibration: bool | None = None,
+        find_first: bool = True,
+        data_id: DataId | None = None,
+        where: str = "",
+        bind: Mapping[str, Any] | None = None,
+        explain: bool = True,
+        **kwargs: Any,
+    ) -> list[DatasetRef]:
+        """Query for datasets of potentially multiple types.
+
+        Parameters
+        ----------
+        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            The collection or collections to search, in order. If not provided
+            or `None`, the default collection search path for this butler is
+            used.
+        name : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            Names or name patterns (glob-style) that returned dataset type
+            names must match. If an iterable, items are OR'd together. The
+            default is to include all dataset types in the given collections.
+        at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have as a subset.
+        exact_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
+                optional
+            Dimensions that returned dataset types must have exactly.
+        storage_class : `str` or `~collections.abc.Iterable` [ `str` ] \
+                or `StorageClass` or \
+                `~collections.abc.Iterable` [ `StorageClass` ], optional
+            Storage classes or storage class names that returned dataset types
+            must have. If an iterable, items are OR'd together.
+        is_calibration : `bool` or `None`, optional
+            If not `None`, constrain returned dataset types to be or not be
+            calibrations.
+        find_first : `bool`, optional
+            If `True` (default), for each result data ID, only yield one
+            `DatasetRef` of each `DatasetType`, from the first collection in
+            which a dataset of that dataset type appears (according to the
+            order of ``collections`` passed in).
+        data_id : `dict` or `DataCoordinate`, optional
+            A data ID whose key-value pairs are used as equality constraints in
+            the query.
+        where : `str`, optional
+            A string expression similar to a SQL WHERE clause. May involve any
+            column of a dimension table or (as a shortcut for the primary key
+            column of a dimension table) dimension name. See
+            :ref:`daf_butler_dimension_expressions` for more information.
+        bind : `~collections.abc.Mapping`, optional
+            Mapping containing literal values that should be injected into the
+            ``where`` expression, keyed by the identifiers they replace. Values
+            of collection type can be expanded in some cases; see
+            :ref:`daf_butler_dimension_expressions_identifiers` for more
+            information.
+        explain : `bool`, optional
+            If `True` (default) then `EmptyQueryResultError` exception is
+            raised when the resulting list is empty. The exception contains a
+            non-empty list of strings explaining possible causes for the empty
+            result.
+        **kwargs
+            Additional keyword arguments are forwarded to
+            `DataCoordinate.standardize` when processing the ``data_id``
+            argument (and may be used to provide a constraining data ID even
+            when the ``data_id`` argument is `None`).
+
+        Returns
+        -------
+        refs : `list` [ `DatasetRef` ]
+            Dataset references matching the given query criteria. Nested data
+            IDs are guaranteed to include values for all implied dimensions
+            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
+            include dimension records (`DataCoordinate.hasRecords` will be
+            `False`).
+        """
+        raise NotImplementedError()
+
     @abstractmethod
     def _clone(
         self,
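
A minimal sketch of the intended call pattern for `_query_all_datasets`, assuming a concrete subclass implements it; the collection and dataset type names are hypothetical::

    # One query spanning several dataset types, rather than one
    # _query_datasets call per type.
    refs = butler._query_all_datasets(
        "HSC/runs/example",            # hypothetical collection name
        name=["raw", "calexp"],        # hypothetical dataset type names
        where="instrument = 'HSC' AND detector = 42",
        find_first=True,
    )
    for ref in refs:
        print(ref.datasetType.name, ref.dataId)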
python/lsst/daf/butler/_butler_dataset_types.py (220 additions, 0 deletions)
@@ -0,0 +1,220 @@
1+
# This file is part of daf_butler.
2+
#
3+
# Developed for the LSST Data Management System.
4+
# This product includes software developed by the LSST Project
5+
# (http://www.lsst.org).
6+
# See the COPYRIGHT file at the top-level directory of this distribution
7+
# for details of code ownership.
8+
#
9+
# This software is dual licensed under the GNU General Public License and also
10+
# under a 3-clause BSD license. Recipients may choose which of these licenses
11+
# to use; please see the files gpl-3.0.txt and/or bsd_license.txt,
12+
# respectively. If you choose the GPL option then the following text applies
13+
# (but note that there is still no warranty even if you opt for BSD instead):
14+
#
15+
# This program is free software: you can redistribute it and/or modify
16+
# it under the terms of the GNU General Public License as published by
17+
# the Free Software Foundation, either version 3 of the License, or
18+
# (at your option) any later version.
19+
#
20+
# This program is distributed in the hope that it will be useful,
21+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
22+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23+
# GNU General Public License for more details.
24+
#
25+
# You should have received a copy of the GNU General Public License
26+
# along with this program. If not, see <http://www.gnu.org/licenses/>.
27+
28+
from __future__ import annotations
29+
30+
__all__ = ("ButlerDatasetTypes",)
31+
32+
from abc import ABC, abstractmethod
33+
from collections.abc import Iterable, Sequence, Set
34+
from typing import Any, overload
35+
36+
from pydantic import BaseModel
37+
38+
from ._dataset_type import DatasetType
39+
from ._storage_class import StorageClass
40+
from .dimensions import DimensionGroup
41+
42+
43+
class ButlerDatasetTypes(ABC, Sequence):
44+
"""Methods for working with the dataset types known to the Butler."""
45+
46+
@abstractmethod
47+
def get(self, name: str) -> DatasetType:
48+
"""Return the dataset type with the given name.
49+
50+
Returns
51+
-------
52+
dataset_type : `DatasetType`
53+
Dataset type object with the given name.
54+
55+
Raises
56+
------
57+
MissingDatasetTypeError
58+
Raised if there is no dataset type with the given name.
59+
"""
60+
raise NotImplementedError()
61+
62+
@abstractmethod
63+
def query(
64+
self,
65+
name: str | Iterable[str],
66+
*,
67+
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
68+
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
69+
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
70+
is_calibration: bool | None = None,
71+
) -> Iterable[DatasetType]:
72+
"""Query for dataset types matching the given criteria.
73+
74+
Parameters
75+
----------
76+
name : `str` or `~collections.abc.Iterable` [ `str` ]
77+
Names or name patterns (glob-style) that returned dataset type
78+
names must match. If an iterable, items are OR'd together.
79+
at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
80+
optional
81+
Dimensions that returned dataset types must have as a subset.
82+
at_least_dimensions : `Iterable` [ `str` ] or `DimensionGroup`,\
83+
optional
84+
Dimensions that returned dataset types must have exactly.
85+
with_storage_class : `str` or `~collections.abc.Iterable` [ `str` ],\
86+
or `StorageClass` or \
87+
`~collections.abc.Iterable` [ `StorageClass` ], optional
88+
Storage classes or storage class names that returned dataset types
89+
must have. If an iterable, items are OR'd together.
90+
is_calibration : `bool` or `None`, optional
91+
If `None`, constrain returned dataset types to be or not be
92+
calibrations.
93+
94+
Returns
95+
-------
96+
dataset_types : `~collections.abc.Iterable` [ `DatasetType`
97+
An iterable of dataset types. This is guaranteed to be a regular
98+
Python in-memory container, not a lazy single-pass iterator, but
99+
the type of container is currently left unspecified in order to
100+
leave room for future convenience behavior.
101+
102+
Notes
103+
-----
104+
This method queries all registered dataset types in registry. To query
105+
for the types of datasets that are in a collection, instead use::
106+
107+
info = butler.collections.query_info(
108+
collections,
109+
include_summaries=True,
110+
)
111+
112+
for a simple summary of the dataset types in each collection (see
113+
`lsst.daf.butler.ButlerCollections.query_info`). Or, for
114+
more complex but powerful queries (including constraints on data IDs or
115+
dataset counts), use::
116+
117+
with butler.query() as q:
118+
dataset_types = q.dataset_types(collections)
119+
120+
See `lsst.daf.butler.queries.Query.dataset_types` for details.
121+
"""
122+
raise NotImplementedError()
123+
124+
@abstractmethod
125+
def query_names(
126+
self,
127+
name: str | Iterable[str],
128+
*,
129+
at_least_dimensions: Iterable[str] | DimensionGroup | None = None,
130+
exact_dimensions: Iterable[str] | DimensionGroup | None = None,
131+
storage_class: str | Iterable[str] | StorageClass | Iterable[StorageClass] | None = None,
132+
is_calibration: bool | None = None,
133+
) -> Iterable[str]:
134+
"""Query for the names of dataset types matching the given criteria.
135+
136+
See `query` for parameter descriptions.
137+
"""
138+
raise NotImplementedError()
139+
140+
@abstractmethod
141+
def register(
142+
self,
143+
name_or_type: str,
144+
/,
145+
dimensions: Iterable[str] | DimensionGroup | None = None,
146+
storage_class: str | StorageClass | None = None,
147+
is_calibration: bool | None = None,
148+
) -> bool:
149+
"""Register a dataset type.
150+
151+
It is not an error to register the same `DatasetType` twice.
152+
153+
Parameters
154+
----------
155+
name_or_type : `str` or `DatasetType`
156+
The name of the dataset type to be added, or a complete
157+
`DatasetType` type object to add.
158+
dimensions : `~colletions.abc.Iterable` [ `str` ] or `DimensionGroup`,\
159+
optional
160+
Dimensions for the dataset type. Required if the first argument
161+
is just a `str`, and overrides the dimensions if the first argument
162+
is a `DatasetType`.
163+
storage_class : `str` or `StorageClass`, optional
164+
Storage class for the dataset type. Required if the first argument
165+
is just a `str`, and overrides the storage class if the first
166+
arguemnt is a `DatasetType`.
167+
is_calibration: `bool`, optional
168+
Whether the dataset type is a calibration. If the first argument
169+
is a `str`, defaults to `False`. If the first argument is a
170+
`DatasetType` and this argument is not `None`, it overrides the
171+
value on the `DatasetType`.
172+
173+
Returns
174+
-------
175+
inserted : `bool`
176+
`True` if a new dataset type was inserted, `False` if an identical
177+
existing dataset type was found. Note that in either case the
178+
dataset type is guaranteed to be defined in the repository
179+
consistently with the given definition.
180+
181+
Raises
182+
------
183+
ValueError
184+
Raised if the dimensions or storage class are invalid.
185+
lsst.daf.butler.registry.ConflictingDefinitionError
186+
Raised if this dataset type is already registered with a different
187+
definition.
188+
189+
"""
190+
raise NotImplementedError()
191+
192+
@abstractmethod
193+
def remove(self, name: str) -> None:
194+
"""Remove the dataset type with the given name.
195+
196+
.. warning::
197+
198+
Butler implementations can cache the dataset type definitions.
199+
This means that deleting the dataset type definition may result in
200+
unexpected behavior from other butler processes that are active
201+
that have not seen the deletion.
202+
203+
Parameters
204+
----------
205+
name : `str` or `tuple` [`str`]
206+
Name of the type to be removed or tuple containing a list of type
207+
names to be removed. Wildcards are allowed.
208+
209+
Raises
210+
------
211+
lsst.daf.butler.registry.OrphanedRecordError
212+
Raised if an attempt is made to remove the dataset type definition
213+
when there are still datasets associated with it.
214+
215+
Notes
216+
-----
217+
If the dataset type is not registered the method will return without
218+
action.
219+
"""
220+
raise NotImplementedError()
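
A minimal end-to-end sketch of how the proposed `ButlerDatasetTypes` interface might be exercised once a concrete implementation exists; the repository path, dataset type names, and dimension names here are all hypothetical::

    butler = Butler("repo", writeable=True)  # hypothetical repository path

    # Register a new dataset type from its parts (name, dimensions, storage
    # class); returns False if an identical type is already registered.
    inserted = butler.dataset_types.register(
        "sourceTable_visit",
        dimensions=["instrument", "visit"],
        storage_class="DataFrame",
    )

    # Query registered dataset types by name pattern and exact dimensions.
    for dataset_type in butler.dataset_types.query(
        "sourceTable*",
        exact_dimensions=["instrument", "visit"],
    ):
        print(dataset_type.name)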

python/lsst/daf/butler/queries/__init__.py (2 additions, 0 deletions)
@@ -28,6 +28,8 @@
 from ._base import *
 from ._data_coordinate_query_results import *
 from ._dataset_query_results import *
+from ._dataset_type_results import *
 from ._dimension_record_query_results import *
 from ._general_query_results import *
+from ._heterogeneous_dataset_results import *
 from ._query import *
