Skip to content

Commit 35ec7c2

Browse files
committed
Implement findDataset using new query system
Replace the implementation of Registry.findDataset with the new query system, to allow us to retire daf_relation.
1 parent c6f4b96 commit 35ec7c2

File tree

6 files changed

+74
-176
lines changed

6 files changed

+74
-176
lines changed

doc/changes/DM-52398.bugfix.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Query generation will no longer throw `sqlalchemy.exc.ArgumentError` when attempting to use a dataset calibration timespan constraint for a dataset search without any calibration collections.

doc/changes/DM-52398.misc.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
`Butler.find_dataset()`/`Butler.registry.findDataset()` have been re-implemented using the same query framework backing `Butler.query()`.

python/lsst/daf/butler/_registry_shim.py

Lines changed: 66 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
from ._collection_type import CollectionType
3737
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
3838
from ._dataset_type import DatasetType
39+
from ._exceptions import CalibrationLookupError
3940
from ._storage_class import StorageClassFactory
4041
from ._timespan import Timespan
4142
from .dimensions import (
@@ -48,7 +49,9 @@
4849
)
4950
from .registry._collection_summary import CollectionSummary
5051
from .registry._defaults import RegistryDefaults
52+
from .registry._exceptions import NoDefaultCollectionError
5153
from .registry._registry_base import RegistryBase
54+
from .registry.queries._query_common import resolve_collections
5255

5356
if TYPE_CHECKING:
5457
from .direct_butler import DirectButler
@@ -182,13 +185,74 @@ def findDataset(
182185
*,
183186
collections: CollectionArgType | None = None,
184187
timespan: Timespan | None = None,
188+
datastore_records: bool = False,
185189
**kwargs: Any,
186190
) -> DatasetRef | None:
187191
# Docstring inherited from a base class.
188-
return self._registry.findDataset(
189-
datasetType, dataId, collections=collections, timespan=timespan, **kwargs
192+
if not isinstance(datasetType, DatasetType):
193+
datasetType = self.getDatasetType(datasetType)
194+
195+
dataId = DataCoordinate.standardize(
196+
dataId,
197+
dimensions=datasetType.dimensions,
198+
universe=self.dimensions,
199+
defaults=self.defaults.dataId,
200+
**kwargs,
190201
)
191202

203+
with self._butler.query() as query:
204+
resolved_collections = resolve_collections(self._butler, collections)
205+
if not resolved_collections:
206+
if collections is None:
207+
raise NoDefaultCollectionError("No collections provided, and no default collections set")
208+
else:
209+
return None
210+
211+
if datasetType.isCalibration() and timespan is None:
212+
# Filter out calibration collections, because with no timespan
213+
# we have no way of selecting a dataset from them.
214+
collection_info = self._butler.collections.query_info(
215+
resolved_collections, flatten_chains=True
216+
)
217+
resolved_collections = [
218+
info.name for info in collection_info if info.type != CollectionType.CALIBRATION
219+
]
220+
if not resolved_collections:
221+
return None
222+
223+
result = query.datasets(datasetType, resolved_collections, find_first=True).limit(2)
224+
dataset_type_name = result.dataset_type.name
225+
result = result.where(dataId)
226+
if (
227+
datasetType.isCalibration()
228+
and timespan is not None
229+
and (timespan.begin is not None or timespan.end is not None)
230+
):
231+
timespan_column = query.expression_factory[dataset_type_name].timespan
232+
# The 'logical_or(timespan.is_null)' allows non-calibration
233+
# collections to participate in the search, with the assumption
234+
# that they are valid for any time range.
235+
result = result.where(timespan_column.overlaps(timespan).logical_or(timespan_column.is_null))
236+
237+
datasets = list(result)
238+
if len(datasets) == 1:
239+
ref = datasets[0]
240+
if dataId.hasRecords():
241+
ref = ref.expanded(dataId)
242+
# Propagate storage class from user-provided DatasetType, which
243+
# may not match the definition in the database.
244+
ref = ref.overrideStorageClass(datasetType.storageClass_name)
245+
if datastore_records:
246+
ref = self._registry.get_datastore_records(ref)
247+
return ref
248+
elif len(datasets) == 0:
249+
return None
250+
else:
251+
raise CalibrationLookupError(
252+
f"Ambiguous calibration lookup for {datasetType} with timespan {timespan}"
253+
f" in collections {resolved_collections}."
254+
)
255+
192256
def insertDatasets(
193257
self,
194258
datasetType: DatasetType | str,

python/lsst/daf/butler/direct_butler/_direct_butler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1302,7 +1302,7 @@ def find_dataset(
13021302

13031303
data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)
13041304

1305-
ref = self._registry.findDataset(
1305+
ref = self.registry.findDataset(
13061306
parent_type,
13071307
data_id,
13081308
collections=collections,

python/lsst/daf/butler/registry/sql_registry.py

Lines changed: 2 additions & 172 deletions
Original file line numberDiff line numberDiff line change
@@ -36,24 +36,18 @@
3636
import warnings
3737
from collections import defaultdict
3838
from collections.abc import Iterable, Iterator, Mapping, Sequence
39-
from typing import TYPE_CHECKING, Any, cast
39+
from typing import TYPE_CHECKING, Any
4040

4141
import sqlalchemy
4242

4343
from lsst.resources import ResourcePathExpression
4444
from lsst.utils.iteration import ensure_iterable
4545

4646
from .._collection_type import CollectionType
47-
from .._column_tags import DatasetColumnTag
4847
from .._config import Config
4948
from .._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
5049
from .._dataset_type import DatasetType
51-
from .._exceptions import (
52-
CalibrationLookupError,
53-
DataIdValueError,
54-
DimensionNameError,
55-
InconsistentDataIdError,
56-
)
50+
from .._exceptions import DataIdValueError, DimensionNameError, InconsistentDataIdError
5751
from .._storage_class import StorageClassFactory
5852
from .._timespan import Timespan
5953
from ..dimensions import (
@@ -90,7 +84,6 @@
9084
from .._butler_config import ButlerConfig
9185
from ..datastore._datastore import DatastoreOpaqueTable
9286
from ..datastore.stored_file_info import StoredDatastoreItemInfo
93-
from ..registry._registry import CollectionArgType
9487
from ..registry.interfaces import (
9588
CollectionRecord,
9689
Database,
@@ -830,169 +823,6 @@ def supportsIdGenerationMode(self, mode: DatasetIdGenEnum) -> bool:
830823
"""
831824
return True
832825

833-
def findDataset(
834-
self,
835-
datasetType: DatasetType | str,
836-
dataId: DataId | None = None,
837-
*,
838-
collections: CollectionArgType | None = None,
839-
timespan: Timespan | None = None,
840-
datastore_records: bool = False,
841-
**kwargs: Any,
842-
) -> DatasetRef | None:
843-
"""Find a dataset given its `DatasetType` and data ID.
844-
845-
This can be used to obtain a `DatasetRef` that permits the dataset to
846-
be read from a `Datastore`. If the dataset is a component and can not
847-
be found using the provided dataset type, a dataset ref for the parent
848-
will be returned instead but with the correct dataset type.
849-
850-
Parameters
851-
----------
852-
datasetType : `DatasetType` or `str`
853-
A `DatasetType` or the name of one. If this is a `DatasetType`
854-
instance, its storage class will be respected and propagated to
855-
the output, even if it differs from the dataset type definition
856-
in the registry, as long as the storage classes are convertible.
857-
dataId : `dict` or `DataCoordinate`, optional
858-
A `dict`-like object containing the `Dimension` links that identify
859-
the dataset within a collection.
860-
collections : collection expression, optional
861-
An expression that fully or partially identifies the collections to
862-
search for the dataset; see
863-
:ref:`daf_butler_collection_expressions` for more information.
864-
Defaults to ``self.defaults.collections``.
865-
timespan : `Timespan`, optional
866-
A timespan that the validity range of the dataset must overlap.
867-
If not provided, any `~CollectionType.CALIBRATION` collections
868-
matched by the ``collections`` argument will not be searched.
869-
datastore_records : `bool`, optional
870-
Whether to attach datastore records to the `DatasetRef`.
871-
**kwargs
872-
Additional keyword arguments passed to
873-
`DataCoordinate.standardize` to convert ``dataId`` to a true
874-
`DataCoordinate` or augment an existing one.
875-
876-
Returns
877-
-------
878-
ref : `DatasetRef`
879-
A reference to the dataset, or `None` if no matching Dataset
880-
was found.
881-
882-
Raises
883-
------
884-
lsst.daf.butler.registry.NoDefaultCollectionError
885-
Raised if ``collections`` is `None` and
886-
``self.defaults.collections`` is `None`.
887-
LookupError
888-
Raised if one or more data ID keys are missing.
889-
lsst.daf.butler.registry.MissingDatasetTypeError
890-
Raised if the dataset type does not exist.
891-
lsst.daf.butler.registry.MissingCollectionError
892-
Raised if any of ``collections`` does not exist in the registry.
893-
894-
Notes
895-
-----
896-
This method simply returns `None` and does not raise an exception even
897-
when the set of collections searched is intrinsically incompatible with
898-
the dataset type, e.g. if ``datasetType.isCalibration() is False``, but
899-
only `~CollectionType.CALIBRATION` collections are being searched.
900-
This may make it harder to debug some lookup failures, but the behavior
901-
is intentional; we consider it more important that failed searches are
902-
reported consistently, regardless of the reason, and that adding
903-
additional collections that do not contain a match to the search path
904-
never changes the behavior.
905-
906-
This method handles component dataset types automatically, though most
907-
other registry operations do not.
908-
"""
909-
if collections is None:
910-
if not self.defaults.collections:
911-
raise NoDefaultCollectionError(
912-
"No collections provided to findDataset, and no defaults from registry construction."
913-
)
914-
collections = self.defaults.collections
915-
backend = queries.SqlQueryBackend(self._db, self._managers, self.dimension_record_cache)
916-
with backend.caching_context():
917-
collection_wildcard = CollectionWildcard.from_expression(collections, require_ordered=True)
918-
if collection_wildcard.empty():
919-
return None
920-
matched_collections = backend.resolve_collection_wildcard(collection_wildcard)
921-
resolved_dataset_type = backend.resolve_single_dataset_type_wildcard(datasetType)
922-
dataId = DataCoordinate.standardize(
923-
dataId,
924-
dimensions=resolved_dataset_type.dimensions,
925-
universe=self.dimensions,
926-
defaults=self.defaults.dataId,
927-
**kwargs,
928-
)
929-
governor_constraints = {name: {cast(str, dataId[name])} for name in dataId.dimensions.governors}
930-
(filtered_collections,) = backend.filter_dataset_collections(
931-
[resolved_dataset_type],
932-
matched_collections,
933-
governor_constraints=governor_constraints,
934-
).values()
935-
if not filtered_collections:
936-
return None
937-
if timespan is None:
938-
filtered_collections = [
939-
collection_record
940-
for collection_record in filtered_collections
941-
if collection_record.type is not CollectionType.CALIBRATION
942-
]
943-
if filtered_collections:
944-
requested_columns = {"dataset_id", "run", "collection"}
945-
with backend.context() as context:
946-
predicate = context.make_data_coordinate_predicate(
947-
dataId.subset(resolved_dataset_type.dimensions), full=False
948-
)
949-
if timespan is not None:
950-
requested_columns.add("timespan")
951-
predicate = predicate.logical_and(
952-
context.make_timespan_overlap_predicate(
953-
DatasetColumnTag(resolved_dataset_type.name, "timespan"), timespan
954-
)
955-
)
956-
relation = backend.make_dataset_query_relation(
957-
resolved_dataset_type, filtered_collections, requested_columns, context
958-
).with_rows_satisfying(predicate)
959-
rows = list(context.fetch_iterable(relation))
960-
else:
961-
rows = []
962-
if not rows:
963-
return None
964-
elif len(rows) == 1:
965-
best_row = rows[0]
966-
else:
967-
rank_by_collection_key = {record.key: n for n, record in enumerate(filtered_collections)}
968-
collection_tag = DatasetColumnTag(resolved_dataset_type.name, "collection")
969-
row_iter = iter(rows)
970-
best_row = next(row_iter)
971-
best_rank = rank_by_collection_key[best_row[collection_tag]]
972-
have_tie = False
973-
for row in row_iter:
974-
if (rank := rank_by_collection_key[row[collection_tag]]) < best_rank:
975-
best_row = row
976-
best_rank = rank
977-
have_tie = False
978-
elif rank == best_rank:
979-
have_tie = True
980-
assert timespan is not None, "Rank ties should be impossible given DB constraints."
981-
if have_tie:
982-
raise CalibrationLookupError(
983-
f"Ambiguous calibration lookup for {resolved_dataset_type.name} in collections "
984-
f"{collection_wildcard.strings} with timespan {timespan}."
985-
)
986-
reader = queries.DatasetRefReader(
987-
resolved_dataset_type,
988-
translate_collection=lambda k: self._managers.collections[k].name,
989-
)
990-
ref = reader.read(best_row, data_id=dataId)
991-
if datastore_records:
992-
ref = self.get_datastore_records(ref)
993-
994-
return ref
995-
996826
@transactional
997827
def insertDatasets(
998828
self,

python/lsst/daf/butler/remote_butler/_remote_butler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@
6060
from .._utilities.locked_object import LockedObject
6161
from ..datastore import DatasetRefURIs, DatastoreConfig
6262
from ..datastore.cache_manager import AbstractDatastoreCacheManager, DatastoreCacheManager
63-
from ..dimensions import DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId
63+
from ..dimensions import DataCoordinate, DataIdValue, DimensionConfig, DimensionUniverse, SerializedDataId
6464
from ..queries import Query
6565
from ..queries.tree import make_column_literal
6666
from ..registry import CollectionArgType, NoDefaultCollectionError, Registry, RegistryDefaults
@@ -435,6 +435,8 @@ def find_dataset(
435435
return None
436436

437437
ref = DatasetRef.from_simple(model.dataset_ref, universe=self.dimensions)
438+
if isinstance(data_id, DataCoordinate) and data_id.hasRecords():
439+
ref = ref.expanded(data_id)
438440
return apply_storage_class_override(ref, dataset_type, storage_class)
439441

440442
def _retrieve_artifacts(

0 commit comments

Comments
 (0)