4 changes: 4 additions & 0 deletions doc/changes/DM-52397.misc.md
@@ -0,0 +1,4 @@
`Butler.registry.queryDatasets`, `Butler.registry.queryDataIds`, and `Butler.registry.queryDimensionRecords` have been re-implemented using the same query framework backing `Butler.query()` and the various `Butler.query_*()` methods. Though the `Butler.registry.*` methods are not yet deprecated, users are encouraged to begin migrating to the equivalent `Butler.query_*` functions; a minimal migration sketch follows the list below. The new implementation comes with several minor behavior changes:
- We no longer raise an exception if governor dimension values in a query constraint are not known to the registry. Instead, we return no results.
- Result rows are now deduplicated.
- `findFirst` searches are now supported in calibration collections.
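
A minimal migration sketch, assuming an existing `Butler` instance named `butler`; the dataset type, collection, and data ID values here are hypothetical, and the `query.datasets(...)` call mirrors the usage visible later in this diff:

```python
# Old registry-based search (still works, but slated for migration):
refs = list(
    butler.registry.queryDatasets(
        "calexp",                      # hypothetical dataset type
        collections=["HSC/runs/RC2"],  # hypothetical collection
        findFirst=True,
    )
)

# New query-framework equivalent, using the context manager shown in this PR:
with butler.query() as query:
    refs = list(query.datasets("calexp", ["HSC/runs/RC2"], find_first=True))
```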
4 changes: 4 additions & 0 deletions doc/changes/DM-52397.removal.md
@@ -0,0 +1,4 @@
Removed the deprecated `DatasetQueryResults.materialize()` method.
Removed the deprecated `offset` parameter from the `limit` method in the `DataCoordinateQueryResults`, `DatasetQueryResults`, and `DimensionRecordQueryResults` classes.
Removed the deprecated feature where HTM and HEALPix spatial dimensions like `htm11` or `healpix10` could be used in data ID constraints passed to queries. The exception is `htm7`, which will continue to work. Users should instead use region `OVERLAPS` constraints in query `where` expressions.
Removed support for `<` and `>` operators in `Butler.registry.query*()` `where` strings for comparisons of `Timespan` vs `Timespan`, or `Timespan` vs `Time`. Instead, use `timespan.begin < ts` or `timespan.end > ts` to explicitly indicate which timespan bound you are comparing with.
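
A sketch of the `where`-string migration, with a hypothetical instrument and bind value; the `timespan.begin`/`timespan.end` spellings come from the note above:

```python
from astropy.time import Time

ts = Time("2024-01-01T00:00:00", scale="tai")

# No longer supported: "visit.timespan < ts" (ambiguous timespan bound).
# Name the bound explicitly instead:
records = butler.registry.queryDimensionRecords(
    "visit",
    instrument="LATISS",              # hypothetical instrument
    where="visit.timespan.begin < ts",
    bind={"ts": ts},
)
```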
1 change: 1 addition & 0 deletions doc/changes/DM-52398.bugfix.md
@@ -0,0 +1 @@
Query generation no longer raises `sqlalchemy.exc.ArgumentError` when a dataset calibration timespan constraint is applied to a dataset search that includes no calibration collections.
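
A sketch of the pattern that previously failed, using the `expression_factory` timespan API shown later in this diff; the dataset type, collection, and time range are hypothetical:

```python
from astropy.time import Time
from lsst.daf.butler import Timespan

timespan = Timespan(Time("2024-01-01", scale="tai"), Time("2024-02-01", scale="tai"))

with butler.query() as query:
    # "u/someone/run" is a hypothetical RUN collection, so this search
    # contains no calibration collections.
    results = query.datasets("bias", ["u/someone/run"])
    timespan_col = query.expression_factory["bias"].timespan
    # Previously this raised sqlalchemy.exc.ArgumentError during query
    # generation; it now builds and executes the query.
    refs = list(results.where(timespan_col.overlaps(timespan)))
```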
1 change: 1 addition & 0 deletions doc/changes/DM-52398.misc.md
@@ -0,0 +1 @@
`Butler.find_dataset()`/`Butler.registry.findDataset()` have been re-implemented using the same query framework backing `Butler.query()`.
173 changes: 75 additions & 98 deletions python/lsst/daf/butler/_registry_shim.py
@@ -27,16 +27,17 @@

from __future__ import annotations

__all__ = ("Registry",)
__all__ = ("RegistryShim",)

import contextlib
from collections.abc import Iterable, Iterator, Mapping, Sequence
from typing import TYPE_CHECKING, Any

from ._collection_type import CollectionType
from ._dataset_association import DatasetAssociation
from ._dataset_ref import DatasetId, DatasetIdGenEnum, DatasetRef
from ._dataset_type import DatasetType
from ._exceptions import CalibrationLookupError
from ._storage_class import StorageClassFactory
from ._timespan import Timespan
from .dimensions import (
DataCoordinate,
@@ -46,18 +47,19 @@
DimensionRecord,
DimensionUniverse,
)
from .registry import Registry
from .registry._collection_summary import CollectionSummary
from .registry._defaults import RegistryDefaults
from .registry.queries import DataCoordinateQueryResults, DatasetQueryResults, DimensionRecordQueryResults
from .registry._exceptions import NoDefaultCollectionError
from .registry._registry_base import RegistryBase
from .registry.queries._query_common import resolve_collections

if TYPE_CHECKING:
from .direct_butler import DirectButler
from .registry._registry import CollectionArgType
from .registry.interfaces import ObsCoreTableManager


class RegistryShim(Registry):
class RegistryShim(RegistryBase):
"""Implementation of `Registry` interface exposed to clients by `Butler`.

Parameters
Expand All @@ -74,7 +76,7 @@ class RegistryShim(Registry):
"""

def __init__(self, butler: DirectButler):
self._butler = butler
super().__init__(butler)
self._registry = butler._registry

def isWriteable(self) -> bool:
@@ -183,13 +185,74 @@ def findDataset(
*,
collections: CollectionArgType | None = None,
timespan: Timespan | None = None,
datastore_records: bool = False,
**kwargs: Any,
) -> DatasetRef | None:
# Docstring inherited from a base class.
return self._registry.findDataset(
datasetType, dataId, collections=collections, timespan=timespan, **kwargs
if not isinstance(datasetType, DatasetType):
datasetType = self.getDatasetType(datasetType)

dataId = DataCoordinate.standardize(
dataId,
dimensions=datasetType.dimensions,
universe=self.dimensions,
defaults=self.defaults.dataId,
**kwargs,
)

with self._butler.query() as query:
resolved_collections = resolve_collections(self._butler, collections)
if not resolved_collections:
if collections is None:
raise NoDefaultCollectionError("No collections provided, and no default collections set")
else:
return None

if datasetType.isCalibration() and timespan is None:
# Filter out calibration collections, because with no timespan
# we have no way of selecting a dataset from them.
collection_info = self._butler.collections.query_info(
resolved_collections, flatten_chains=True
)
resolved_collections = [
info.name for info in collection_info if info.type != CollectionType.CALIBRATION
]
if not resolved_collections:
return None

result = query.datasets(datasetType, resolved_collections, find_first=True).limit(2)
dataset_type_name = result.dataset_type.name
result = result.where(dataId)
if (
datasetType.isCalibration()
and timespan is not None
and (timespan.begin is not None or timespan.end is not None)
):
timespan_column = query.expression_factory[dataset_type_name].timespan
# The 'logical_or(timespan.is_null)' allows non-calibration
# collections to participate in the search, with the assumption
# that they are valid for any time range.
result = result.where(timespan_column.overlaps(timespan).logical_or(timespan_column.is_null))

datasets = list(result)
if len(datasets) == 1:
ref = datasets[0]
if dataId.hasRecords():
ref = ref.expanded(dataId)
# Propagate storage class from user-provided DatasetType, which
# may not match the definition in the database.
ref = ref.overrideStorageClass(datasetType.storageClass_name)
if datastore_records:
ref = self._registry.get_datastore_records(ref)
return ref
elif len(datasets) == 0:
return None
else:
raise CalibrationLookupError(
f"Ambiguous calibration lookup for {datasetType} with timespan {timespan}"
f" in collections {resolved_collections}."
)

def insertDatasets(
self,
datasetType: DatasetType | str,
@@ -299,97 +362,11 @@ def queryCollections(
expression, datasetType, collectionTypes, flattenChains, includeChains
)

def queryDatasets(
self,
datasetType: Any,
*,
collections: CollectionArgType | None = None,
dimensions: Iterable[str] | None = None,
dataId: DataId | None = None,
where: str = "",
findFirst: bool = False,
bind: Mapping[str, Any] | None = None,
check: bool = True,
**kwargs: Any,
) -> DatasetQueryResults:
# Docstring inherited from a base class.
return self._registry.queryDatasets(
datasetType,
collections=collections,
dimensions=dimensions,
dataId=dataId,
where=where,
findFirst=findFirst,
bind=bind,
check=check,
**kwargs,
)

def queryDataIds(
self,
dimensions: DimensionGroup | Iterable[str] | str,
*,
dataId: DataId | None = None,
datasets: Any = None,
collections: CollectionArgType | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
check: bool = True,
**kwargs: Any,
) -> DataCoordinateQueryResults:
# Docstring inherited from a base class.
return self._registry.queryDataIds(
dimensions,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

def queryDimensionRecords(
self,
element: DimensionElement | str,
*,
dataId: DataId | None = None,
datasets: Any = None,
collections: CollectionArgType | None = None,
where: str = "",
bind: Mapping[str, Any] | None = None,
check: bool = True,
**kwargs: Any,
) -> DimensionRecordQueryResults:
# Docstring inherited from a base class.
return self._registry.queryDimensionRecords(
element,
dataId=dataId,
datasets=datasets,
collections=collections,
where=where,
bind=bind,
check=check,
**kwargs,
)

def queryDatasetAssociations(
self,
datasetType: str | DatasetType,
collections: CollectionArgType | None = ...,
*,
collectionTypes: Iterable[CollectionType] = CollectionType.all(),
flattenChains: bool = False,
) -> Iterator[DatasetAssociation]:
# Docstring inherited from a base class.
return self._registry.queryDatasetAssociations(
datasetType,
collections,
collectionTypes=collectionTypes,
flattenChains=flattenChains,
)

@property
def obsCoreTableManager(self) -> ObsCoreTableManager | None:
# Docstring inherited from a base class.
return self._registry.obsCoreTableManager

@property
def storageClasses(self) -> StorageClassFactory:
return self._registry.storageClasses
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/direct_butler/_direct_butler.py
@@ -1302,7 +1302,7 @@ def find_dataset(

data_id, kwargs = self._rewrite_data_id(data_id, parent_type, **kwargs)

ref = self._registry.findDataset(
ref = self.registry.findDataset(
parent_type,
data_id,
collections=collections,
6 changes: 5 additions & 1 deletion python/lsst/daf/butler/direct_query_driver/_driver.py
@@ -641,7 +641,11 @@ def _analyze_query_tree(self, tree: qt.QueryTree, allow_duplicate_overlaps: bool
# datasets later.
predicate_constraints = PredicateConstraintsSummary(tree.predicate)
# Use the default data ID to apply additional constraints where needed.
predicate_constraints.apply_default_data_id(self._default_data_id, tree.dimensions)
predicate_constraints.apply_default_data_id(
self._default_data_id,
tree.dimensions,
validate_governor_constraints=tree.validateGovernorConstraints,
)
predicate = predicate_constraints.predicate
# Delegate to the dimensions manager to rewrite the predicate and start
# a SqlSelectBuilder to cover any spatial overlap joins or constraints.
2 changes: 1 addition & 1 deletion python/lsst/daf/butler/queries/_expression_strings.py
@@ -244,7 +244,7 @@ def visitIdentifier(self, name: str, node: Node) -> _VisitorResult:
def visitBind(self, name: str, node: Node) -> _VisitorResult:
name = name.lower()
if name not in self.context.bind:
raise InvalidQueryError("Name {name!r} is not in the bind map.")
raise InvalidQueryError(f"Name {name!r} is not in the bind map.")
# Logic in visitIdentifier handles binds.
return self.visitIdentifier(name, node)

4 changes: 4 additions & 0 deletions python/lsst/daf/butler/queries/_query.py
@@ -746,6 +746,10 @@ def where(
driver=self._driver,
)

def _skip_governor_validation(self) -> Query:
tree = self._tree.model_copy(update={"validateGovernorConstraints": False})
return Query(tree=tree, driver=self._driver)

def _join_dataset_search_impl(
self,
dataset_type: str | DatasetType,
11 changes: 8 additions & 3 deletions python/lsst/daf/butler/queries/predicate_constraints_summary.py
@@ -71,7 +71,10 @@ def __init__(self, predicate: qt.Predicate) -> None:
)

def apply_default_data_id(
self, default_data_id: DataCoordinate, query_dimensions: DimensionGroup
self,
default_data_id: DataCoordinate,
query_dimensions: DimensionGroup,
validate_governor_constraints: bool,
) -> None:
"""Augment the predicate and summary by adding missing constraints for
governor dimensions using a default data ID.
@@ -81,9 +84,11 @@ def apply_default_data_id(
default_data_id : `DataCoordinate`
Data ID values that will be used to constrain the query if governor
dimensions have not already been constrained by the predicate.

query_dimensions : `DimensionGroup`
The set of dimensions returned in result rows from the query.
validate_governor_constraints : `bool`
If `True`, enforce the requirement that governor dimensions must be
constrained if any dimensions that depend on them have constraints.
"""
# Find governor dimensions required by the predicate.
# If these are not constrained by the predicate or the default data ID,
@@ -108,7 +113,7 @@
self.predicate = self.predicate.logical_and(
_create_data_id_predicate(governor, data_id_value, query_dimensions.universe)
)
elif governor in where_governors:
elif governor in where_governors and validate_governor_constraints:
# Check that the predicate doesn't reference any dimensions
# without constraining their governor dimensions, since
# that's a particularly easy mistake to make and it's
5 changes: 5 additions & 0 deletions python/lsst/daf/butler/queries/tree/_query_tree.py
@@ -146,6 +146,11 @@ class QueryTree(QueryTreeBase):
predicate: Predicate = Predicate.from_bool(True)
"""Boolean expression trees whose logical AND defines a row filter."""

validateGovernorConstraints: bool = True
"""If True, enforce the requirement that governor dimensions must be
constrained if any dimensions that depend on them have constraints.
"""

def iter_all_dataset_searches(self) -> Iterator[tuple[str | AnyDatasetType, DatasetSearch]]:
yield from self.datasets.items()
if self.any_dataset is not None:
9 changes: 6 additions & 3 deletions python/lsst/daf/butler/registry/_registry.py
@@ -1449,6 +1449,9 @@ def obsCoreTableManager(self) -> ObsCoreTableManager | None:
"""
return None

storageClasses: StorageClassFactory
"""All storage classes known to the registry (`StorageClassFactory`).
"""
@property
def storageClasses(self) -> StorageClassFactory:
"""All storage classes known to the registry
(`StorageClassFactory`).
"""
raise NotImplementedError()