Commit 16374f1
Add initial implementation of query_all_datasets
Add a method for querying multiple dataset types simultaneously, currently hidden as `Butler._query_all_datasets`. This implementation reuses the existing search logic from the query-datasets CLI.
1 parent a23c5b2 commit 16374f1
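
For orientation, here is a minimal usage sketch of the new method. The repo path is hypothetical; the collection name, `where` clause, and data ID constraint are borrowed from the tests added below. Since `_query_all_datasets` is still private, the API may change.

from lsst.daf.butler import Butler

butler = Butler("/path/to/repo")  # hypothetical repo path

# One call queries across every dataset type in the collection.
refs = butler._query_all_datasets(
    "imported_r",
    where="detector = 2",
    instrument="Cam1",  # forwarded to DataCoordinate.standardize via **kwargs
)
for ref in refs:
    print(ref.datasetType.name, ref.dataId, ref.run)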

5 files changed: +255 -16 lines changed

python/lsst/daf/butler/_butler.py

Lines changed: 109 additions & 7 deletions

@@ -1729,13 +1729,6 @@ def query_datasets(
         collection wildcard is passed when ``find_first`` is `True`, or
         when ``collections`` is `None` and default butler collections are
         not defined.
-
-        Notes
-        -----
-        When multiple dataset types are queried in a single call, the results
-        of this operation are equivalent to querying for each dataset type
-        separately in turn, and no information about the relationships between
-        datasets of different types is included.
         """
         if data_id is None:
             data_id = DataCoordinate.make_empty(self.dimensions)
@@ -1878,6 +1871,115 @@ def query_dimension_records(
             raise EmptyQueryResultError(list(result.explain_no_results()))
         return dimension_records
 
+    def _query_all_datasets(
+        self,
+        collections: str | Iterable[str] | None = None,
+        *,
+        name: str | Iterable[str] = "*",
+        find_first: bool = True,
+        data_id: DataId | None = None,
+        where: str = "",
+        bind: Mapping[str, Any] | None = None,
+        limit: int | None = -20_000,
+        **kwargs: Any,
+    ) -> list[DatasetRef]:
+        """Query for datasets of potentially multiple types.
+
+        Parameters
+        ----------
+        collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            The collection or collections to search, in order. If not provided
+            or `None`, the default collection search path for this butler is
+            used.
+        name : `str` or `~collections.abc.Iterable` [ `str` ], optional
+            Names or name patterns (glob-style) that returned dataset type
+            names must match. If an iterable, items are OR'd together. The
+            default is to include all dataset types in the given collections.
+        find_first : `bool`, optional
+            If `True` (default), for each result data ID, only yield one
+            `DatasetRef` of each `DatasetType`, from the first collection in
+            which a dataset of that dataset type appears (according to the
+            order of ``collections`` passed in).
+        data_id : `dict` or `DataCoordinate`, optional
+            A data ID whose key-value pairs are used as equality constraints in
+            the query.
+        where : `str`, optional
+            A string expression similar to a SQL WHERE clause. May involve any
+            column of a dimension table or (as a shortcut for the primary key
+            column of a dimension table) dimension name. See
+            :ref:`daf_butler_dimension_expressions` for more information.
+        bind : `~collections.abc.Mapping`, optional
+            Mapping containing literal values that should be injected into the
+            ``where`` expression, keyed by the identifiers they replace. Values
+            of collection type can be expanded in some cases; see
+            :ref:`daf_butler_dimension_expressions_identifiers` for more
+            information.
+        limit : `int` or `None`, optional
+            Upper limit on the number of returned records. `None` can be used
+            if no limit is wanted. A limit of ``0`` means that the query will
+            be executed and validated but no results will be returned.
+            If a negative value is given a warning will be issued if the number
+            of results is capped by that limit. If no limit is provided, by
+            default a maximum of 20,000 records will be returned.
+        **kwargs
+            Additional keyword arguments are forwarded to
+            `DataCoordinate.standardize` when processing the ``data_id``
+            argument (and may be used to provide a constraining data ID even
+            when the ``data_id`` argument is `None`).
+
+        Raises
+        ------
+        MissingDatasetTypeError
+            When no dataset types match ``name``, or an explicit (non-glob)
+            dataset type in ``name`` does not exist.
+        InvalidQueryError
+            If the parameters to the query are inconsistent or malformed.
+        MissingCollectionError
+            If a given collection is not found.
+
+        Returns
+        -------
+        refs : `list` [ `DatasetRef` ]
+            Dataset references matching the given query criteria. Nested data
+            IDs are guaranteed to include values for all implied dimensions
+            (i.e. `DataCoordinate.hasFull` will return `True`), but will not
+            include dimension records (`DataCoordinate.hasRecords` will be
+            `False`).
+        """
+        from ._query_all_datasets import query_all_datasets
+
+        if collections is None:
+            collections = list(self.collections.defaults)
+        else:
+            collections = list(ensure_iterable(collections))
+
+        warn_limit = False
+        if limit is not None and limit < 0:
+            # Add one to the limit so we can detect if we have exceeded it.
+            limit = abs(limit) + 1
+            warn_limit = True
+
+        result = []
+        for page in query_all_datasets(
+            self,
+            collections=collections,
+            name=name,
+            find_first=find_first,
+            data_id=data_id,
+            where=where,
+            limit=limit,
+            bind=bind,
+            **kwargs,
+        ):
+            result.extend(page.data)
+
+        if warn_limit and limit is not None and len(result) >= limit:
+            # Remove the extra dataset we added for the limit check.
+            result.pop()
+            _LOG.warning("More datasets are available than the requested limit of %d.", limit - 1)
+
+        return result
+
     def clone(
         self,
         *,
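
The default ``limit=-20_000`` above encodes a convention: a negative limit caps the result count at its absolute value and logs a warning only when the cap is actually hit, which the implementation detects by fetching one extra row. A standalone sketch of that convention (illustrative only; `fetch_with_limit` is not part of the butler API):

import logging
from collections.abc import Iterator

_LOG = logging.getLogger(__name__)


def fetch_with_limit(rows: Iterator[object], limit: int | None) -> list[object]:
    # Negative limit means "cap at abs(limit) and warn if results were cut
    # off"; this mirrors the warn_limit logic in _query_all_datasets above.
    warn_limit = False
    if limit is not None and limit < 0:
        limit = abs(limit) + 1  # Fetch one extra row so truncation is detectable.
        warn_limit = True
    result: list[object] = []
    for row in rows:
        if limit is not None and len(result) >= limit:
            break
        result.append(row)
    if warn_limit and limit is not None and len(result) >= limit:
        result.pop()  # Drop the sentinel row.
        _LOG.warning("More results are available than the requested limit of %d.", limit - 1)
    return result


# fetch_with_limit(iter(range(10)), -3) returns [0, 1, 2] and warns;
# fetch_with_limit(iter(range(10)), 3) returns [0, 1, 2] silently.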

python/lsst/daf/butler/_query_all_datasets.py

Lines changed: 3 additions & 7 deletions

@@ -52,7 +52,7 @@ class DatasetsPage(NamedTuple):
 def query_all_datasets(
     butler: Butler,
     *,
-    collections: str | Iterable[str] | None = None,
+    collections: list[str],
     name: str | Iterable[str] = "*",
     find_first: bool = True,
     data_id: DataId | None = None,
@@ -69,8 +69,8 @@ def query_all_datasets(
     ----------
     butler : `Butler`
         Butler instance to use for executing queries.
-    collections : `str` or `~collections.abc.Iterable` [ `str` ], optional
-        The collection or collections to search, in order. If not provided
+    collections : `list` [ `str` ]
+        The collections to search, in order. If not provided
         or `None`, the default collection search path for this butler is
         used.
     name : `str` or `~collections.abc.Iterable` [ `str` ], optional
@@ -128,10 +128,6 @@ def query_all_datasets(
         `DatasetRef` results matching the given query criteria, grouped by
         dataset type.
     """
-    if collections is None:
-        collections = list(butler.collections.defaults)
-    else:
-        collections = list(ensure_iterable(collections))
     if find_first and has_globs(collections):
         raise InvalidQueryError("Can not use wildcards in collections when find_first=True")
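
With this change, ``query_all_datasets`` itself takes a required, explicit ``collections`` list (default-collection resolution moved into the ``Butler`` wrapper) and yields results page by page. A sketch of consuming the paged API directly, assuming only what is visible in this diff (the ``page.data`` attribute and the private module path):

from lsst.daf.butler import Butler
from lsst.daf.butler._query_all_datasets import query_all_datasets

butler = Butler("/path/to/repo")  # hypothetical repo path

for page in query_all_datasets(
    butler,
    collections=["imported_g", "imported_r"],  # now a required explicit list
    name="flat*",
    find_first=False,
):
    # Results arrive grouped by dataset type, one page at a time.
    for ref in page.data:
        print(ref.datasetType.name, ref.id)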

python/lsst/daf/butler/script/queryDatasets.py

Lines changed: 1 addition & 1 deletion

@@ -259,7 +259,7 @@ def getDatasets(self) -> Iterator[list[DatasetRef]]:
             Dataset references matching the given query criteria grouped
             by dataset type.
         """
-        query_collections: Iterable[str] = self._collections_wildcard or ["*"]
+        query_collections = self._collections_wildcard or ["*"]
 
         warn_limit = False
         if self._limit < 0:

python/lsst/daf/butler/tests/butler_queries.py

Lines changed: 135 additions & 1 deletion

@@ -44,8 +44,14 @@
 
 from .._butler import Butler
 from .._collection_type import CollectionType
+from .._dataset_ref import DatasetRef
 from .._dataset_type import DatasetType
-from .._exceptions import EmptyQueryResultError, InvalidQueryError
+from .._exceptions import (
+    EmptyQueryResultError,
+    InvalidQueryError,
+    MissingCollectionError,
+    MissingDatasetTypeError,
+)
 from .._timespan import Timespan
 from ..dimensions import DataCoordinate, DimensionRecord
 from ..direct_query_driver import DirectQueryDriver
@@ -2007,6 +2013,130 @@ def test_unusual_column_literals(self) -> None:
         names = [x.full_name for x in result]
         self.assertEqual(names, ["Ba"])
 
+    def test_query_all_datasets(self) -> None:
+        butler = self.make_butler("base.yaml", "datasets.yaml")
+
+        # Make sure that refs are coming out well-formed.
+        datasets = butler._query_all_datasets("imported_r", where="detector = 2", instrument="Cam1")
+        datasets.sort(key=lambda ref: ref.datasetType.name)
+        self.assertEqual(len(datasets), 2)
+        bias = datasets[0]
+        self.assertEqual(bias.datasetType.name, "bias")
+        self.assertEqual(bias.dataId["instrument"], "Cam1")
+        self.assertEqual(bias.dataId["detector"], 2)
+        self.assertEqual(bias.run, "imported_r")
+        self.assertEqual(bias.id, UUID("87f3e68d-258d-41b7-8ea5-edf3557ccb30"))
+        flat = datasets[1]
+        self.assertEqual(flat.datasetType.name, "flat")
+        self.assertEqual(flat.dataId["instrument"], "Cam1")
+        self.assertEqual(flat.dataId["detector"], 2)
+        self.assertEqual(flat.dataId["physical_filter"], "Cam1-R1")
+        self.assertEqual(flat.dataId["band"], "r")
+        self.assertEqual(flat.run, "imported_r")
+        self.assertEqual(flat.id, UUID("c1296796-56c5-4acf-9b49-40d920c6f840"))
+
+        # Querying for everything finds everything.
+        results = butler._query_all_datasets("*", find_first=False)
+        self.assertEqual(len(results), 13)
+
+        # constraining by data ID works
+        detector_1_ids = ("d0bb04cd-d697-4a83-ba53-cdfcd58e3a0c", "e15ab039-bc8b-4135-87c5-90902a7c0b22")
+        results = butler._query_all_datasets(
+            "*", data_id={"detector": 1, "instrument": "Cam1"}, find_first=False
+        )
+        self.assertCountEqual(detector_1_ids, _ref_uuids(results))
+
+        # bind values work.
+        results = butler._query_all_datasets(
+            "*", where="detector=my_bind and instrument='Cam1'", bind={"my_bind": 1}, find_first=False
+        )
+        self.assertCountEqual(detector_1_ids, _ref_uuids(results))
+
+        # find_first requires ordered collections.
+        with self.assertRaisesRegex(InvalidQueryError, "Can not use wildcards"):
+            results = butler._query_all_datasets("*")
+
+        butler.collections.register("chain", CollectionType.CHAINED)
+        butler.collections.redefine_chain("chain", ["imported_g", "imported_r"])
+        results = butler._query_all_datasets(
+            "chain", where="detector=2 and instrument = 'Cam1'", find_first=True
+        )
+        # find_first searches the collection chain in order.
+        self.assertCountEqual(
+            _ref_uuids(results),
+            [
+                "51352db4-a47a-447c-b12d-a50b206b17cd",  # imported_g bias
+                "60c8a65c-7290-4c38-b1de-e3b1cdcf872d",  # imported_g flat
+                "c1296796-56c5-4acf-9b49-40d920c6f840",  # imported_r flat
+                # There is also a bias dataset with detector=2 in imported_r,
+                # but it is masked by the presence of the same data ID in
+                # imported_g.
+            ],
+        )
+
+        # collection searches work.
+        results = butler._query_all_datasets(
+            "*g", where="detector=1 and instrument = 'Cam1'", find_first=False
+        )
+        self.assertEqual(_ref_uuids(results), ["e15ab039-bc8b-4135-87c5-90902a7c0b22"])
+
+        # we raise for missing collections with explicit names.
+        with self.assertRaises(MissingCollectionError):
+            results = butler._query_all_datasets("nonexistent")
+        # we don't raise for collection wildcard searches that find nothing.
+        results = butler._query_all_datasets("nonexistent*", find_first=False)
+        self.assertEqual(results, [])
+
+        # dataset type searches work.
+        results = butler._query_all_datasets(
+            "*", name="b*", where="detector=1 and instrument = 'Cam1'", find_first=False
+        )
+        self.assertEqual(_ref_uuids(results), ["e15ab039-bc8b-4135-87c5-90902a7c0b22"])
+
+        # Missing dataset types raise.
+        with self.assertRaises(MissingDatasetTypeError):
+            results = butler._query_all_datasets("chain", name=["notfound", "flat"])
+        with self.assertRaises(MissingDatasetTypeError):
+            results = butler._query_all_datasets("chain", name="notfound*")
+
+        # Limit of 3 lands at the boundary of a dataset type.
+        # Limit of 4 is in the middle of a dataset type.
+        for limit in [3, 4]:
+            with self.subTest(limit=limit):
+                results = butler._query_all_datasets("imported_g", limit=limit)
+                self.assertEqual(len(results), limit)
+                with self.assertLogs(level="WARNING") as log:
+                    results = butler._query_all_datasets("imported_g", limit=-limit)
+                self.assertEqual(len(results), limit)
+                self.assertIn("requested limit", log.output[0])
+
+        results = butler._query_all_datasets("imported_g", limit=0)
+        self.assertEqual(len(results), 0)
+
+        # 'where' constraints that don't apply to all dataset types follow the
+        # same rules as query_datasets.
+        results = butler._query_all_datasets(
+            "*", where="detector = 2 and band = 'g' and instrument = 'Cam1'", find_first=False
+        )
+        self.assertCountEqual(
+            _ref_uuids(results),
+            [
+                # bias does not have 'band'
+                "51352db4-a47a-447c-b12d-a50b206b17cd",
+                "87f3e68d-258d-41b7-8ea5-edf3557ccb30",
+                # flat does have 'band', and we filter based on it
+                "60c8a65c-7290-4c38-b1de-e3b1cdcf872d",
+            ],
+        )
+
+        # Default collections and data ID apply.
+        butler.registry.defaults = RegistryDefaults(collections="imported_g")
+        results = butler._query_all_datasets(where="detector = 2")
+        self.assertCountEqual(
+            _ref_uuids(results),
+            ["51352db4-a47a-447c-b12d-a50b206b17cd", "60c8a65c-7290-4c38-b1de-e3b1cdcf872d"],
+        )
+
 
 def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
     output = []
@@ -2016,3 +2146,7 @@ def _get_exposure_ids_from_dimension_records(dimension_records: Iterable[DimensionRecord]) -> list[int]:
         output.append(id)
 
     return output
+
+
+def _ref_uuids(refs: list[DatasetRef]) -> list[str]:
+    return [str(ref.id) for ref in refs]

tests/test_cliCmdQueryDatasets.py

Lines changed: 7 additions & 0 deletions

@@ -357,6 +357,13 @@ def test_limit_order(self):
         ]
         self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)
 
+        # Same as previous test, but with positive limit so no warning is
+        # issued.
+        tables = self._queryDatasets(
+            repo=testRepo.butler, limit=1, order_by=("visit"), collections="*", glob="*"
+        )
+        self.assertAstropyTablesEqual(tables, expectedTables, filterColumns=True)
+
         with self.assertLogs("lsst.daf.butler.script.queryDatasets", level="WARNING") as cm:
             tables = self._queryDatasets(
                 repo=testRepo.butler, limit=-1, order_by=("-visit"), collections="*", glob="*"
