Replace Collection with lightweight _SearchIndexer. #667

Merged
merged 30 commits on Feb 28, 2022
Commits
2d76e71
Add optimized _SlimCollection.
bdice Jan 30, 2022
600243d
Further minimize _SlimCollection.
bdice Jan 30, 2022
3457cfe
Remove unused methods from _SlimCollection.
bdice Jan 31, 2022
3da53db
Make _SlimCollection a dict.
bdice Jan 31, 2022
39b64b5
Add tests.
bdice Feb 5, 2022
ce0ae32
Add test for invalid filter.
bdice Feb 8, 2022
e3b0c8e
Change _SlimCollection to dict initialization semantics, remove _id f…
bdice Feb 8, 2022
ddbb5f2
Update _SlimCollection tests to use dict semantics.
bdice Feb 8, 2022
b6613aa
Improve docs, move build_index to method.
bdice Feb 8, 2022
c955cf4
Update error handling.
bdice Feb 8, 2022
e8d240b
Add test coverage to Collection for invalid filter.
bdice Feb 8, 2022
8737d8f
Remove handling of unexpected errors.
bdice Feb 8, 2022
ccdf175
Remove if statement that is impossible to reach.
bdice Feb 8, 2022
d50f9da
Remove optimization note about searching by primary key.
bdice Feb 8, 2022
b233502
Add early exit for logical expressions.
bdice Feb 8, 2022
8bb2325
Rename _SlimCollection to _Index.
bdice Feb 8, 2022
2efc63d
Update docs.
bdice Feb 12, 2022
7780167
Update docs.
bdice Feb 12, 2022
007c17d
Refactoring/PR comments.
bdice Feb 12, 2022
ad84704
More refactoring/PR comments.
bdice Feb 12, 2022
adb406f
Simplify error check.
bdice Feb 12, 2022
227fabe
Use tighter logic in build_index.
bdice Feb 12, 2022
faea964
Fix test name.
bdice Feb 12, 2022
6d8e057
Update signac/contrib/_index.py
bdice Feb 18, 2022
f7f2224
Use copy and pop optimization.
bdice Feb 18, 2022
3699e18
Revise documentation.
bdice Feb 18, 2022
6707f1d
Use isinstance when True is expected for micro-optimization.
bdice Feb 18, 2022
4db874e
Remove extraneous helper function and unnecessary data copy operation.
bdice Feb 18, 2022
6187c01
Apply suggestions from code review
vyasr Feb 19, 2022
43a0177
Rename to _SearchIndexer.
bdice Feb 28, 2022
299 changes: 299 additions & 0 deletions signac/contrib/_index.py
@@ -0,0 +1,299 @@
# Copyright (c) 2022 The Regents of the University of Michigan
# All rights reserved.
# This software is licensed under the BSD 3-Clause License.
"""Implement class for indexing signac Projects."""

import json
import logging
from numbers import Number

from ..errors import InvalidKeyError
from .collection import (
    _INDEX_OPERATORS,
    _check_logical_operator_argument,
    _DictPlaceholder,
    _find_with_index_operator,
    _float,
    _TypedSetDefaultDict,
)
from .utility import _nested_dicts_to_dotted_keys, _to_hashable

logger = logging.getLogger(__name__)

_PRIMARY_KEY = "_id"


class _SearchIndexer(dict):
    """A searchable collection of dicts.

    The _SearchIndexer class is a :class:`dict` that maps from ids to
    :class:`dict`s. The :class:`dict`s stored as values can be searched by
    their contained keys and values, returning ids for the values matching the
    provided query. The query syntax is based on MongoDB, though this class
    does not aim to match the API of MongoDB's Collection class.

    The dictionary values may be nested (may contain other dicts or lists), but
    have two restrictions. First, the data must be JSON-encodable. Second, the
    keys in the dictionary may not contain dots (``.``).

    For example, suppose we are given dictionaries of member data containing a
    ``name`` key and an ``age`` key along with unique identifiers acting as a
    primary key for each member. We can find the ids of all members that are
    age 32 like this:

    .. code-block:: python

        members = _SearchIndexer({
            '0': {'name': 'John', 'age': 32},
            '1': {'name': 'Alice', 'age': 28},
            '2': {'name': 'Kevin', 'age': 32},
            # ...
        })

        for member_id in members.find({'age': 32}):
            print(member_id)  # prints 0 and 2

    Because this class inherits from :class:`dict`, it can be constructed in
    any of the same ways as a :class:`dict`, like ``_SearchIndexer(**kwargs)``,
    ``_SearchIndexer(mapping, **kwargs)``, or
    ``_SearchIndexer(iterable, **kwargs)``.

    """

    def build_index(self, key):
        """Build index for a given key.

        This is a highly performance critical code path.

        Parameters
        ----------
        key : str
            The key on which the index is built.

        Returns
        -------
        :class:`~_TypedSetDefaultDict`
            Index for key.

        Raises
        ------
        :class:`~signac.errors.InvalidKeyError`
            The dict contains invalid keys.

        """
        logger.debug(f"Building index for key '{key}'...")
        nodes = key.split(".")
        index = _TypedSetDefaultDict()

        for _id, doc in self.items():
            try:
                v = doc
                # Recursively access nested values from dotted keys.
                for n in nodes:
                    v = v[n]
            except (KeyError, TypeError):
                pass
            else:
                # `isinstance(instance, cls)` is typically faster than
                # `type(instance) is cls` when the answer is True, but it is
                # slower when it is False. Since we expect lists and dicts to
                # occur infrequently here, we optimize for the False path
                # using the `type` based check.
                if type(v) is list:
                    index[_to_hashable(v)].add(_id)
                elif type(v) is dict:
                    index[_DictPlaceholder].add(_id)
                else:
                    index[v].add(_id)

            # Raise an exception if the original key is present and has dots.
            if len(nodes) > 1 and key in doc:
                raise InvalidKeyError(
                    "Keys with dots ('.') are invalid.\n\n"
                    "See https://signac.io/document-wide-migration/ "
                    "for a recipe on how to replace dots in existing keys."
                )
        logger.debug(f"Built index for key '{key}'.")
        return index

    def _find_expression(self, key, value):
        """Find ids of dicts with keys matching a value expression.

        Parameters
        ----------
        key : str
            The dict key to match.
        value
            The value expression to match.

        Returns
        -------
        set
            The ids of dicts matching the value expression.

        Raises
        ------
        KeyError
            An invalid operator was given.
        ValueError
            The value is not bool for the '$exists' operator or not a
            supported type for the '$type' operator.

        """
        logger.debug(f"Find ids matching expression '{key}: {value}'.")
        if "$" in key:
            if key.count("$") > 1:
                raise KeyError(f"Invalid operator expression '{key}'.")
            nodes = key.split(".")
            op = nodes[-1]
            if not op.startswith("$"):
                raise KeyError(f"Invalid operator placement '{key}'.")
            key = ".".join(nodes[:-1])
            if op in _INDEX_OPERATORS:
                index = self.build_index(key)
                return _find_with_index_operator(index, op, value)
            elif op == "$exists":
                if not isinstance(value, bool):
                    raise ValueError(
                        "The value of the '$exists' operator must be boolean."
                    )
                index = self.build_index(key)
                match = {elem for elems in index.values() for elem in elems}
                return match if value else set(self).difference(match)
            else:
                raise KeyError(f"Unknown operator '{op}'.")
        else:
            index = self.build_index(key)
            # Check whether 'value' is a number with an integer value (e.g.,
            # 4 or 4.0) and, if so, search for both the int and float forms.
            # This allows the user to find statepoints with integer-valued
            # keys that are stored as floating point types. Note that this
            # handles both cases: 1) the user searches for an int and hopes
            # to find values that are stored as integer-valued floats, and
            # 2) the user searches for an integer-valued float and hopes to
            # find ints. This way, both `signac find x 4.0` and
            # `signac find x 4` would return jobs where `sp.x` is stored as
            # either 4.0 or 4.
            if isinstance(value, Number) and float(value).is_integer():
                result_float = index.get(_float(value), set())
                result_int = index.get(int(value), set())
                return result_int.union(result_float)
            else:
                return index.get(value, set())

    def _find_result(self, expr):
        """Find ids of dicts matching a dict of filter expressions.

        Parameters
        ----------
        expr : dict
            The filter of expressions to match.

        Returns
        -------
        set
            A set of ids of dicts that match the given filter.

        """
        if not expr:
            # Empty expression yields all ids.
            return set(self)

        result_ids = None

        def reduce_results(match):
            """Reduce the results by intersection of matches.

            Parameters
            ----------
            match : set
                The match for the given expression.

            """
            nonlocal result_ids
            if result_ids is None:  # First match
                result_ids = match
            else:  # Update previous match
                result_ids = result_ids.intersection(match)

        # Check if the filter contains the primary key, in which case we can
        # immediately reduce the result.
        _id = expr.pop(_PRIMARY_KEY, None)
        if _id is not None:
            reduce_results({_id} if _id in self else set())

        # Extract all logical-operator expressions for now.
        or_expressions = expr.pop("$or", None)
        and_expressions = expr.pop("$and", None)
        not_expression = expr.pop("$not", None)

        # Reduce the result based on the remaining non-logical expressions:
        for key, value in _nested_dicts_to_dotted_keys(expr):
            reduce_results(self._find_expression(key, value))
            if not result_ids:
                # No matches, so exit early.
                return set()

        # Reduce the result based on the logical-operator expressions:
        if not_expression is not None:
            not_match = self._find_result(not_expression)
            reduce_results(set(self).difference(not_match))
            if not result_ids:
                # No matches, so exit early.
                return set()

        if and_expressions is not None:
            _check_logical_operator_argument("$and", and_expressions)
            for expr_ in and_expressions:
                reduce_results(self._find_result(expr_))
                if not result_ids:
                    # No matches, so exit early.
                    return set()

        if or_expressions is not None:
            _check_logical_operator_argument("$or", or_expressions)
            or_results = set()
            for expr_ in or_expressions:
                or_results.update(self._find_result(expr_))
            reduce_results(or_results)

        return result_ids

    def find(self, filter_=None):
        """Find ids of dicts matching a dict of filter expressions.

        This function normalizes the filter argument and then attempts to
        build a set of ids matching the given key-value queries.
        For each key that is queried, an internal index is built and then
        searched.

        The results are a set of ids, where each id is the value of the
        primary key of a dict that matches the given filter.

        The find() method uses the following optimizations:

        1. If the filter is None, a set of all ids is returned.
        2. The filter is processed key by key. Once the set of matches is
           empty it is immediately returned.

        Parameters
        ----------
        filter_ : dict
            The filter of expressions to match (Default value = None).

        Returns
        -------
        set
            A set of ids of dicts that match the given filter.

        Raises
        ------
        ValueError
            When the filter argument is invalid.

        """
        if not filter_:
            return set(self)

        filter_ = json.loads(json.dumps(filter_))  # Normalize
        if not isinstance(filter_, dict):
            raise ValueError(f"Invalid filter: {filter_}")
        return self._find_result(filter_)
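
A minimal usage sketch (not part of the diff above) of the query syntax described in the _SearchIndexer docstrings, assuming the new module is importable as signac.contrib._index once this PR is merged. The ids and document values are made up for illustration.

from signac.contrib._index import _SearchIndexer

docs = _SearchIndexer({
    "a": {"N": 100, "kT": 1.0, "tags": ["prod"]},
    "b": {"N": 200, "kT": 1.0},
    "c": {"N": 100, "kT": 2.5},
})

# build_index maps each observed value of a key to the set of matching ids.
assert docs.build_index("N")[100] == {"a", "c"}

# Plain key-value query.
assert docs.find({"N": 100}) == {"a", "c"}

# Operator expressions ($gt, $exists, ...) apply to a single key.
assert docs.find({"N": {"$gt": 150}}) == {"b"}
assert docs.find({"tags": {"$exists": True}}) == {"a"}

# Logical operators combine sub-filters; an empty filter matches everything.
assert docs.find({"$or": [{"N": 200}, {"kT": 2.5}]}) == {"b", "c"}
assert docs.find() == {"a", "b", "c"}
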
9 changes: 5 additions & 4 deletions signac/contrib/import_export.py
@@ -11,12 +11,13 @@
import shutil
import tarfile
import zipfile
-from collections import Counter, OrderedDict
+from collections import Counter
from contextlib import closing, contextmanager
from string import Formatter
from tempfile import TemporaryDirectory
from zipfile import ZIP_DEFLATED, ZipFile

+from ._index import _SearchIndexer
from .errors import DestinationExistsError, StatepointParsingError
from .utility import _dotted_dict_to_nested_dicts, _mkdir_p

@@ -62,8 +63,8 @@ def _make_schema_based_path_function(jobs, exclude_keys=None, delimiter_nested="
        # signature of the path function below.
        return lambda job, sep=None: ""

-    index = [{"_id": job.id, "sp": job.sp()} for job in jobs]
-    statepoint_index = OrderedDict(
+    index = _SearchIndexer((job.id, {"sp": job.sp()}) for job in jobs)
+    statepoint_index = dict(
        _build_job_statepoint_index(exclude_const=True, index=index)
    )

@@ -75,7 +76,7 @@ def _make_schema_based_path_function(jobs, exclude_keys=None, delimiter_nested="
        for value, group in values.items():
            path_tokens = key, str(value)
            for job_id in group:
-                paths.setdefault(job_id, list())
+                paths.setdefault(job_id, [])
                paths[job_id].extend(path_tokens)

    def path(job, sep=None):
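
For context, a small sketch (not part of the diff) of the construction pattern introduced above in _make_schema_based_path_function. The _FakeJob class is a hypothetical stand-in for signac's Job objects; it only shows that _SearchIndexer is built from (id, dict) pairs, so the job id becomes the mapping key rather than an explicit "_id" field.

from signac.contrib._index import _SearchIndexer


class _FakeJob:
    """Hypothetical stand-in exposing the id attribute and sp() call used above."""

    def __init__(self, id_, statepoint):
        self.id = id_
        self._statepoint = statepoint

    def sp(self):
        return self._statepoint


jobs = [_FakeJob("abc123", {"N": 100}), _FakeJob("def456", {"N": 200})]

# Previously: a list of dicts carrying an explicit "_id" field.
# index = [{"_id": job.id, "sp": job.sp()} for job in jobs]

# Now: the job id is the key of the _SearchIndexer mapping.
index = _SearchIndexer((job.id, {"sp": job.sp()}) for job in jobs)
assert index.find({"sp": {"N": 100}}) == {"abc123"}
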