update memory driver search duplicates implementation
Ariana Barzinpour committed Dec 4, 2023
1 parent dffa40d commit 94d6a5b
Showing 2 changed files with 13 additions and 11 deletions.
20 changes: 12 additions & 8 deletions datacube/index/memory/_datasets.py
@@ -131,34 +131,38 @@ def search_product_duplicates(self,
                                   product: Product,
                                   *args: Union[str, Field]
                                   ) -> Iterable[Tuple[Tuple, Iterable[UUID]]]:
-        field_names: List[str] = [arg.name if isinstance(arg, Field) else arg for arg in args]
-        # Typing note: mypy cannot handle dynamically created namedtuples
-        GroupedVals = namedtuple('search_result', field_names)  # type: ignore[misc]
-
+        """
+        Find dataset ids of a given product that have duplicates of the given set of field names.
+        Returns each set of those field values and the datasets that have them.
+        Note that this implementation does not account for slight timestamp discrepancies.
+        """
         def to_field(f: Union[str, Field]) -> Field:
             if isinstance(f, str):
                 f = product.metadata_type.dataset_fields[f]
             assert isinstance(f, Field), "Not a field: %r" % (f,)
             return f
 
         fields = [to_field(f) for f in args]
+        # Typing note: mypy cannot handle dynamically created namedtuples
+        GroupedVals = namedtuple('search_result', list(f.name for f in fields))  # type: ignore[misc]
 
         def values(ds: Dataset) -> GroupedVals:
             vals = []
             for field in fields:
                 vals.append(field.extract(ds.metadata_doc))  # type: ignore[attr-defined]
             return GroupedVals(*vals)
 
-        dups: Dict[Tuple, List[UUID]] = {}
+        dups: Dict[Tuple, set[UUID]] = {}
         for ds in self.active_by_id.values():
             if ds.product.name != product.name:
                 continue
             vals = values(ds)
             if vals in dups:
-                dups[vals].append(ds.id)
+                dups[vals].add(ds.id)
             else:
-                dups[vals] = [ds.id]
-        return list(dups.items())
+                dups[vals] = set([ds.id])  # avoid duplicate entries
+        # only return entries with more than one dataset
+        return list({k: v for k, v in dups.items() if len(v) > 1})
 
     def can_update(self,
                    dataset: Dataset,
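The substance of the change is in the final hunk: duplicate ids are now collected into a set rather than a list, and value groups backed by only one dataset are filtered out before returning. One subtlety: calling list() on a dict iterates its keys, so the rewritten return statement yields only the grouped field values, not (values, ids) pairs. Below is a minimal standalone sketch of the grouping-and-filtering pattern itself, using hypothetical stand-in records (the ids and field values are invented; this is not the datacube index API):

    import uuid
    from collections import defaultdict
    from typing import Dict, Set, Tuple
    from uuid import UUID

    # Hypothetical records: (dataset_id, (cloud_cover, dataset_maturity))
    a, b, c = uuid.uuid4(), uuid.uuid4(), uuid.uuid4()
    records = [
        (a, (10.0, "final")),
        (b, (10.0, "final")),    # same field values as a -> a genuine duplicate group
        (c, (55.0, "interim")),  # unique values -> filtered out below
        (a, (10.0, "final")),    # repeat of a -> absorbed by the set, not double-counted
    ]

    dups: Dict[Tuple, Set[UUID]] = defaultdict(set)
    for ds_id, vals in records:
        dups[vals].add(ds_id)  # set.add() avoids duplicate id entries

    # keep only value groups shared by more than one distinct dataset
    duplicates = {vals: ids for vals, ids in dups.items() if len(ids) > 1}
    assert duplicates == {(10.0, "final"): {a, b}}

The defaultdict stands in for the commit's explicit if/else branch; the grouping behaviour is the same.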
4 changes: 1 addition & 3 deletions integration_tests/index/test_memory_index.py
@@ -201,9 +201,7 @@ def test_mem_ds_search_dups(mem_eo3_data):
     dc, ls8_id, wo_id = mem_eo3_data
     ls8_ds = dc.index.datasets.get(ls8_id)
     dup_results = dc.index.datasets.search_product_duplicates(ls8_ds.product, "cloud_cover", "dataset_maturity")
-    assert len(dup_results) == 1
-    assert dup_results[0][0].cloud_cover == ls8_ds.metadata.cloud_cover
-    assert ls8_id in dup_results[0][1]
+    assert len(dup_results) == 0
 
 
 def test_mem_ds_locations(mem_eo3_data):
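With the singleton filter in place, the fixture's lone ls8 dataset no longer produces a result: no (cloud_cover, dataset_maturity) combination is shared by two datasets, so the expected length drops from 1 to 0. A short usage sketch of the updated behaviour, reusing dc and ls8_ds from the test above (the print strings are illustrative only):

    # dc and ls8_ds as set up in test_mem_ds_search_dups
    dup_results = dc.index.datasets.search_product_duplicates(
        ls8_ds.product, "cloud_cover", "dataset_maturity"
    )
    if not dup_results:
        # every field-value combination is unique in the index,
        # so all singleton groups were filtered out
        print("no duplicates for", ls8_ds.product.name)
    for group in dup_results:
        # with this commit, each entry is the namedtuple of duplicated field values
        print("duplicated values:", group)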
