update memory driver search duplicates implementation
Ariana Barzinpour committed Dec 4, 2023
1 parent dffa40d commit 94d6a5b
Showing 2 changed files with 13 additions and 11 deletions.
20 changes: 12 additions & 8 deletions datacube/index/memory/_datasets.py
@@ -131,34 +131,38 @@ def search_product_duplicates(self,
                                   product: Product,
                                   *args: Union[str, Field]
                                   ) -> Iterable[Tuple[Tuple, Iterable[UUID]]]:
-        field_names: List[str] = [arg.name if isinstance(arg, Field) else arg for arg in args]
-        # Typing note: mypy cannot handle dynamically created namedtuples
-        GroupedVals = namedtuple('search_result', field_names)  # type: ignore[misc]
-
+        """
+        Find dataset ids of a given product that have duplicates of the given set of field names.
+        Returns each set of those field values and the datasets that have them.
+        Note that this implementation does not account for slight timestamp discrepancies.
+        """
         def to_field(f: Union[str, Field]) -> Field:
             if isinstance(f, str):
                 f = product.metadata_type.dataset_fields[f]
             assert isinstance(f, Field), "Not a field: %r" % (f,)
             return f
 
         fields = [to_field(f) for f in args]
+        # Typing note: mypy cannot handle dynamically created namedtuples
+        GroupedVals = namedtuple('search_result', list(f.name for f in fields))  # type: ignore[misc]
 
         def values(ds: Dataset) -> GroupedVals:
             vals = []
             for field in fields:
                 vals.append(field.extract(ds.metadata_doc))  # type: ignore[attr-defined]
             return GroupedVals(*vals)
 
-        dups: Dict[Tuple, List[UUID]] = {}
+        dups: Dict[Tuple, set[UUID]] = {}
         for ds in self.active_by_id.values():
             if ds.product.name != product.name:
                 continue
             vals = values(ds)
             if vals in dups:
-                dups[vals].append(ds.id)
+                dups[vals].add(ds.id)
             else:
-                dups[vals] = [ds.id]
-        return list(dups.items())
+                dups[vals] = set([ds.id])  # avoid duplicate entries
+        # only return entries with more than one dataset
+        return list({k: v for k, v in dups.items() if len(v) > 1})
 
     def can_update(self,
                    dataset: Dataset,
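The substance of the change is in the final hunk: duplicate ids are now collected into a set rather than a list, and value groups backed by only one dataset are filtered out before returning. One subtlety: calling list() on a dict iterates its keys, so the rewritten return statement yields only the grouped field values, not (values, ids) pairs. Below is a minimal standalone sketch of the grouping-and-filtering pattern itself, using hypothetical stand-in records (the ids and field values are invented; this is not the datacube index API):

    import uuid
    from collections import defaultdict
    from typing import Dict, Set, Tuple
    from uuid import UUID

    # Hypothetical records: (dataset_id, (cloud_cover, dataset_maturity))
    a, b, c = uuid.uuid4(), uuid.uuid4(), uuid.uuid4()
    records = [
        (a, (10.0, "final")),
        (b, (10.0, "final")),    # same field values as a -> a genuine duplicate group
        (c, (55.0, "interim")),  # unique values -> filtered out below
        (a, (10.0, "final")),    # repeat of a -> absorbed by the set, not double-counted
    ]

    dups: Dict[Tuple, Set[UUID]] = defaultdict(set)
    for ds_id, vals in records:
        dups[vals].add(ds_id)  # set.add() avoids duplicate id entries

    # keep only value groups shared by more than one distinct dataset
    duplicates = {vals: ids for vals, ids in dups.items() if len(ids) > 1}
    assert duplicates == {(10.0, "final"): {a, b}}

The defaultdict stands in for the commit's explicit if/else branch; the grouping behaviour is the same.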
4 changes: 1 addition & 3 deletions integration_tests/index/test_memory_index.py
@@ -201,9 +201,7 @@ def test_mem_ds_search_dups(mem_eo3_data):
     dc, ls8_id, wo_id = mem_eo3_data
     ls8_ds = dc.index.datasets.get(ls8_id)
     dup_results = dc.index.datasets.search_product_duplicates(ls8_ds.product, "cloud_cover", "dataset_maturity")
-    assert len(dup_results) == 1
-    assert dup_results[0][0].cloud_cover == ls8_ds.metadata.cloud_cover
-    assert ls8_id in dup_results[0][1]
+    assert len(dup_results) == 0
 
 
 def test_mem_ds_locations(mem_eo3_data):
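With the singleton filter in place, the fixture's lone ls8 dataset no longer produces a result: no (cloud_cover, dataset_maturity) combination is shared by two datasets, so the expected length drops from 1 to 0. A short usage sketch of the updated behaviour, reusing dc and ls8_ds from the test above (the print strings are illustrative only):

    # dc and ls8_ds as set up in test_mem_ds_search_dups
    dup_results = dc.index.datasets.search_product_duplicates(
        ls8_ds.product, "cloud_cover", "dataset_maturity"
    )
    if not dup_results:
        # every field-value combination is unique in the index,
        # so all singleton groups were filtered out
        print("no duplicates for", ls8_ds.product.name)
    for group in dup_results:
        # with this commit, each entry is the namedtuple of duplicated field values
        print("duplicated values:", group)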
