Skip to content

Commit 8bcb3c1

Browse files
committed
add dask to requirements and fix missing pattern for unit filter
1 parent a9a39cd commit 8bcb3c1

File tree

2 files changed

+46
-31
lines changed

2 files changed

+46
-31
lines changed

btrdb/stream.py

Lines changed: 45 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -210,7 +210,7 @@ def count(
210210
pointwidth = min(
211211
pointwidth,
212212
pw.from_nanoseconds(to_nanoseconds(end) - to_nanoseconds(start)) - 1,
213-
)
213+
)
214214
points = self.aligned_windows(start, end, pointwidth, version)
215215
return sum([point.count for point, _ in points])
216216

@@ -583,7 +583,7 @@ def insert(self, data, merge="never"):
583583
version = 0
584584
i = 0
585585
while i < len(data):
586-
thisBatch = data[i: i + INSERT_BATCH_SIZE]
586+
thisBatch = data[i : i + INSERT_BATCH_SIZE]
587587
version = self._btrdb.ep.insert(self._uuid, thisBatch, merge)
588588
i += INSERT_BATCH_SIZE
589589
return version
@@ -1435,20 +1435,22 @@ def __repr__(self):
14351435
## StreamSet Classes
14361436
##########################################################################
14371437

1438+
14381439
@delayed
14391440
def get_metadata(stream):
1440-
columns = ['collection', 'tags', 'annotations', 'stream', 'uuid']
1441+
columns = ["collection", "tags", "annotations", "stream", "uuid"]
14411442
stream.refresh_metadata()
1442-
metadata = {c: (getattr(stream, f"_{c}") if c != 'stream' else stream)
1443-
for c in columns}
1443+
metadata = {
1444+
c: (getattr(stream, f"_{c}") if c != "stream" else stream) for c in columns
1445+
}
14441446
return metadata
14451447

1448+
14461449
class StreamSetBase(Sequence):
14471450
"""
14481451
A lightweight wrapper around a list of stream objects
14491452
"""
14501453

1451-
14521454
def __init__(self, streams):
14531455
self._streams = streams
14541456
if len(self._streams) < 1:
@@ -1467,11 +1469,11 @@ def __init__(self, streams):
14671469
self.depth = 0
14681470

14691471
# create a DataFrame to store the metadata for filtering
1470-
_columns = ['collection', 'tags', 'annotations', 'stream', 'uuid']
14711472
_metadata = compute([get_metadata(s) for s in self._streams])[0]
14721473
_metadata = pd.DataFrame(_metadata)
1473-
self._metadata = (_metadata.join(pd.json_normalize(_metadata['tags']))
1474-
.drop(columns=['tags', 'annotations']))
1474+
self._metadata = _metadata.join(pd.json_normalize(_metadata["tags"])).drop(
1475+
columns=["tags", "annotations"]
1476+
)
14751477

14761478
@property
14771479
def allow_window(self):
@@ -1735,55 +1737,67 @@ def filter(
17351737
# filter by collection
17361738
if collection is not None:
17371739
if isinstance(collection, RE_PATTERN):
1738-
tf = (tf & obj._metadata.collection.str.contains(collection.pattern, case=False, regex=True))
1740+
tf = tf & obj._metadata.collection.str.contains(
1741+
collection.pattern, case=False, regex=True
1742+
)
17391743
elif isinstance(collection, str):
1740-
tf = (tf & obj._metadata.collection.str.contains(collection, case=False, regex=False))
1744+
tf = tf & obj._metadata.collection.str.contains(
1745+
collection, case=False, regex=False
1746+
)
17411747
else:
17421748
raise BTRDBTypeError("collection must be string or compiled regex")
17431749

17441750
# filter by name
17451751
if name is not None:
17461752
if isinstance(name, RE_PATTERN):
1747-
tf = (tf & obj._metadata.name.str.contains(name.pattern, case=False, regex=True))
1753+
tf = tf & obj._metadata.name.str.contains(
1754+
name.pattern, case=False, regex=True
1755+
)
17481756
elif isinstance(name, str):
1749-
tf = (tf & obj._metadata.name.str.contains(name, case=False, regex=False))
1757+
tf = tf & obj._metadata.name.str.contains(name, case=False, regex=False)
17501758
else:
17511759
raise BTRDBTypeError("name must be string or compiled regex")
17521760

17531761
# filter by unit
17541762
if unit is not None:
17551763
if isinstance(unit, RE_PATTERN):
1756-
tf = (tf & obj._metadata.unit.str.contains(unit, case=False, regex=True))
1764+
tf = tf & obj._metadata.unit.str.contains(
1765+
unit.pattern, case=False, regex=True
1766+
)
17571767
elif isinstance(unit, str):
1758-
tf = (tf & obj._metadata.name.str.contains(unit, case=False, regex=False))
1768+
tf = tf & obj._metadata.name.str.contains(unit, case=False, regex=False)
17591769
else:
17601770
raise BTRDBTypeError("unit must be string or compiled regex")
17611771

17621772
# filter by tags
17631773
if tags:
1764-
tf = (tf & obj._metadata.loc[:, obj._metadata.columns.isin(tags.keys())]
1765-
.apply(lambda x: x.str.contains(tags[x.name], case=False, regex=False))
1766-
.all(axis=1))
1774+
tf = tf & obj._metadata.loc[
1775+
:, obj._metadata.columns.isin(tags.keys())
1776+
].apply(
1777+
lambda x: x.str.contains(tags[x.name], case=False, regex=False)
1778+
).all(
1779+
axis=1
1780+
)
17671781
obj._metadata = obj._metadata[tf]
17681782

17691783
# filter by annotations
17701784
if annotations:
1771-
_annotations = pd.json_normalize(obj._metadata['annotations'])
1785+
_annotations = pd.json_normalize(obj._metadata["annotations"])
17721786
if not _annotations.columns.isin(annotations.keys()).any():
17731787
raise BTRDBValueError("annotations key not found")
1774-
_metadata = obj._metadata.join(
1775-
_annotations,
1776-
rsuffix='_annotations'
1777-
).drop(columns=['annotations'])
1788+
obj._metadata = obj._metadata.join(
1789+
_annotations, rsuffix="_annotations"
1790+
).drop(columns=["annotations"])
17781791

1779-
_columns = list(annotations.keys()) + list(map(lambda s: "".join([s,'_annotations']), annotations.keys()))
1792+
_columns = list(annotations.keys()) + list(
1793+
map(lambda s: "".join([s, "_annotations"]), annotations.keys())
1794+
)
17801795
# filters if the subset of the annotations matches the given annotations
1781-
tf = (tf
1782-
& obj._metadata.loc[:, obj._metadata.columns.isin(_columns)]
1783-
.apply(lambda x: x.str.contains(annotations[x.name], case=False, regex=False))
1784-
.all(axis=1))
1796+
tf = tf & obj._metadata.loc[:, obj._metadata.columns.isin(_columns)].apply(
1797+
lambda x: x.str.contains(annotations[x.name], case=False, regex=False)
1798+
).all(axis=1)
17851799
obj._metadata = obj._metadata[tf]
1786-
obj._streams = obj._metadata['stream']
1800+
obj._streams = obj._metadata["stream"]
17871801
return obj
17881802

17891803
def clone(self):
@@ -2195,7 +2209,7 @@ def arrow_values(
21952209
pa.field(str(s.uuid), pa.float64(), nullable=False)
21962210
for s in self._streams
21972211
],
2198-
)
2212+
)
21992213
data = pa.Table.from_arrays(
22002214
[pa.array([]) for i in range(1 + len(self._streams))], schema=schema
22012215
)
@@ -2277,4 +2291,4 @@ def _coalesce_table_deque(tables: deque):
22772291
main_table = main_table.join(
22782292
t2, "time", join_type="full outer", right_suffix=f"_{idx}"
22792293
)
2280-
return main_table
2294+
return main_table

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
# pip-compile --output-file=requirements.txt --resolver=backtracking pyproject.toml
66
#
77
# This file was modified to remove version pins.
8+
dask
89
certifi
910
# via btrdb (pyproject.toml)
1011
grpcio

0 commit comments

Comments (0)