
Commit c07a5be

Multistream read bench insert bench (#26)
* Fix multistream endpoint bugs:
  * The streamset was passing the incorrect params to the endpoint.
  * The endpoint does not return a `version` in its response, just `stat` and `arrowBytes`. The params have been updated, and a NoneType is passed around to ignore the lack of version info, which lets us use the same logic for all bytes decoding.
* Add multistream benchmark methods for timesnap and no timesnap.
1 parent f41f711 commit c07a5be

File tree

3 files changed: +97 -2 lines changed

3 files changed

+97
-2
lines changed

benchmarks/benchmark_streamset_reads.py

Lines changed: 93 additions & 0 deletions
@@ -288,6 +288,92 @@ def time_streamset_arrow_aligned_windows_values(
     return results


+def time_streamset_arrow_multistream_raw_values_non_timesnapped(
+    streamset: btrdb.stream.StreamSet,
+    start: int,
+    end: int,
+    version: int = 0,
+    sampling_frequency: int = None,
+) -> Dict[str, Union[str, int, float]]:
+    """Use the arrow multistream endpoint, which joins the stream data on-server before sending it to the client.
+
+    We set the sampling rate to 0 to ensure that we do not time snap and instead perform a full outer join on
+    the streams.
+
+    Parameters
+    ----------
+    streamset : btrdb.stream.StreamSet, required
+        The streamset to perform the multistream query on.
+    start : int, required
+        The start time (in nanoseconds) to query raw data from.
+    end : int, required
+        The end time (in nanoseconds), exclusive, to query raw data to.
+    version : int, optional, default=0
+        The version of the stream to pin against; currently unused.
+    sampling_frequency : int, optional, ignored
+        The sampling frequency of the data stream in Hz.
+
+    Notes
+    -----
+    sampling_frequency is not used here; it is set manually to 0.
+    """
+    streamset = streamset.filter(start=start, end=end, sampling_frequency=0)
+    versions = {s.uuid: 0 for s in streamset}
+    streamset = streamset.pin_versions(versions)
+    tic = perf_counter()
+    vals = streamset.arrow_values()
+    toc = perf_counter()
+    queried_points = vals.num_rows * len(streamset)
+    run_time = toc - tic
+    results = _create_streamset_result_dict(
+        streamset=streamset, total_time=run_time, point_count=queried_points, version=0
+    )
+    return results
+
+
+def time_streamset_arrow_multistream_raw_values_timesnapped(
+    streamset: btrdb.stream.StreamSet,
+    start: int,
+    end: int,
+    sampling_frequency: int,
+    version: int = 0,
+) -> Dict[str, Union[str, int, float]]:
+    """Use the arrow multistream endpoint, which joins the stream data on-server before sending it to the client.
+
+    We set a sampling rate to ensure that the returned data is time snapped.
+
+    Parameters
+    ----------
+    streamset : btrdb.stream.StreamSet, required
+        The streamset to perform the multistream query on.
+    start : int, required
+        The start time (in nanoseconds) to query raw data from.
+    end : int, required
+        The end time (in nanoseconds), exclusive, to query raw data to.
+    sampling_frequency : int, required
+        The common sampling frequency (in Hz) to snap the data points to.
+    version : int, optional, default=0
+        The version of the stream to pin against; currently unused.
+    """
+    streamset = streamset.filter(
+        start=start, end=end, sampling_frequency=sampling_frequency
+    )
+    versions = {s.uuid: 0 for s in streamset}
+    streamset = streamset.pin_versions(versions)
+    tic = perf_counter()
+    vals = streamset.arrow_values()
+    toc = perf_counter()
+    queried_points = vals.num_rows * len(streamset)
+    run_time = toc - tic
+    results = _create_streamset_result_dict(
+        streamset=streamset, total_time=run_time, point_count=queried_points, version=0
+    )
+    return results
+
+
 def _create_streamset_result_dict(
     streamset: btrdb.stream.StreamSet,
     point_count: int,
@@ -337,6 +423,13 @@ def main():
         res = f(streamset, start, end, pointwidth=pointwidth, version=0)
         res["func"] = f.__name__
         res_list.append(res)
+    for f in [
+        time_streamset_arrow_multistream_raw_values_non_timesnapped,
+        time_streamset_arrow_multistream_raw_values_timesnapped,
+    ]:
+        res = f(streamset, start, end, sampling_frequency=2, version=0)
+        res["func"] = f.__name__
+        res_list.append(res)

     return res_list
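
For context, a minimal usage sketch of the two new benchmark helpers outside of main(); the connection string, stream UUIDs, and time window below are hypothetical placeholders, and the import path assumes the repository root is on the Python path:

    import btrdb
    from benchmarks.benchmark_streamset_reads import (
        time_streamset_arrow_multistream_raw_values_non_timesnapped,
        time_streamset_arrow_multistream_raw_values_timesnapped,
    )

    # Placeholder connection string and stream UUIDs; substitute real ones.
    conn = btrdb.connect("localhost:4410")
    streamset = conn.streams(
        "11111111-1111-1111-1111-111111111111",
        "22222222-2222-2222-2222-222222222222",
    )
    start = 1_500_000_000_000_000_000   # nanoseconds
    end = start + 60 * 10**9            # a 60 second window

    # Full outer join (no time snapping) vs. snapping to a common 2 Hz grid.
    res_raw = time_streamset_arrow_multistream_raw_values_non_timesnapped(
        streamset, start, end, version=0
    )
    res_snapped = time_streamset_arrow_multistream_raw_values_timesnapped(
        streamset, start, end, sampling_frequency=2, version=0
    )
    print(res_raw)
    print(res_snapped)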

btrdb/endpoint.py

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ def arrowMultiValues(self, uu_list, start, end, version_list, snap_periodNS):
        )
        for result in self.stub.ArrowMultiValues(params):
            check_proto_stat(result.stat)
-            yield result.arrowBytes, result.versionMajor
+            yield result.arrowBytes, None

    @error_handler
    def arrowInsertValues(self, uu: uuid.UUID, values: bytearray, policy: str):
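
To illustrate the commit message's point that yielding None in place of the missing version keeps the byte-decoding logic uniform, here is a rough sketch (the helper name tables_from_endpoint is hypothetical and pyarrow is assumed; the library's actual materialization code differs in detail):

    import pyarrow as pa

    def tables_from_endpoint(byte_version_pairs):
        """Decode (arrowBytes, version) pairs into pyarrow Tables.

        version is None for the multistream endpoint and an int for the
        single-stream endpoints; either way it is not needed to decode the
        bytes, so a single code path handles both.
        """
        for arrow_bytes, _version in byte_version_pairs:
            reader = pa.ipc.open_stream(arrow_bytes)
            yield reader.read_all()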

btrdb/stream.py

Lines changed: 3 additions & 1 deletion
@@ -1837,8 +1837,10 @@ def _arrow_multivalues(self, period_ns: int):
        params = self._params_from_filters()
        versions = self.versions()
        params["uu_list"] = [s.uuid for s in self._streams]
-        params["versions"] = [versions[s.uuid] for s in self._streams]
+        params["version_list"] = [versions[s.uuid] for s in self._streams]
        params["snap_periodNS"] = period_ns
+        # dict.pop(key, default) returns default when the key is missing
+        _ = params.pop("sampling_frequency", None)
        arr_bytes = self._btrdb.ep.arrowMultiValues(**params)
        # exhausting the generator from above
        bytes_materialized = list(arr_bytes)
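
A small standalone sketch (not library code; the function and literal values are illustrative only) of why sampling_frequency must be popped before the keyword expansion: arrowMultiValues has no such parameter, so a leftover key from the filter would raise a TypeError.

    # Stand-in with the same parameter names as the endpoint method above.
    def arrow_multi_values(uu_list, start, end, version_list, snap_periodNS):
        return uu_list, start, end, version_list, snap_periodNS

    params = {
        "uu_list": ["uuid-a", "uuid-b"],   # placeholder identifiers
        "start": 0,
        "end": 100,
        "version_list": [0, 0],
        "snap_periodNS": 500_000_000,
        "sampling_frequency": 2,           # left over from StreamSet.filter()
    }
    params.pop("sampling_frequency", None)  # without this, the call below raises TypeError
    arrow_multi_values(**params)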
