@@ -446,58 +446,150 @@ async fn chunk_stream_io_loop(
     chunk_infos: Vec<RecordBatch>,
     output_channel: Sender<ApiResult<ChunksWithSegment>>,
 ) -> Result<(), DataFusionError> {
-    let chunk_infos: Vec<_> = chunk_infos.into_iter().map(Into::into).collect();
-
-    // TODO(zehiko) same as previously with get_chunks, we keep sending 1 request per segment.
-    // As these batches are sorted per segment (see docs above), this ensures that ordering by
-    // segment id is preserved regardless of how server might order responses (in the case of having
-    // batches with different segments in the same request). However, quick testing shows that this
-    // is at least 2x slower than sending all segments in one request. Consider providing ordering
-    // guarantees server side in the future.
-
-    // Convert to concurrent processing using buffered streams
-    const CONCURRENT_REQUESTS: usize = 16; // Adjust based on your needs
-
-    futures_util::stream::iter(chunk_infos)
-        .map(|chunk_info| {
-            let mut client = client.clone();
-            let output_channel = output_channel.clone();
-            async move {
-                let fetch_chunks_request = FetchChunksRequest {
-                    chunk_infos: vec![chunk_info],
-                };
-
-                let fetch_chunks_response_stream = client
-                    .inner()
-                    .fetch_chunks(fetch_chunks_request)
-                    .instrument(tracing::trace_span!("chunk_stream_io_loop"))
-                    .await
-                    .map_err(|err| exec_datafusion_err!("{err}"))?
-                    .into_inner();
-
-                // Then we need to fully decode these chunks, i.e. both the transport layer (Protobuf)
-                // and the app layer (Arrow).
-                let mut chunk_stream =
-                    re_redap_client::fetch_chunks_response_to_chunk_and_segment_id(
-                        fetch_chunks_response_stream,
-                    );
-
-                while let Some(chunk_and_segment_id) = chunk_stream.next().await {
-                    if output_channel.send(chunk_and_segment_id).await.is_err() {
-                        break;
+    // Pipeline configuration
+    const CONCURRENT_REQUESTS: usize = 16;
+    const BUFFER_SIZE_MB: usize = 512;
+    const BUFFER_SIZE_BYTES: usize = BUFFER_SIZE_MB * 1024 * 1024;
+
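+    // The pipeline below has two stages: a pool of concurrent fetchers that pull chunks from the
+    // server, and a single buffering task that re-orders results by input index before forwarding
+    // them to `output_channel`.
+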
+    // Create intermediate channel for ordered buffering
+    let (intermediate_tx, mut intermediate_rx) =
+        tokio::sync::mpsc::channel::<(usize, ApiResult<ChunksWithSegment>, u64)>(1024);
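+    // Each message is (input index, fetched chunk result, chunk byte length): the index drives
+    // the re-ordering, the byte length the flush threshold.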
+
+    // We need to pair each original RecordBatch with its converted DataframePart so that the
+    // chunk byte length can be extracted later.
+    use re_protos::common::v1alpha1::DataframePart;
+    let chunk_info_pairs: Vec<_> = chunk_infos
+        .into_iter()
+        .enumerate()
+        .map(|(index, batch)| {
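+            // `into()` consumes the batch, so convert a clone and keep the original around for
+            // byte-length extraction.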
+            let chunk_info: DataframePart = batch.clone().into();
+            (index, batch, chunk_info)
+        })
+        .collect();
+
+    // Spawn concurrent fetchers
+    let fetcher_handle = tokio::spawn(async move {
+        futures_util::stream::iter(chunk_info_pairs)
+            .map(|(index, original_batch, chunk_info)| {
+                let mut client = client.clone();
+                let intermediate_tx = intermediate_tx.clone();
+                async move {
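+                    // Still one chunk info per request (see the removed TODO above): responses
+                    // for a given input batch arrive on their own dedicated stream.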
+                    let fetch_chunks_request = FetchChunksRequest {
+                        chunk_infos: vec![chunk_info],
+                    };
+
+                    let fetch_chunks_response_stream = client
+                        .inner()
+                        .fetch_chunks(fetch_chunks_request)
+                        .instrument(tracing::trace_span!("chunk_stream_io_loop"))
+                        .await
+                        .map_err(|err| exec_datafusion_err!("{err}"))?
+                        .into_inner();
+
+                    let mut chunk_stream =
+                        re_redap_client::fetch_chunks_response_to_chunk_and_segment_id(
+                            fetch_chunks_response_stream,
+                        );
+
+                    // Extract the byte length from the original RecordBatch; it is the same for
+                    // every chunk in this stream, so compute it once up front.
+                    let byte_len = extract_chunk_byte_len(&original_batch)?;
+
+                    while let Some(chunk_and_segment_id) = chunk_stream.next().await {
+                        if intermediate_tx
+                            .send((index, chunk_and_segment_id, byte_len))
+                            .await
+                            .is_err()
+                        {
+                            break;
+                        }
+                    }
+
+                    Ok::<(), DataFusionError>(())
+                }
+            })
+            .buffered(CONCURRENT_REQUESTS)
+            .try_collect::<Vec<_>>()
+            .await
+    });
+
+    // Spawn ordered buffer manager
+    let buffer_handle = tokio::spawn(async move {
+        let mut buffer = Vec::new();
+        let mut total_bytes = 0u64;
+
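+        // `recv()` yields `None` once the fetcher task has finished and dropped all senders,
+        // which ends this loop.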
+        while let Some((index, chunk_result, byte_len)) = intermediate_rx.recv().await {
+            buffer.push((index, chunk_result, byte_len));
+            total_bytes += byte_len;
+
+            // Check if we should flush (either the buffer size threshold was reached or no more
+            // data is expected)
+            if total_bytes >= BUFFER_SIZE_BYTES as u64 || intermediate_rx.is_closed() {
+                // Sort the buffer by original index to restore the input ordering within this
+                // buffered window
+                buffer.sort_by_key(|(index, _, _)| *index);
+
+                // Flush the ordered chunks to the output
+                for (_, chunk_result, _) in buffer.drain(..) {
+                    if output_channel.send(chunk_result).await.is_err() {
+                        return Ok(());
                     }
                 }
 
-                Ok::<(), DataFusionError>(())
+                total_bytes = 0;
             }
-        })
-        .buffered(CONCURRENT_REQUESTS)
-        .try_collect::<Vec<_>>()
-        .await?;
+        }
+
+        // Flush any remaining chunks
+        if !buffer.is_empty() {
+            buffer.sort_by_key(|(index, _, _)| *index);
+            for (_, chunk_result, _) in buffer.drain(..) {
+                if output_channel.send(chunk_result).await.is_err() {
+                    break;
+                }
+            }
+        }
+
+        Ok::<(), DataFusionError>(())
+    });
+
+    // Wait for both tasks to complete
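+    // (`try_join!` surfaces panics and cancellations from either `JoinHandle` as a `JoinError`;
+    // the two `?`s below then propagate errors raised inside the tasks.)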
+    let (fetcher_result, buffer_result) = tokio::try_join!(fetcher_handle, buffer_handle)
+        .map_err(|err| exec_datafusion_err!("Task join error: {err}"))?;
+
+    fetcher_result?;
+    buffer_result?;
 
     Ok(())
 }
 
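+/// Reads the `chunk_byte_len` value from a single-row chunk-info `RecordBatch`.
+///
+/// The buffer task uses this to meter how many bytes are currently buffered.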
+fn extract_chunk_byte_len(chunk_info_batch: &RecordBatch) -> Result<u64, DataFusionError> {
+    use arrow::array::AsArray as _;
+    use re_protos::cloud::v1alpha1::FetchChunksRequest;
+
+    // Find the chunk_byte_len column in the batch
+    let schema = chunk_info_batch.schema();
+    let chunk_byte_len_col_idx = schema
+        .column_with_name(FetchChunksRequest::FIELD_CHUNK_BYTE_LEN)
+        .ok_or_else(|| {
+            exec_datafusion_err!(
+                "Missing {} column in chunk info",
+                FetchChunksRequest::FIELD_CHUNK_BYTE_LEN
+            )
+        })?
+        .0;
+
+    let chunk_byte_len_array = chunk_info_batch.column(chunk_byte_len_col_idx);
+
+    // Assuming it's a UInt64 array with a single value (since we're processing one chunk at a
+    // time). Note that `as_primitive` panics if the column has a different type.
+    let uint64_array = chunk_byte_len_array.as_primitive::<arrow::datatypes::UInt64Type>();
+
+    if uint64_array.len() != 1 {
+        return Err(exec_datafusion_err!(
+            "Expected exactly one chunk_byte_len value, got {}",
+            uint64_array.len()
+        ));
+    }
+
+    Ok(uint64_array.value(0))
+}
+
 impl ExecutionPlan for SegmentStreamExec {
     fn name(&self) -> &'static str {
         "SegmentStreamExec"