
Commit 8ce9bb5

Implement native decoding and decompression
Parent: 053b7cc

14 files changed: 291 additions & 239 deletions

native/core/src/execution/jni_api.rs

Lines changed: 33 additions & 18 deletions
@@ -17,6 +17,7 @@
 
 //! Define JNI APIs which can be called from Java/Scala.
 
+use super::{serde, utils::SparkArrowConvert, CometMemoryPool};
 use arrow::datatypes::DataType as ArrowDataType;
 use arrow_array::RecordBatch;
 use datafusion::{
@@ -40,8 +41,6 @@ use jni::{
 use std::time::{Duration, Instant};
 use std::{collections::HashMap, sync::Arc, task::Poll};
 
-use super::{serde, utils::SparkArrowConvert, CometMemoryPool};
-
 use crate::{
     errors::{try_unwrap_or_throw, CometError, CometResult},
     execution::{
@@ -53,13 +52,15 @@ use crate::{
 use datafusion_comet_proto::spark_operator::Operator;
 use datafusion_common::ScalarValue;
 use futures::stream::StreamExt;
+use jni::objects::JByteBuffer;
 use jni::{
     objects::GlobalRef,
     sys::{jboolean, jdouble, jintArray, jobjectArray, jstring},
 };
 use tokio::runtime::Runtime;
 
 use crate::execution::operators::ScanExec;
+use crate::execution::shuffle::read_ipc_compressed;
 use crate::execution::spark_plan::SparkPlan;
 use log::info;
 
@@ -95,7 +96,7 @@ struct ExecutionContext {
 
 /// Accept serialized query plan and return the address of the native query plan.
 /// # Safety
-/// This function is inheritly unsafe since it deals with raw pointers passed from JNI.
+/// This function is inherently unsafe since it deals with raw pointers passed from JNI.
 #[no_mangle]
 pub unsafe extern "system" fn Java_org_apache_comet_Native_createPlan(
     e: JNIEnv,
@@ -231,7 +232,7 @@ fn prepare_output(
     array_addrs: jlongArray,
     schema_addrs: jlongArray,
     output_batch: RecordBatch,
-    exec_context: &mut ExecutionContext,
+    validate: bool,
 ) -> CometResult<jlong> {
     let array_address_array = unsafe { JLongArray::from_raw(array_addrs) };
     let num_cols = env.get_array_length(&array_address_array)? as usize;
@@ -255,7 +256,7 @@
         )));
     }
 
-    if exec_context.debug_native {
+    if validate {
         // Validate the output arrays.
         for array in results.iter() {
             let array_data = array.to_data();
@@ -275,9 +276,6 @@
         i += 1;
     }
 
-    // Update metrics
-    update_metrics(env, exec_context)?;
-
     Ok(num_rows as jlong)
 }
 
@@ -298,7 +296,7 @@ fn pull_input_batches(exec_context: &mut ExecutionContext) -> Result<(), CometEr
 /// Accept serialized query plan and the addresses of Arrow Arrays from Spark,
 /// then execute the query. Return addresses of arrow vector.
 /// # Safety
-/// This function is inheritly unsafe since it deals with raw pointers passed from JNI.
+/// This function is inherently unsafe since it deals with raw pointers passed from JNI.
 #[no_mangle]
 pub unsafe extern "system" fn Java_org_apache_comet_Native_executePlan(
     e: JNIEnv,
@@ -356,22 +354,22 @@ pub unsafe extern "system" fn Java_org_apache_comet_Native_executePlan(
         let next_item = exec_context.stream.as_mut().unwrap().next();
         let poll_output = exec_context.runtime.block_on(async { poll!(next_item) });
 
+        // Update metrics
+        update_metrics(&mut env, exec_context)?;
+
         match poll_output {
             Poll::Ready(Some(output)) => {
+                // prepare output for FFI transfer
                 return prepare_output(
                     &mut env,
                     array_addrs,
                     schema_addrs,
                     output?,
-                    exec_context,
+                    exec_context.debug_native,
                 );
             }
             Poll::Ready(None) => {
                 // Reaches EOF of output.
-
-                // Update metrics
-                update_metrics(&mut env, exec_context)?;
-
                 if exec_context.explain_native {
                     if let Some(plan) = &exec_context.root_op {
                         let formatted_plan_str =
@@ -391,9 +389,6 @@ pub unsafe extern "system" fn Java_org_apache_comet_Native_executePlan(
             // A poll pending means there are more than one blocking operators,
             // we don't need go back-forth between JVM/Native. Just keeping polling.
             Poll::Pending => {
-                // Update metrics
-                update_metrics(&mut env, exec_context)?;
-
                 // Pull input batches
                 pull_input_batches(exec_context)?;
 
@@ -459,7 +454,7 @@ fn get_execution_context<'a>(id: i64) -> &'a mut ExecutionContext {
 
 /// Used by Comet shuffle external sorter to write sorted records to disk.
 /// # Safety
-/// This function is inheritly unsafe since it deals with raw pointers passed from JNI.
+/// This function is inherently unsafe since it deals with raw pointers passed from JNI.
 #[no_mangle]
 pub unsafe extern "system" fn Java_org_apache_comet_Native_writeSortedFileNative(
     e: JNIEnv,
@@ -544,3 +539,23 @@ pub extern "system" fn Java_org_apache_comet_Native_sortRowPartitionsNative(
         Ok(())
     })
 }
+
+#[no_mangle]
+/// Used by Comet native shuffle reader
+/// # Safety
+/// This function is inherently unsafe since it deals with raw pointers passed from JNI.
+pub unsafe extern "system" fn Java_org_apache_comet_Native_decodeShuffleBlock(
+    e: JNIEnv,
+    _class: JClass,
+    byte_buffer: JByteBuffer,
+    array_addrs: jlongArray,
+    schema_addrs: jlongArray,
+) -> jlong {
+    try_unwrap_or_throw(&e, |mut env| {
+        let raw_pointer = env.get_direct_buffer_address(&byte_buffer)?;
+        let length = env.get_direct_buffer_capacity(&byte_buffer)?;
+        let slice: &[u8] = unsafe { std::slice::from_raw_parts(raw_pointer, length) };
+        let batch = read_ipc_compressed(slice)?;
+        prepare_output(&mut env, array_addrs, schema_addrs, batch, false)
+    })
+}

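Note on the JNI contract above: the two long arrays passed to Java_org_apache_comet_Native_decodeShuffleBlock carry one Arrow C Data Interface address pair per column of the decoded batch (the shuffle block encodes the column count so the JVM can size these arrays; see shuffle_writer.rs below), and prepare_output exports each decoded column to those addresses before returning the row count. The helper below is a hypothetical, standalone sketch of that per-column hand-off using arrow-rs's ffi::to_ffi; it is not Comet's actual SparkArrowConvert-based code.

use arrow::array::Array;
use arrow::error::ArrowError;
use arrow::ffi::{to_ffi, FFI_ArrowArray, FFI_ArrowSchema};
use arrow::record_batch::RecordBatch;

/// Hypothetical illustration only: export each column of `batch` through the
/// Arrow C Data Interface to caller-provided struct addresses.
///
/// # Safety
/// Every address must point to a writable, properly aligned ArrowArray /
/// ArrowSchema struct owned by the caller (the JVM, in Comet's case).
unsafe fn export_columns(
    batch: &RecordBatch,
    array_addrs: &[i64],
    schema_addrs: &[i64],
) -> Result<i64, ArrowError> {
    assert_eq!(batch.num_columns(), array_addrs.len());
    assert_eq!(batch.num_columns(), schema_addrs.len());
    for (i, column) in batch.columns().iter().enumerate() {
        // build the C Data Interface structs for this column
        let (ffi_array, ffi_schema) = to_ffi(&column.to_data())?;
        // hand ownership to the caller by writing the structs at the given addresses
        std::ptr::write(array_addrs[i] as *mut FFI_ArrowArray, ffi_array);
        std::ptr::write(schema_addrs[i] as *mut FFI_ArrowSchema, ffi_schema);
    }
    Ok(batch.num_rows() as i64)
}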
native/core/src/execution/shuffle/mod.rs

Lines changed: 3 additions & 1 deletion
@@ -19,4 +19,6 @@ mod list;
 mod map;
 pub mod row;
 mod shuffle_writer;
-pub use shuffle_writer::{write_ipc_compressed, CompressionCodec, ShuffleWriterExec};
+pub use shuffle_writer::{
+    read_ipc_compressed, write_ipc_compressed, CompressionCodec, ShuffleWriterExec,
+};

native/core/src/execution/shuffle/shuffle_writer.rs

Lines changed: 25 additions & 8 deletions
@@ -21,6 +21,7 @@ use crate::{
     common::bit::ceil,
     errors::{CometError, CometResult},
 };
+use arrow::ipc::reader::StreamReader;
 use arrow::{datatypes::*, ipc::writer::StreamWriter};
 use async_trait::async_trait;
 use bytes::Buf;
@@ -1555,7 +1556,7 @@ pub enum CompressionCodec {
 pub fn write_ipc_compressed<W: Write + Seek>(
     batch: &RecordBatch,
     output: &mut W,
-    codec: &CompressionCodec,
+    compression_codec: &CompressionCodec,
     ipc_time: &Time,
 ) -> Result<usize> {
     if batch.num_rows() == 0 {
@@ -1565,10 +1566,14 @@
     let mut timer = ipc_time.timer();
     let start_pos = output.stream_position()?;
 
-    // write ipc_length placeholder
+    // write message length placeholder
     output.write_all(&[0u8; 8])?;
 
-    let output = match codec {
+    // write number of columns because JVM side needs to know how many addresses to allocate
+    let field_count = batch.schema().fields().len();
+    output.write_all(&field_count.to_le_bytes())?;
+
+    let output = match compression_codec {
         CompressionCodec::None => {
             let mut arrow_writer = StreamWriter::try_new(output, &batch.schema())?;
             arrow_writer.write(batch)?;
@@ -1587,18 +1592,25 @@
 
     // fill ipc length
     let end_pos = output.stream_position()?;
-    let ipc_length = end_pos - start_pos - 8;
+    let compressed_length = end_pos - start_pos - 8;
 
     // fill ipc length
     output.seek(SeekFrom::Start(start_pos))?;
-    output.write_all(&ipc_length.to_le_bytes()[..])?;
+    output.write_all(&compressed_length.to_le_bytes()[..])?;
     output.seek(SeekFrom::Start(end_pos))?;
 
     timer.stop();
 
     Ok((end_pos - start_pos) as usize)
 }
 
+pub fn read_ipc_compressed(bytes: &[u8]) -> Result<RecordBatch> {
+    let decoder = zstd::Decoder::new(bytes)?;
+    let mut reader = StreamReader::try_new(decoder, None)?;
+    // TODO check for None
+    reader.next().unwrap().map_err(|e| e.into())
+}
+
 /// A stream that yields no record batches which represent end of output.
 pub struct EmptyStream {
     /// Schema representing the data
@@ -1648,18 +1660,23 @@ mod test {
 
     #[test]
     #[cfg_attr(miri, ignore)] // miri can't call foreign function `ZSTD_createCCtx`
-    fn write_ipc_zstd() {
+    fn roundtrip_ipc_zstd() {
         let batch = create_batch(8192);
         let mut output = vec![];
         let mut cursor = Cursor::new(&mut output);
-        write_ipc_compressed(
+        let length = write_ipc_compressed(
             &batch,
             &mut cursor,
             &CompressionCodec::Zstd(1),
             &Time::default(),
         )
         .unwrap();
-        assert_eq!(40218, output.len());
+        assert_eq!(40226, output.len());
+        assert_eq!(40226, length);
+
+        let ipc_without_length_prefix = &output[16..];
+        let batch2 = read_ipc_compressed(ipc_without_length_prefix).unwrap();
+        assert_eq!(batch, batch2);
     }
 
     #[test]

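The block layout produced by write_ipc_compressed is: an 8-byte little-endian length covering everything after the length field, an 8-byte little-endian column count (written as a usize, so 8 bytes on 64-bit targets), and then the optionally zstd-compressed Arrow IPC stream holding a single batch; the round-trip test above skips the first 16 bytes so read_ipc_compressed sees only the compressed stream. The sketch below mirrors that framing for the Zstd case. It is not Comet code; it assumes only the arrow and zstd crates and a 64-bit target.

use std::io::Write;
use std::sync::Arc;

use arrow::array::Int32Array;
use arrow::datatypes::{DataType, Field, Schema};
use arrow::ipc::reader::StreamReader;
use arrow::ipc::writer::StreamWriter;
use arrow::record_batch::RecordBatch;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    let batch = RecordBatch::try_new(
        schema.clone(),
        vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
    )?;

    // encode one batch as a zstd-compressed Arrow IPC stream
    let mut encoder = zstd::Encoder::new(Vec::new(), 1)?;
    {
        let mut writer = StreamWriter::try_new(&mut encoder, schema.as_ref())?;
        writer.write(&batch)?;
        writer.finish()?;
    }
    let payload = encoder.finish()?;

    // prepend the two 8-byte little-endian prefixes: the length of everything
    // after the length field, then the column count the JVM uses to size its
    // Arrow address arrays
    let mut block = Vec::new();
    block.write_all(&((payload.len() + 8) as u64).to_le_bytes())?;
    block.write_all(&(batch.num_columns() as u64).to_le_bytes())?;
    block.write_all(&payload)?;

    // decode: skip the 16-byte header and read the batch back, as the test above does
    let decoder = zstd::Decoder::new(&block[16..])?;
    let mut reader = StreamReader::try_new(decoder, None)?;
    let decoded = reader.next().expect("expected one batch")?;
    assert_eq!(batch, decoded);
    Ok(())
}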
spark/src/main/scala/org/apache/comet/Native.scala

Lines changed: 17 additions & 0 deletions
@@ -19,6 +19,8 @@
 
 package org.apache.comet
 
+import java.nio.ByteBuffer
+
 import org.apache.spark.CometTaskMemoryManager
 import org.apache.spark.sql.comet.CometMetricNode
 
@@ -139,4 +141,19 @@ class Native extends NativeBase {
    * the size of the array.
    */
   @native def sortRowPartitionsNative(addr: Long, size: Long): Unit
+
+  /**
+   * Decompress and decode a native shuffle block.
+   * @param shuffleBlock
+   *   the encoded and compressed shuffle block.
+   * @param arrayAddrs
+   *   the addresses of the Arrow array structures to populate, one per column.
+   * @param schemaAddrs
+   *   the addresses of the Arrow schema structures to populate, one per column.
+   */
+  @native def decodeShuffleBlock(
+      shuffleBlock: ByteBuffer,
+      arrayAddrs: Array[Long],
+      schemaAddrs: Array[Long]): Long
+
 }

spark/src/main/scala/org/apache/spark/sql/comet/CometBroadcastExchangeExec.scala

Lines changed: 22 additions & 2 deletions
@@ -19,20 +19,24 @@
 
 package org.apache.spark.sql.comet
 
+import java.io.DataInputStream
+import java.nio.channels.Channels
 import java.util.UUID
 import java.util.concurrent.{Future, TimeoutException, TimeUnit}
 
 import scala.concurrent.{ExecutionContext, Promise}
 import scala.concurrent.duration.NANOSECONDS
 import scala.util.control.NonFatal
 
-import org.apache.spark.{broadcast, Partition, SparkContext, TaskContext}
+import org.apache.spark.{broadcast, Partition, SparkContext, SparkEnv, TaskContext}
 import org.apache.spark.comet.shims.ShimCometBroadcastExchangeExec
+import org.apache.spark.io.CompressionCodec
 import org.apache.spark.launcher.SparkLauncher
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.Statistics
+import org.apache.spark.sql.comet.execution.shuffle.ArrowReaderIterator
 import org.apache.spark.sql.errors.QueryExecutionErrors
 import org.apache.spark.sql.execution.{ColumnarToRowExec, SparkPlan, SQLExecution}
 import org.apache.spark.sql.execution.adaptive.{AQEShuffleReadExec, ShuffleQueryStageExec}
@@ -299,8 +303,24 @@ class CometBatchRDD(
   override def compute(split: Partition, context: TaskContext): Iterator[ColumnarBatch] = {
     val partition = split.asInstanceOf[CometBatchPartition]
     partition.value.value.toIterator
-      .flatMap(CometExec.decodeBatches(_, this.getClass.getSimpleName))
+      .flatMap(decodeBatches(_, this.getClass.getSimpleName))
   }
+
+  /**
+   * Decodes the byte arrays back to ColumnarBatches.
+   */
+  private def decodeBatches(bytes: ChunkedByteBuffer, source: String): Iterator[ColumnarBatch] = {
+    if (bytes.size == 0) {
+      return Iterator.empty
+    }
+
+    // decompress with the Spark codec, not Comet's, so this is not compatible with Comet shuffle
+    val codec = CompressionCodec.createCodec(SparkEnv.get.conf)
+    val cbbis = bytes.toInputStream()
+    val ins = new DataInputStream(codec.compressedInputStream(cbbis))
+    new ArrowReaderIterator(Channels.newChannel(ins), source)
+  }
+
 }
 
 class CometBatchPartition(

spark/src/main/scala/org/apache/spark/sql/comet/CometMetricNode.scala

Lines changed: 1 addition & 0 deletions
@@ -136,6 +136,7 @@ object CometMetricNode {
     "mempool_time" -> SQLMetrics.createNanoTimingMetric(sc, "memory pool time"),
     "repart_time" -> SQLMetrics.createNanoTimingMetric(sc, "repartition time"),
     "ipc_time" -> SQLMetrics.createNanoTimingMetric(sc, "encoding and compression time"),
+    "decodeTime" -> SQLMetrics.createNanoTimingMetric(sc, "decoding and decompression time"),
     "spill_count" -> SQLMetrics.createMetric(sc, "number of spills"),
     "spilled_bytes" -> SQLMetrics.createMetric(sc, "spilled bytes"),
     "input_batches" -> SQLMetrics.createMetric(sc, "number of input batches"))
