Skip to content

Commit 2ec5f55

Browse files
thinkharderdev authored and avantgardnerio committed
Add JoinContext with JoinLeftData to TaskContext in HashJoinExec (#300)
* Add JoinContext with JoinLeftData to TaskContext in HashJoinExec
* Expose random state as const
* re-export ahash::RandomState
* JoinContext default impl
* Add debug log when setting join left data
1 parent 97058f5 commit 2ec5f55

File tree

3 files changed

+39
-2
lines changed

3 files changed

+39
-2
lines changed

datafusion/physical-plan/src/joins/hash_join.rs

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,26 @@ use arrow_buffer::BooleanBuffer;
7878
use datafusion_expr::Operator;
7979
use datafusion_physical_expr_common::datum::compare_op_for_nested;
8080
use futures::{ready, Stream, StreamExt, TryStreamExt};
81+
use log::debug;
8182
use parking_lot::Mutex;
8283

84+
/// `RandomState` built with all four seeds fixed at `0`, exposed as a public constant.
///
/// NOTE(review): presumably this is the state used to hash join keys, so external
/// callers can reproduce the same hash values — confirm against its usages.
pub const RANDOM_STATE: RandomState = RandomState::with_seeds(0, 0, 0, 0);
85+
86+
/// Holder for an optional, shared handle to a join's [`JoinLeftData`].
///
/// The derived `Default` produces a context with nothing stored yet.
#[derive(Default)]
87+
pub struct JoinContext {
88+
/// Stored build-side data behind a mutex; `None` until a value has been set.
build_state: Mutex<Option<Arc<JoinLeftData>>>,
89+
}
90+
91+
impl JoinContext {
92+
/// Stores `state` as the current build state.
///
/// Any previously stored value is replaced; the old value returned by
/// `replace` is discarded.
pub fn set_build_state(&self, state: Arc<JoinLeftData>) {
93+
self.build_state.lock().replace(state);
94+
}
95+
96+
/// Returns a clone of the stored build state, or `None` if nothing is stored.
///
/// Cloning the `Arc` hands out another handle to the same `JoinLeftData`
/// rather than copying the data.
pub fn get_build_state(&self) -> Option<Arc<JoinLeftData>> {
97+
self.build_state.lock().clone()
98+
}
99+
}
100+
83101
/// Wrapper around a shared, dynamically dispatched [`SharedJoinStateImpl`].
pub struct SharedJoinState {
84102
/// The trait-object implementation this wrapper holds.
state_impl: Arc<dyn SharedJoinStateImpl>,
85103
}
@@ -129,7 +147,7 @@ pub trait SharedJoinStateImpl: Send + Sync + 'static {
129147
/// A `BooleanBufferBuilder` guarded by a `Mutex`.
type SharedBitmapBuilder = Mutex<BooleanBufferBuilder>;
130148

131149
/// HashTable and input data for the left (build side) of a join
132-
struct JoinLeftData {
150+
pub struct JoinLeftData {
133151
/// The hash table with indices into `batch`
134152
hash_map: JoinHashMap,
135153
/// The input rows for the build side
@@ -167,6 +185,10 @@ impl JoinLeftData {
167185
}
168186
}
169187

188+
/// Reports whether the build-side hash map contains an entry for `hash`.
///
/// Delegates directly to `JoinHashMap::contains_hash` on `self.hash_map`.
pub fn contains_hash(&self, hash: u64) -> bool {
189+
self.hash_map.contains_hash(hash)
190+
}
191+
170192
/// return a reference to the hash map
171193
fn hash_map(&self) -> &JoinHashMap {
172194
&self.hash_map
@@ -787,6 +809,7 @@ impl ExecutionPlan for HashJoinExec {
787809

788810
let distributed_state =
789811
context.session_config().get_extension::<SharedJoinState>();
812+
let join_context = context.session_config().get_extension::<JoinContext>();
790813

791814
let join_metrics = BuildProbeJoinMetrics::new(partition, &self.metrics);
792815
let left_fut = match self.mode {
@@ -874,6 +897,7 @@ impl ExecutionPlan for HashJoinExec {
874897
batch_size,
875898
hashes_buffer: vec![],
876899
right_side_ordered: self.right.output_ordering().is_some(),
900+
join_context,
877901
}))
878902
}
879903

@@ -1199,6 +1223,7 @@ struct HashJoinStream {
11991223
hashes_buffer: Vec<u64>,
12001224
/// Specifies whether the right side has an ordering to potentially preserve
12011225
right_side_ordered: bool,
1226+
join_context: Option<Arc<JoinContext>>,
12021227
}
12031228

12041229
impl RecordBatchStream for HashJoinStream {
@@ -1411,6 +1436,11 @@ impl HashJoinStream {
14111436
.get_shared(cx))?;
14121437
build_timer.done();
14131438

1439+
if let Some(ctx) = self.join_context.as_ref() {
1440+
debug!("setting join left data in join context");
1441+
ctx.set_build_state(Arc::clone(&left_data));
1442+
}
1443+
14141444
self.state = HashJoinStreamState::FetchProbeBatch;
14151445
self.build_side = BuildSide::Ready(BuildSideReadyState { left_data });
14161446

datafusion/physical-plan/src/joins/mod.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@
1919
2020
pub use cross_join::CrossJoinExec;
2121
pub use hash_join::{
22-
HashJoinExec, SharedJoinState, SharedJoinStateImpl, SharedProbeState,
22+
HashJoinExec, JoinContext, JoinLeftData, SharedJoinState, SharedJoinStateImpl,
23+
SharedProbeState, RANDOM_STATE,
2324
};
2425
pub use nested_loop_join::NestedLoopJoinExec;
2526
// Note: SortMergeJoin is not used in plans yet
@@ -33,6 +34,8 @@ mod stream_join_utils;
3334
mod symmetric_hash_join;
3435
pub mod utils;
3536

37+
/// Public alias for `ahash::RandomState`, so it can be named through this module.
pub type RandomState = ahash::RandomState;
38+
3639
#[cfg(test)]
3740
pub mod test_utils;
3841

datafusion/physical-plan/src/joins/utils.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ impl JoinHashMap {
140140
next: vec![0; capacity],
141141
}
142142
}
143+
144+
/// Returns `true` if `self.map` has an entry whose first tuple element equals `hash`.
///
/// Only the stored hash value is compared; the second tuple element is ignored.
/// A `true` result therefore says a row with this hash value exists, not that
/// any particular key matches.
pub fn contains_hash(&self, hash: u64) -> bool {
145+
self.map.find(hash, |(h, _)| *h == hash).is_some()
146+
}
143147
}
144148

145149
// Type of offsets for obtaining indices from JoinHashMap.

0 commit comments

Comments
 (0)