Skip to content

Commit

Permalink
Revert "Revert "MutableDictionaryArray rewrite: use values stored in …
Browse files Browse the repository at this point in the history
…the array instead of the hash->hash map" (jorgecarleitao#1559)"

This reverts commit b2017d7.
  • Loading branch information
aldanor committed Sep 3, 2023
1 parent b2017d7 commit c4d5203
Show file tree
Hide file tree
Showing 8 changed files with 469 additions and 98 deletions.
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ hash_hasher = "^2.0.3"
simdutf8 = "0.1.4"

# A Rust port of SwissTable
hashbrown = { version = "0.14", default-features = false, optional = true }
hashbrown = { version = "0.14", default-features = false, features = ["ahash"] }

# for timezone support
chrono-tz = { version = "0.8", optional = true }
Expand Down Expand Up @@ -243,7 +243,7 @@ compute_merge_sort = ["itertools", "compute_sort"]
compute_nullif = ["compute_comparison"]
compute_partition = ["compute_sort"]
compute_regex_match = ["regex"]
compute_sort = ["compute_take", "hashbrown"]
compute_sort = ["compute_take"]
compute_substring = []
compute_take = []
compute_temporal = []
Expand Down
1 change: 1 addition & 0 deletions src/array/dictionary/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ mod iterator;
mod mutable;
use crate::array::specification::check_indexes_unchecked;
mod typed_iterator;
mod value_map;

use crate::array::dictionary::typed_iterator::{DictValue, DictionaryValuesIterTyped};
pub use iterator::*;
Expand Down
135 changes: 57 additions & 78 deletions src/array/dictionary/mutable.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
use std::hash::{Hash, Hasher};
use std::{collections::hash_map::DefaultHasher, sync::Arc};

use hash_hasher::HashedMap;
use std::hash::Hash;
use std::sync::Arc;

use crate::array::indexable::{AsIndexed, Indexable};
use crate::{
array::{primitive::MutablePrimitiveArray, Array, MutableArray, TryExtend, TryPush},
bitmap::MutableBitmap,
datatypes::DataType,
error::{Error, Result},
error::Result,
};

use super::value_map::ValueMap;
use super::{DictionaryArray, DictionaryKey};

/// A mutable, strong-typed version of [`DictionaryArray`].
Expand All @@ -30,55 +30,29 @@ use super::{DictionaryArray, DictionaryKey};
#[derive(Debug)]
pub struct MutableDictionaryArray<K: DictionaryKey, M: MutableArray> {
data_type: DataType,
map: ValueMap<K, M>,
// invariant: `max(keys) < map.values().len()`
keys: MutablePrimitiveArray<K>,
map: HashedMap<u64, K>,
// invariant: `keys.len() <= values.len()`
values: M,
}

impl<K: DictionaryKey, M: MutableArray> From<MutableDictionaryArray<K, M>> for DictionaryArray<K> {
fn from(mut other: MutableDictionaryArray<K, M>) -> Self {
fn from(other: MutableDictionaryArray<K, M>) -> Self {
// Safety - the invariant of this struct ensures that this is up-held
unsafe {
DictionaryArray::<K>::try_new_unchecked(
other.data_type,
other.keys.into(),
other.values.as_box(),
other.map.into_boxed().as_box(),
)
.unwrap()
}
}
}

impl<K: DictionaryKey, M: MutableArray> From<M> for MutableDictionaryArray<K, M> {
fn from(values: M) -> Self {
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
}
}

impl<K: DictionaryKey, M: MutableArray + Default> MutableDictionaryArray<K, M> {
/// Creates an empty [`MutableDictionaryArray`].
pub fn new() -> Self {
let values = M::default();
Self {
data_type: DataType::Dictionary(
K::KEY_TYPE,
Box::new(values.data_type().clone()),
false,
),
keys: MutablePrimitiveArray::<K>::new(),
map: HashedMap::default(),
values,
}
Self::try_empty(M::default()).unwrap()
}
}

Expand All @@ -89,22 +63,34 @@ impl<K: DictionaryKey, M: MutableArray + Default> Default for MutableDictionaryA
}

impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
/// Returns whether the value should be pushed to the values or not
fn try_push_valid<T: Hash>(&mut self, value: &T) -> Result<bool> {
let mut hasher = DefaultHasher::new();
value.hash(&mut hasher);
let hash = hasher.finish();
match self.map.get(&hash) {
Some(key) => {
self.keys.push(Some(*key));
Ok(false)
}
None => {
let key = K::try_from(self.map.len()).map_err(|_| Error::Overflow)?;
self.map.insert(hash, key);
self.keys.push(Some(key));
Ok(true)
}
/// Creates an empty [`MutableDictionaryArray`] from a given empty values array.
/// # Errors
/// Errors if the array is non-empty.
pub fn try_empty(values: M) -> Result<Self> {
Ok(Self::from_value_map(ValueMap::<K, M>::try_empty(values)?))
}

/// Creates an empty [`MutableDictionaryArray`] preloaded with a given dictionary of values.
/// Indices associated with those values are automatically assigned based on the order of
/// the values.
/// # Errors
/// Errors if there's more values than the maximum value of `K`.
pub fn from_values(values: M) -> Result<Self>
where
M: Indexable,
M::Type: Eq + Hash,
{
Ok(Self::from_value_map(ValueMap::<K, M>::from_values(values)?))
}

fn from_value_map(value_map: ValueMap<K, M>) -> Self {
let keys = MutablePrimitiveArray::<K>::new();
let data_type =
DataType::Dictionary(K::KEY_TYPE, Box::new(value_map.data_type().clone()), false);
Self {
data_type,
map: value_map,
keys,
}
}

Expand All @@ -113,14 +99,9 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
self.keys.push(None)
}

/// returns a mutable reference to the inner values.
fn mut_values(&mut self) -> &mut M {
&mut self.values
}

/// returns a reference to the inner values.
pub fn values(&self) -> &M {
&self.values
self.map.values()
}

/// converts itself into [`Arc<dyn Array>`]
Expand All @@ -142,15 +123,10 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {

/// Shrinks the capacity of the [`MutableDictionaryArray`] to fit its current length.
pub fn shrink_to_fit(&mut self) {
self.values.shrink_to_fit();
self.map.shrink_to_fit();
self.keys.shrink_to_fit();
}

/// Returns the dictionary map
pub fn map(&self) -> &HashedMap<u64, K> {
&self.map
}

/// Returns the dictionary keys
pub fn keys(&self) -> &MutablePrimitiveArray<K> {
&self.keys
Expand All @@ -160,7 +136,7 @@ impl<K: DictionaryKey, M: MutableArray> MutableDictionaryArray<K, M> {
DictionaryArray::<K>::try_new(
self.data_type.clone(),
std::mem::take(&mut self.keys).into(),
self.values.as_box(),
self.map.take_into(),
)
.unwrap()
}
Expand Down Expand Up @@ -208,17 +184,20 @@ impl<K: DictionaryKey, M: 'static + MutableArray> MutableArray for MutableDictio
}
}

impl<K, M, T: Hash> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
impl<K, M, T> TryExtend<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + TryExtend<Option<T>>,
M: MutableArray + Indexable + TryExtend<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
{
fn try_extend<II: IntoIterator<Item = Option<T>>>(&mut self, iter: II) -> Result<()> {
for value in iter {
if let Some(value) = value {
if self.try_push_valid(&value)? {
self.mut_values().try_extend(std::iter::once(Some(value)))?;
}
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_extend(std::iter::once(Some(v))))?;
self.keys.try_push(Some(key))?;
} else {
self.push_null();
}
Expand All @@ -230,19 +209,19 @@ where
impl<K, M, T> TryPush<Option<T>> for MutableDictionaryArray<K, M>
where
K: DictionaryKey,
M: MutableArray + TryPush<Option<T>>,
T: Hash,
M: MutableArray + Indexable + TryPush<Option<T>>,
T: AsIndexed<M>,
M::Type: Eq + Hash,
{
fn try_push(&mut self, item: Option<T>) -> Result<()> {
if let Some(value) = item {
if self.try_push_valid(&value)? {
self.values.try_push(Some(value))
} else {
Ok(())
}
let key = self
.map
.try_push_valid(value, |arr, v| arr.try_push(Some(v)))?;
self.keys.try_push(Some(key))?;
} else {
self.push_null();
Ok(())
}
Ok(())
}
}
Loading

0 comments on commit c4d5203

Please sign in to comment.