Skip to content

Commit 82a7a7e

Browse files
authored
feat: teach vx.array how to convert Python range into SequenceArray (#4571)
In theory, this allows Python users to avoid allocation for large range arrays. In practice, I think we probably always canonicalize these arrays.
1 parent 0036b15 commit 82a7a7e

File tree

5 files changed

+309
-3
lines changed

5 files changed

+309
-3
lines changed

vortex-python/python/vortex/_lib/arrays.pyi

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class Array:
2222
def from_arrow(
2323
obj: pa.Array[pa.Scalar[pa.DataType]] | pa.ChunkedArray[pa.Scalar[pa.DataType]] | pa.Table,
2424
) -> Array: ...
25+
# Mirrors the native signature `(range, *, dtype=None)`. The first parameter is
# declared positional-only because the native keyword is `range`, not `obj`;
# `dtype` is keyword-only and optional.
@staticmethod
def from_range(obj: range, /, *, dtype: DType | None = None) -> Array: ...
2527
def to_arrow_array(self) -> pa.Array[pa.Scalar[pa.DataType]]: ...
2628
@property
2729
def id(self) -> str: ...

vortex-python/python/vortex/arrays.py

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,7 +317,8 @@ def array(
317317
| pyarrow.ChunkedArray[pyarrow.Scalar[Any]] # pyright: ignore[reportExplicitAny]
318318
| pyarrow.Table
319319
| list[Any] # pyright: ignore[reportExplicitAny]
320-
| pandas.DataFrame,
320+
| pandas.DataFrame
321+
| range,
321322
) -> Array:
322323
"""The main entry point for creating Vortex arrays from other Python objects.
323324
@@ -394,10 +395,37 @@ def array(
394395
]
395396
]
396397
398+
Initialize a Vortex array from a range:
399+
400+
>>> vortex.array(range(-3, 3)).to_arrow_array()
401+
<pyarrow.lib.Int64Array object at ...>
402+
[
403+
-3,
404+
-2,
405+
-1,
406+
0,
407+
1,
408+
2
409+
]
410+
411+
With a step:
412+
413+
>>> vortex.array(range(-1_000_000, 10_000_000, 2_000_000)).to_arrow_array()
414+
<pyarrow.lib.Int64Array object at ...>
415+
[
416+
-1000000,
417+
1000000,
418+
3000000,
419+
5000000,
420+
7000000,
421+
9000000
422+
]
397423
"""
398424

399425
if isinstance(obj, list):
400426
return Array.from_arrow(pyarrow.array(obj))
427+
if isinstance(obj, range):
428+
return Array.from_range(obj)
401429
try:
402430
import pandas
403431

vortex-python/src/arrays/mod.rs

Lines changed: 77 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,16 @@ pub(crate) mod fastlanes;
77
pub(crate) mod from_arrow;
88
mod native;
99
pub(crate) mod py;
10+
mod range_to_sequence;
1011

1112
use arrow_array::{Array as ArrowArray, ArrayRef as ArrowArrayRef};
1213
use pyo3::exceptions::{PyTypeError, PyValueError};
1314
use pyo3::prelude::*;
14-
use pyo3::types::{PyDict, PyList};
15+
use pyo3::types::{PyDict, PyList, PyRange, PyRangeMethods};
1516
use vortex::arrays::ChunkedVTable;
1617
use vortex::arrow::IntoArrowArray;
1718
use vortex::compute::{Operator, compare, take};
19+
use vortex::dtype::{DType, Nullability, PType, match_each_integer_ptype};
1820
use vortex::error::VortexError;
1921
use vortex::{Array, ArrayRef, ToCanonical};
2022

@@ -181,7 +183,10 @@ impl PyArray {
181183

182184
/// Convert a PyArrow object into a Vortex array.
183185
///
184-
/// One of :class:`pyarrow.Array`, :class:`pyarrow.ChunkedArray`, or :class:`pyarrow.Table`.
186+
/// Parameters
187+
/// ----------
188+
/// obj: pyarrow.Array | pyarrow.ChunkedArray | pyarrow.Table
189+
/// The array to convert.
185190
///
186191
/// Returns
187192
/// -------
@@ -191,6 +196,76 @@ impl PyArray {
191196
from_arrow::from_arrow(&obj)
192197
}
193198

199+
/// Convert a Python range into a Vortex array.
200+
///
201+
/// Unless the array is empty, the encoding of the array is Sequence, which uses O(1) bytes to
202+
/// represent an array of any size.
203+
///
204+
/// Parameters
205+
/// ----------
206+
/// range: range
207+
/// The range to convert.
208+
///
209+
/// Returns
210+
/// -------
211+
/// :class:`~vortex.Array`
212+
///
213+
///
214+
/// Examples
215+
/// --------
216+
///
217+
/// ```python
218+
/// >>> array = vx.Array.from_range(range(0, 10))
219+
/// >>> array
220+
/// <vortex.SequenceArray object at ...>
221+
/// >>> array.to_arrow_array()
222+
/// <pyarrow.lib.Int64Array object at ...>
223+
/// [
224+
/// 0,
225+
/// 1,
226+
/// 2,
227+
/// 3,
228+
/// 4,
229+
/// 5,
230+
/// 6,
231+
/// 7,
232+
/// 8,
233+
/// 9
234+
/// ]
235+
/// ```
236+
#[staticmethod]
237+
#[pyo3(signature = (range, *, dtype = None))]
238+
fn from_range(range: Bound<PyAny>, dtype: Option<Bound<PyDType>>) -> PyResult<PyArrayRef> {
239+
let range = range.downcast::<PyRange>()?;
240+
let start = range.start()?;
241+
let stop = range.stop()?;
242+
let step = range.step()?;
243+
244+
let (ptype, dtype) = if let Some(dtype) = dtype {
245+
let dtype = dtype.downcast::<PyDType>()?.get().inner().clone();
246+
let DType::Primitive(ptype, ..) = &dtype else {
247+
return Err(PyValueError::new_err(
248+
"Cannot construct non-numeric array from a range.",
249+
));
250+
};
251+
(*ptype, dtype)
252+
} else {
253+
let ptype = if start > 0 && stop > 0 {
254+
PType::U64
255+
} else {
256+
PType::I64
257+
};
258+
let dtype = DType::Primitive(ptype, Nullability::NonNullable);
259+
(ptype, dtype)
260+
};
261+
262+
let array = match_each_integer_ptype!(ptype, |T| {
263+
range_to_sequence::sequence_array_from_range::<T>(start, stop, step, dtype)
264+
})?;
265+
266+
Ok(PyVortex(array))
267+
}
268+
194269
/// Convert this array to a PyArrow array.
195270
///
196271
/// .. seealso::
Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
use vortex::ArrayRef;
5+
use vortex::arrays::PrimitiveArray;
6+
use vortex::buffer::Buffer;
7+
use vortex::dtype::{DType, NativePType, Nullability};
8+
use vortex::encodings::sequence::SequenceArray;
9+
use vortex::error::{VortexExpect, VortexResult, vortex_bail};
10+
use vortex::scalar::PValue;
11+
use vortex::validity::Validity;
12+
13+
pub fn sequence_array_from_range<T: NativePType + TryFrom<isize> + Into<PValue>>(
14+
start: isize,
15+
stop: isize,
16+
step: isize,
17+
dtype: DType,
18+
) -> VortexResult<ArrayRef> {
19+
if step == 0 {
20+
vortex_bail!("Step must not be zero");
21+
}
22+
23+
let Some(len) = range_len(start, stop, step) else {
24+
let validity = match dtype.nullability() {
25+
Nullability::NonNullable => Validity::NonNullable,
26+
Nullability::Nullable => Validity::AllValid,
27+
};
28+
return Ok(PrimitiveArray::new::<T>(Buffer::empty(), validity).to_array());
29+
};
30+
let Ok(start) = T::try_from(start) else {
31+
vortex_bail!(
32+
"Start, {}, does not fit in requested dtype: {}",
33+
start,
34+
dtype
35+
);
36+
};
37+
let Ok(step) = T::try_from(step) else {
38+
vortex_bail!("Step, {}, does not fit in requested dtype: {}", step, dtype);
39+
};
40+
41+
Ok(SequenceArray::typed_new::<T>(start, step, dtype.nullability(), len)?.to_array())
42+
}
43+
44+
/// Number of elements in Python's `range(start, stop, step)`.
///
/// Returns `None` when the bounds are out of order for the direction of `step` (`start > stop`
/// with a positive step, `stop > start` with a negative step) and `Some(0)` when
/// `start == stop`.
///
/// # Panics
///
/// Panics if `step` is zero.
fn range_len(start: isize, stop: isize, step: isize) -> Option<usize> {
    assert!(step != 0);

    let out_of_order = if step > 0 { start > stop } else { stop > start };
    if out_of_order {
        return None;
    }

    // Work in `usize`: the distance between two `isize` values and the magnitude of
    // `isize::MIN` both fit, so none of this can overflow the way signed arithmetic on
    // `stop - start + step - 1` or `-step` can.
    let distance = stop.abs_diff(start);
    let stride = step.unsigned_abs();
    Some(distance.div_ceil(stride))
}
67+
68+
#[cfg(test)]
mod test {
    use vortex::IntoArray as _;
    use vortex::arrow::IntoArrowArray;
    use vortex::buffer::buffer;
    use vortex::dtype::{DType, Nullability, PType};

    use crate::arrays::range_to_sequence::{range_len, sequence_array_from_range};

    /// Non-nullable primitive dtype for the given ptype.
    fn non_nullable(ptype: PType) -> DType {
        DType::Primitive(ptype, Nullability::NonNullable)
    }

    /// `range_len` agrees with the length of the equivalent Python range.
    #[test]
    fn test_range_len() {
        // (start, stop, step, expected length)
        let cases = [
            (0, 10, 1, Some(10)),
            (0, 10, 5, Some(2)),
            (0, 10, 10, Some(1)),
            (0, 10, 100, Some(1)),
            (-5, -5, 1, Some(0)),
            (-5, 5, 3, Some(4)),
            (-7, -5, 1, Some(2)),
            (3, -3, -1, Some(6)),
            (10, 3, 1, None),
            (0, 10, -1, None),
        ];
        for (start, stop, step, expected) in cases {
            assert_eq!(range_len(start, stop, step), expected);
        }
    }

    /// Ranges convert to arrays with the requested dtype and the expected elements.
    #[test]
    fn test_sequence_array_from_len() {
        // Unsigned, unit step.
        let u16_dtype = non_nullable(PType::U16);
        let actual = sequence_array_from_range::<u16>(0, 10, 1, u16_dtype.clone()).unwrap();
        assert_eq!(actual.dtype(), &u16_dtype);
        let expected = buffer![0u16, 1, 2, 3, 4, 5, 6, 7, 8, 9].into_array();
        assert_eq!(
            &actual.into_arrow_preferred().unwrap(),
            &expected.into_arrow_preferred().unwrap()
        );

        // Signed, step larger than one.
        let i32_dtype = non_nullable(PType::I32);
        let actual = sequence_array_from_range::<i32>(0, 10, 5, i32_dtype.clone()).unwrap();
        assert_eq!(actual.dtype(), &i32_dtype);
        let expected = buffer![0i32, 5].into_array();
        assert_eq!(
            &actual.into_arrow_preferred().unwrap(),
            &expected.into_arrow_preferred().unwrap()
        );

        // Range crossing zero.
        let i8_dtype = non_nullable(PType::I8);
        let actual = sequence_array_from_range::<i8>(-5, 5, 3, i8_dtype.clone()).unwrap();
        assert_eq!(actual.dtype(), &i8_dtype);
        let expected = buffer![-5i8, -2, 1, 4].into_array();
        assert_eq!(
            &actual.into_arrow_preferred().unwrap(),
            &expected.into_arrow_preferred().unwrap()
        );

        // Descending range.
        let i8_dtype = non_nullable(PType::I8);
        let actual = sequence_array_from_range::<i8>(3, -3, -1, i8_dtype.clone()).unwrap();
        assert_eq!(actual.dtype(), &i8_dtype);
        let expected = buffer![3i8, 2, 1, 0, -1, -2].into_array();
        assert_eq!(
            &actual.into_arrow_preferred().unwrap(),
            &expected.into_arrow_preferred().unwrap()
        );

        // A negative step cannot be represented in an unsigned dtype.
        let u32_dtype = non_nullable(PType::U32);
        let outcome = sequence_array_from_range::<u32>(1_000_000, 10, -500_000, u32_dtype);
        assert!(
            outcome.is_err_and(|err| err.to_string().contains("does not fit in requested dtype"))
        );

        // The same range succeeds with a signed dtype.
        let i32_dtype = non_nullable(PType::I32);
        let actual =
            sequence_array_from_range::<i32>(1_000_000, 10, -500_000, i32_dtype.clone()).unwrap();
        assert_eq!(actual.dtype(), &i32_dtype);
        let expected = buffer![1_000_000i32, 500_000].into_array();
        assert_eq!(
            &actual.into_arrow_preferred().unwrap(),
            &expected.into_arrow_preferred().unwrap()
        );
    }
}
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright the Vortex contributors
3+
4+
import vortex as vx
5+
6+
7+
def test_from_range_0_10_1():
    """A unit-step range starting at zero converts element by element."""
    source = range(0, 10)
    arr = vx.array(source)
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == list(source)
10+
11+
12+
def test_from_range_0_10_5():
    """A step that evenly divides the span yields every multiple below the stop."""
    source = range(0, 10, 5)
    arr = vx.array(source)
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == list(source)
15+
16+
17+
def test_from_range_0_10_10():
    """A step equal to the span yields only the start value."""
    arr = vx.array(range(0, 10, 10))
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == [0]
20+
21+
22+
def test_from_range_0_10_100():
    """A step larger than the span yields only the start value."""
    arr = vx.array(range(0, 10, 100))
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == [0]
25+
26+
27+
def test_from_range_minus_5_5_1():
    """A unit-step range crossing zero converts element by element."""
    source = range(-5, 5)
    arr = vx.array(source)
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == list(source)
30+
31+
32+
def test_from_range_minus_5_5_3():
    """A stepped range crossing zero stops before exceeding the stop bound."""
    arr = vx.array(range(-5, 5, 3))
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == [-5, -2, 1, 4]
35+
36+
37+
def test_from_range_minus_7_minus_5():
    """A range whose bounds are both negative converts element by element."""
    arr = vx.array(range(-7, -5))
    values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
    assert values == [-7, -6]
40+
41+
42+
def test_from_range_invalid():
    """Ranges whose bounds contradict the step direction convert to empty arrays."""
    # First: ascending step with start above stop. Second: descending step with start below stop.
    for empty_range in (range(10, 3), range(0, 10, -1)):
        arr = vx.array(empty_range)
        values = [arr.scalar_at(idx).as_py() for idx in range(len(arr))]
        assert values == []

0 commit comments

Comments
 (0)