Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions arrow-arith/src/temporal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use arrow_array::cast::AsArray;
use cast::as_primitive_array;
use chrono::{Datelike, TimeZone, Timelike, Utc};

use arrow_array::ree_recurse;
use arrow_array::temporal_conversions::{
MICROSECONDS, MICROSECONDS_IN_DAY, MILLISECONDS, MILLISECONDS_IN_DAY, NANOSECONDS,
NANOSECONDS_IN_DAY, SECONDS_IN_DAY, date32_to_datetime, date64_to_datetime,
Expand Down Expand Up @@ -194,6 +195,15 @@ pub fn date_part(array: &dyn Array, part: DatePart) -> Result<ArrayRef, ArrowErr
let new_array = array.with_values(values);
Ok(new_array)
}
DataType::RunEndEncoded(k, _) => match k.data_type() {
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is unrelated to adding with_values, right? It is adding REE support to date_part 🤔

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yea, I noticed it while I was going through spots in the codebase that used this pattern. It was a pretty small change, so I included it in this PR.

DataType::Int16 => ree_recurse!(array, Int16Type, |a| date_part(a, part)),
DataType::Int32 => ree_recurse!(array, Int32Type, |a| date_part(a, part)),
DataType::Int64 => ree_recurse!(array, Int64Type, |a| date_part(a, part)),
_ => Err(ArrowError::InvalidArgumentError(format!(
"Invalid run-end type: {:?}",
k.data_type()
))),
},
t => return_compute_error_with!(format!("{part} does not support"), t),
)
}
Expand Down Expand Up @@ -2040,4 +2050,33 @@ mod tests {
assert_eq!(2015, actual.value(1));
assert_eq!(2016, actual.value(2));
}

/// `date_part` on a run-end encoded timestamp array must extract the year from every
/// physical value and hand back a run array with the same run ends.
#[test]
fn test_ree_timestamp_year() {
    let vals: TimestampSecondArray =
        vec![Some(1514764800), Some(1550636625), Some(1550636625)].into();
    let run_ends = Int32Array::from(vec![1, 2, 3]);
    let ree = RunArray::try_new(&run_ends, &vals).unwrap();

    let b = date_part(&ree, DatePart::Year).unwrap();
    let ree_result = b.as_run_opt::<Int32Type>().unwrap();

    // The run structure must be carried over unchanged; only the values are replaced.
    assert_eq!(ree_result.run_ends().values(), &[1, 2, 3]);

    let values = ree_result.values().as_primitive::<Int32Type>();
    assert_eq!(3, values.len());
    assert_eq!(2018, values.value(0));
    assert_eq!(2019, values.value(1));
    assert_eq!(2019, values.value(2));
}

/// `date_part` on a run-end encoded `Date64` array with `Int64` run ends must extract the
/// month from every physical value and hand back a run array with the same run ends.
#[test]
fn test_ree_date64_month() {
    let vals: PrimitiveArray<Date64Type> =
        vec![Some(1514764800000), Some(1550636625000)].into();
    let run_ends = Int64Array::from(vec![2, 4]);
    let ree = RunArray::try_new(&run_ends, &vals).unwrap();

    let b = date_part(&ree, DatePart::Month).unwrap();
    let ree_result = b.as_run_opt::<Int64Type>().unwrap();

    // Each value covers two logical rows; that layout must survive the kernel.
    assert_eq!(ree_result.run_ends().values(), &[2, 4]);

    let values = ree_result.values().as_primitive::<Int32Type>();
    assert_eq!(2, values.len());
    assert_eq!(1, values.value(0));
    assert_eq!(2, values.value(1));
}
}
37 changes: 37 additions & 0 deletions arrow-array/src/array/run_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,25 @@ use crate::{
types::{Int16Type, Int32Type, Int64Type, RunEndIndexType},
};

/// Applies a function to the values of a RunEndEncoded array, preserving the run structure.
///
/// The macro downcasts `$array` to a `RunArray<$run_type>`, calls `$func` on its values
/// array, and wraps the returned values in a new run array that shares the original run
/// ends (via `RunArray::with_values`). It evaluates to `Ok(ArrayRef)`.
///
/// # Requirements
///
/// * It must be used inside a function returning a `Result`, because an error returned by
///   `$func` is propagated with `?`.
/// * `$func` must be callable with `&dyn Array` and return a `Result` whose success value
///   is an `ArrayRef`.
/// * The `as_run_opt` method must resolve at the call site; the callers in this change
///   import `arrow_array::cast::AsArray` for that purpose.
///
/// # Panics
///
/// * If `$array` is not a run array with run-end type `$run_type`. Callers are expected to
///   have matched on the run-end data type before invoking the macro.
/// * If `$func` returns an array whose length differs from the input values array
///   (asserted by `RunArray::with_values`).
///
/// # Example
///
/// ```ignore
/// let result = ree_recurse!(array, Int32Type, my_function)?;
/// ```
///
/// This macro is useful for implementing functions that should work on the logical values
/// of a REE array while preserving the run-end encoding structure.
#[macro_export]
macro_rules! ree_recurse {
    ($array:expr, $run_type:ty, $func:expr) => {{
        let ree = $array
            .as_run_opt::<$run_type>()
            .expect("ree_recurse!: array is not a RunArray with the requested run-end type");
        let inner_values = $func(ree.values().as_ref())?;
        // Fully qualified paths and an explicit cast keep the expansion independent of
        // the caller's imports and of return-type inference at the call site.
        Ok(::std::sync::Arc::new(ree.with_values(inner_values)) as $crate::ArrayRef)
    }};
}

/// An array of [run-end encoded values].
///
/// This encoding is a variation on [run-length encoding (RLE)] and is good for representing
Expand Down Expand Up @@ -200,6 +219,24 @@ impl<R: RunEndIndexType> RunArray<R> {
&self.values
}

/// Returns a new [`RunArray`] with the same `run_ends` and the supplied `values`.
///
/// # Panics
Comment thread
Rich-T-kid marked this conversation as resolved.
///
/// Panics if `values.len()` does not equal the existing run count.
pub fn with_values(&self, values: ArrayRef) -> Self {
assert_eq!(values.len(), self.values.len());
let data_type = DataType::RunEndEncoded(
Arc::new(Field::new("run_ends", R::DATA_TYPE, false)),
Comment thread
Rich-T-kid marked this conversation as resolved.
Outdated
Arc::new(Field::new("values", values.data_type().clone(), true)),
);
Self {
data_type,
run_ends: self.run_ends.clone(),
values,
}
}

/// Similar to [`values`] but accounts for logical slicing, returning only the values
/// that are part of the logical slice of this array.
///
Expand Down
57 changes: 38 additions & 19 deletions arrow-string/src/length.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,25 +17,12 @@

//! Defines kernel for length of string arrays and binary arrays

use arrow_array::ree_recurse;
use arrow_array::*;
use arrow_array::{cast::AsArray, types::*};
use arrow_buffer::{ArrowNativeType, NullBuffer, OffsetBuffer};
use arrow_schema::{ArrowError, DataType};
use std::sync::Arc;
// Computes `length` over the values of a run-end encoded array and rebuilds a run array
// around the result, reusing the input's run ends.
//
// Arguments: `$array` is the array to downcast, `$run_type` the run-end index type, and
// `$k` / `$v` the two fields used to build the output `DataType::RunEndEncoded`.
// Expands to `Ok(ArrayRef)`; an error from `length` is propagated with `?`.
macro_rules! ree_length {
($array:expr, $run_type:ty, $k:expr, $v:expr) => {{
// Panics if `$array` is not a `RunArray<$run_type>`.
let ree = $array.as_run_opt::<$run_type>().unwrap();
let inner_value_lengths = length(ree.values().as_ref())?;
// SAFETY (to be confirmed): the run ends are cloned from an existing run array, and
// this presumably relies on `length` returning one output element per input value.
// NOTE(review): `$v` is passed by the caller from the *input* array's data type, while
// the values stored here are the output of `length` — confirm the declared values
// field matches the data type of `inner_value_lengths`.
let out_ree = unsafe {
RunArray::<$run_type>::new_unchecked(
DataType::RunEndEncoded(Arc::clone($k), Arc::clone($v)),
ree.run_ends().clone(),
inner_value_lengths,
)
};
Ok(Arc::new(out_ree) as ArrayRef)
}};
}

fn length_impl<P: ArrowPrimitiveType>(
offsets: &OffsetBuffer<P::Native>,
Expand Down Expand Up @@ -130,10 +117,10 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
list.nulls().cloned(),
)?))
}
DataType::RunEndEncoded(k, v) => match k.data_type() {
DataType::Int16 => ree_length!(array, Int16Type, &k, &v),
DataType::Int32 => ree_length!(array, Int32Type, &k, &v),
DataType::Int64 => ree_length!(array, Int64Type, &k, &v),
DataType::RunEndEncoded(k, _) => match k.data_type() {
DataType::Int16 => ree_recurse!(array, Int16Type, length),
DataType::Int32 => ree_recurse!(array, Int32Type, length),
DataType::Int64 => ree_recurse!(array, Int64Type, length),
_ => Err(ArrowError::InvalidArgumentError(format!(
"Invalid run-end type: {:?}",
k.data_type()
Expand All @@ -149,7 +136,7 @@ pub fn length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
///
/// * this only accepts StringArray/Utf8, LargeString/LargeUtf8, StringViewArray/Utf8View,
/// BinaryArray, LargeBinaryArray, BinaryViewArray, and FixedSizeBinaryArray,
/// or DictionaryArray with above Arrays as values
/// or DictionaryArray/REE with above Arrays as values
/// * bit_length of null is null.
/// * bit_length is in number of bits
pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
Expand Down Expand Up @@ -203,6 +190,15 @@ pub fn bit_length(array: &dyn Array) -> Result<ArrayRef, ArrowError> {
array.nulls().cloned(),
)?))
}
DataType::RunEndEncoded(k, _) => match k.data_type() {
DataType::Int16 => ree_recurse!(array, Int16Type, bit_length),
DataType::Int32 => ree_recurse!(array, Int32Type, bit_length),
DataType::Int64 => ree_recurse!(array, Int64Type, bit_length),
_ => Err(ArrowError::InvalidArgumentError(format!(
"Invalid run-end type: {:?}",
k.data_type()
))),
},
other => Err(ArrowError::ComputeError(format!(
"bit_length not supported for {other:?}"
))),
Expand Down Expand Up @@ -903,4 +899,27 @@ mod tests {

assert!(length(&ree_array).is_err());
}

/// `bit_length` on a run-end encoded UTF-8 array returns a run array whose values hold
/// the bit length of each string.
#[test]
fn bit_length_test_ree_utf8() {
    use arrow_array::RunArray;
    use arrow_array::types::Int32Type;

    // Three runs of length one: "hello", "world", "test".
    let ends = PrimitiveArray::<Int32Type>::from(vec![1i32, 2, 3]);
    let words = StringArray::from(vec!["hello", "world", "test"]);
    let input = RunArray::<Int32Type>::try_new(&ends, &words).unwrap();

    let output = bit_length(&input).unwrap();

    // Peel the result apart one layer at a time: run array first, then its values.
    let output_runs = output
        .as_any()
        .downcast_ref::<RunArray<Int32Type>>()
        .unwrap();
    let output_values = output_runs
        .values()
        .as_any()
        .downcast_ref::<Int32Array>()
        .unwrap();

    // 5 bytes * 8 = 40 bits for "hello" and "world"; 4 bytes * 8 = 32 bits for "test".
    let want: Int32Array = vec![40, 40, 32].into();
    assert_eq!(&want, output_values);
}
}
Loading