Document REE row format and add some more tests (#7680)

~Draft until https://github.com/apache/arrow-rs/pull/7649 is merged~

# Which issue does this PR close?
- Follow on to https://github.com/apache/arrow-rs/pull/7649 from @brancz

# Rationale for this change
I noticed some extra testing and docs I would like to see so I made a PR
to add them

# What changes are included in this PR?

1. Add docs + additional tests
# Are there any user-facing changes?
No code changes, only some docs (and more tests)
This commit is contained in:
Andrew Lamb
2025-06-17 15:53:41 -04:00
committed by GitHub
parent 3837ac01dc
commit f37b1149db
2 changed files with 93 additions and 5 deletions
+5 -1
View File
@@ -274,7 +274,11 @@ mod variable;
///
/// ## Dictionary Encoding
///
/// Dictionaries are hydrated to their underlying values
/// Dictionary encoded arrays are hydrated to their underlying values
///
/// ## REE Encoding
///
/// REE (Run End Encoding) arrays, A form of Run Length Encoding, are hydrated to their underlying values.
///
/// ## Struct Encoding
///
+88 -4
View File
@@ -158,8 +158,9 @@ pub unsafe fn decode<R: RunEndIndexType>(
#[cfg(test)]
mod tests {
use crate::{RowConverter, SortField};
use arrow_array::types::Int32Type;
use arrow_array::{Array, Int64Array, RunArray, StringArray};
use arrow_array::cast::AsArray;
use arrow_array::types::{Int16Type, Int32Type, Int64Type};
use arrow_array::{Array, Int64Array, PrimitiveArray, RunArray, StringArray};
use arrow_schema::{DataType, SortOptions};
use std::sync::Arc;
@@ -173,14 +174,44 @@ mod tests {
}
#[test]
fn test_run_end_encoded_round_trip_int64s() {
fn test_run_end_encoded_round_trip_int16_int64s() {
// Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it
// doesn't just work with eg. strings (which are all the other tests).
let values = Int64Array::from(vec![100, 200, 100, 300]);
let run_ends = vec![2, 3, 5, 6];
let array: RunArray<Int16Type> =
RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap();
let converter = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
Arc::new(arrow_schema::Field::new("run_ends", DataType::Int16, false)),
Arc::new(arrow_schema::Field::new("values", DataType::Int64, true)),
))])
.unwrap();
let rows = converter
.convert_columns(&[Arc::new(array.clone())])
.unwrap();
let arrays = converter.convert_rows(&rows).unwrap();
let result = arrays[0]
.as_any()
.downcast_ref::<RunArray<Int16Type>>()
.unwrap();
assert_eq!(array.run_ends().values(), result.run_ends().values());
assert_eq!(array.values().as_ref(), result.values().as_ref());
}
#[test]
fn test_run_end_encoded_round_trip_int32_int64s() {
// Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it
// doesn't just work with eg. strings (which are all the other tests).
let values = Int64Array::from(vec![100, 200, 100, 300]);
let run_ends = vec![2, 3, 5, 6];
let array: RunArray<Int32Type> =
RunArray::try_new(&arrow_array::PrimitiveArray::from(run_ends), &values).unwrap();
RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap();
let converter = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)),
@@ -202,6 +233,36 @@ mod tests {
assert_eq!(array.values().as_ref(), result.values().as_ref());
}
#[test]
fn test_run_end_encoded_round_trip_int64_int64s() {
// Test round-trip correctness for RunEndEncodedArray with Int64 values making sure it
// doesn't just work with eg. strings (which are all the other tests).
let values = Int64Array::from(vec![100, 200, 100, 300]);
let run_ends = vec![2, 3, 5, 6];
let array: RunArray<Int64Type> =
RunArray::try_new(&PrimitiveArray::from(run_ends), &values).unwrap();
let converter = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
Arc::new(arrow_schema::Field::new("run_ends", DataType::Int64, false)),
Arc::new(arrow_schema::Field::new("values", DataType::Int64, true)),
))])
.unwrap();
let rows = converter
.convert_columns(&[Arc::new(array.clone())])
.unwrap();
let arrays = converter.convert_rows(&rows).unwrap();
let result = arrays[0]
.as_any()
.downcast_ref::<RunArray<Int64Type>>()
.unwrap();
assert_eq!(array.run_ends().values(), result.run_ends().values());
assert_eq!(array.values().as_ref(), result.values().as_ref());
}
#[test]
fn test_run_end_encoded_round_trip_strings() {
// Test round-trip correctness for RunEndEncodedArray with strings
@@ -692,4 +753,27 @@ mod tests {
"banana should come before cherry"
);
}
#[test]
fn test_run_end_encoded_empty() {
// Test converting / decoding an empty RunEndEncodedArray
let values: Vec<&str> = vec![];
let array: RunArray<Int32Type> = values.into_iter().collect();
let converter = RowConverter::new(vec![SortField::new(DataType::RunEndEncoded(
Arc::new(arrow_schema::Field::new("run_ends", DataType::Int32, false)),
Arc::new(arrow_schema::Field::new("values", DataType::Utf8, true)),
))])
.unwrap();
let rows = converter.convert_columns(&[Arc::new(array)]).unwrap();
assert_eq!(rows.num_rows(), 0);
// Likewise converting empty rows should yield an empty RunEndEncodedArray
let arrays = converter.convert_rows(&rows).unwrap();
assert_eq!(arrays.len(), 1);
// Verify both columns round-trip correctly
let result_ree = arrays[0].as_run::<Int32Type>();
assert_eq!(result_ree.len(), 0);
}
}