Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 83 additions & 37 deletions datafusion/functions/benches/to_hex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,56 +17,102 @@

extern crate criterion;

use arrow::array::Int64Array;
use arrow::datatypes::{DataType, Field, Int32Type, Int64Type};
use arrow::util::bench_util::create_primitive_array;
use criterion::{Criterion, criterion_group, criterion_main};
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::string;
use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;

fn criterion_benchmark(c: &mut Criterion) {
let hex = string::to_hex();
let size = 1024;
let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.2));
let batch_len = i32_array.len();
let i32_args = vec![ColumnarValue::Array(i32_array)];
let config_options = Arc::new(ConfigOptions::default());

c.bench_function(&format!("to_hex i32 array: {size}"), |b| {
b.iter(|| {
let args_cloned = i32_args.clone();
black_box(
hex.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: vec![Field::new("a", DataType::Int32, false).into()],
number_rows: batch_len,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
})
.unwrap(),
)
})
});
let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.2));
let batch_len = i64_array.len();
let i64_args = vec![ColumnarValue::Array(i64_array)];
c.bench_function(&format!("to_hex i64 array: {size}"), |b| {
b.iter(|| {
let args_cloned = i64_args.clone();
black_box(
hex.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: vec![Field::new("a", DataType::Int64, false).into()],
number_rows: batch_len,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
for size in [1024, 4096, 8192] {
let mut group = c.benchmark_group(format!("to_hex size={size}"));
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));

// i32 array with random values
let i32_array = Arc::new(create_primitive_array::<Int32Type>(size, 0.1));
let batch_len = i32_array.len();
let i32_args = vec![ColumnarValue::Array(i32_array)];

group.bench_function("i32_random", |b| {
b.iter(|| {
let args_cloned = i32_args.clone();
black_box(
hex.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: vec![Field::new("a", DataType::Int32, true).into()],
number_rows: batch_len,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
})
.unwrap(),
)
})
});

// i64 array with random values (produces longer hex strings)
let i64_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.1));
let batch_len = i64_array.len();
let i64_args = vec![ColumnarValue::Array(i64_array)];

group.bench_function("i64_random", |b| {
b.iter(|| {
let args_cloned = i64_args.clone();
black_box(
hex.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: vec![Field::new("a", DataType::Int64, true).into()],
number_rows: batch_len,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
})
.unwrap(),
)
})
});

// i64 array with large values (max length hex strings)
let i64_large_array = Arc::new(Int64Array::from(
(0..size)
.map(|i| {
if i % 10 == 0 {
None
} else {
Some(i64::MAX - i as i64)
}
})
.unwrap(),
)
})
});
.collect::<Vec<_>>(),
));
let batch_len = i64_large_array.len();
let i64_large_args = vec![ColumnarValue::Array(i64_large_array)];

group.bench_function("i64_large_values", |b| {
b.iter(|| {
let args_cloned = i64_large_args.clone();
black_box(
hex.invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: vec![Field::new("a", DataType::Int64, true).into()],
number_rows: batch_len,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
})
.unwrap(),
)
})
});

group.finish();
}
}

criterion_group!(benches, criterion_benchmark);
Expand Down
148 changes: 124 additions & 24 deletions datafusion/functions/src/string/to_hex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
// under the License.

use std::any::Any;
use std::fmt::Write;
use std::sync::Arc;

use crate::utils::make_scalar_function;
use arrow::array::{ArrayRef, GenericStringBuilder};
use arrow::array::{Array, ArrayRef, StringArray};
use arrow::buffer::{Buffer, OffsetBuffer};
use arrow::datatypes::DataType::{
Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Utf8,
};
Expand All @@ -37,42 +37,142 @@ use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility};
use datafusion_expr_common::signature::TypeSignature::Exact;
use datafusion_macros::user_doc;

/// Hex lookup table for fast conversion
const HEX_CHARS: &[u8; 16] = b"0123456789abcdef";

/// Converts the number to its equivalent hexadecimal representation.
/// to_hex(2147483647) = '7fffffff'
fn to_hex<T: ArrowPrimitiveType>(args: &[ArrayRef]) -> Result<ArrayRef>
where
T::Native: std::fmt::LowerHex,
T::Native: ToHex,
{
let integer_array = as_primitive_array::<T>(&args[0])?;
let len = integer_array.len();

let mut result = GenericStringBuilder::<i32>::with_capacity(
integer_array.len(),
// * 8 to convert to bits, / 4 bits per hex char
integer_array.len() * (T::Native::get_byte_width() * 8 / 4),
);
// Max hex string length: 16 chars for u64/i64
let max_hex_len = T::Native::get_byte_width() * 2;

for integer in integer_array {
if let Some(value) = integer {
if let Some(value_usize) = value.to_usize() {
write!(result, "{value_usize:x}")?;
} else if let Some(value_isize) = value.to_isize() {
write!(result, "{value_isize:x}")?;
} else {
return exec_err!(
"Unsupported data type {integer:?} for function to_hex"
);
}
result.append_value("");
} else {
result.append_null();
}
// Pre-allocate buffers - avoid the builder API overhead
let mut offsets: Vec<i32> = Vec::with_capacity(len + 1);
let mut values: Vec<u8> = Vec::with_capacity(len * max_hex_len);

// Reusable buffer for hex conversion
let mut hex_buffer = [0u8; 16];

// Start with offset 0
offsets.push(0);

// Process all values directly (including null slots - we write empty strings for nulls)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: I don't think this code is writing empty strings for nulls, it seems to process whatever value is in the null slot normally

// The null bitmap will mark which entries are actually null
for value in integer_array.values() {
let hex_len = value.write_hex_to_buffer(&mut hex_buffer);
values.extend_from_slice(&hex_buffer[16 - hex_len..]);
offsets.push(values.len() as i32);
}

let result = result.finish();
// Copy null bitmap from input (nulls pass through unchanged)
let nulls = integer_array.nulls().cloned();

// SAFETY: offsets are valid (monotonically increasing, last value equals values.len())
// and values contains valid UTF-8 (only ASCII hex digits)
let offsets =
unsafe { OffsetBuffer::new_unchecked(Buffer::from_vec(offsets).into()) };
let result = StringArray::new(offsets, Buffer::from_vec(values), nulls);

Ok(Arc::new(result) as ArrayRef)
}

/// Trait for converting integer types to hexadecimal in a buffer
trait ToHex: ArrowNativeType {
/// Write hex representation to buffer and return the number of hex digits written.
/// The hex digits are written right-aligned in the buffer (starting from position 16 - len).
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize;
}

/// Write unsigned value to hex buffer and return the number of digits written.
/// Digits are written right-aligned in the buffer.
#[inline]
fn write_unsigned_hex_to_buffer(value: u64, buffer: &mut [u8; 16]) -> usize {
if value == 0 {
buffer[15] = b'0';
return 1;
}

// Write hex digits from right to left
let mut pos = 16;
let mut v = value;
while v > 0 {
pos -= 1;
buffer[pos] = HEX_CHARS[(v & 0xf) as usize];
v >>= 4;
}

16 - pos
}

/// Write signed value to hex buffer (two's complement for negative) and return digit count
#[inline]
fn write_signed_hex_to_buffer(value: i64, buffer: &mut [u8; 16]) -> usize {
// For negative values, use two's complement representation (same as casting to u64)
write_unsigned_hex_to_buffer(value as u64, buffer)
}

impl ToHex for i8 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_signed_hex_to_buffer(self as i64, buffer)
}
}

impl ToHex for i16 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_signed_hex_to_buffer(self as i64, buffer)
}
}

impl ToHex for i32 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_signed_hex_to_buffer(self as i64, buffer)
}
}

impl ToHex for i64 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_signed_hex_to_buffer(self, buffer)
}
}

impl ToHex for u8 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_unsigned_hex_to_buffer(self as u64, buffer)
}
}

impl ToHex for u16 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_unsigned_hex_to_buffer(self as u64, buffer)
}
}

impl ToHex for u32 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_unsigned_hex_to_buffer(self as u64, buffer)
}
}

impl ToHex for u64 {
#[inline]
fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize {
write_unsigned_hex_to_buffer(self, buffer)
}
}

#[user_doc(
doc_section(label = "String Functions"),
description = "Converts an integer to a hexadecimal string.",
Expand Down
Loading