diff --git a/datafusion/functions/benches/to_hex.rs b/datafusion/functions/benches/to_hex.rs index 7346b89847e2..1c6757a291b2 100644 --- a/datafusion/functions/benches/to_hex.rs +++ b/datafusion/functions/benches/to_hex.rs @@ -17,56 +17,102 @@ extern crate criterion; +use arrow::array::Int64Array; use arrow::datatypes::{DataType, Field, Int32Type, Int64Type}; use arrow::util::bench_util::create_primitive_array; -use criterion::{Criterion, criterion_group, criterion_main}; +use criterion::{Criterion, SamplingMode, criterion_group, criterion_main}; use datafusion_common::config::ConfigOptions; use datafusion_expr::{ColumnarValue, ScalarFunctionArgs}; use datafusion_functions::string; use std::hint::black_box; use std::sync::Arc; +use std::time::Duration; fn criterion_benchmark(c: &mut Criterion) { let hex = string::to_hex(); - let size = 1024; - let i32_array = Arc::new(create_primitive_array::(size, 0.2)); - let batch_len = i32_array.len(); - let i32_args = vec![ColumnarValue::Array(i32_array)]; let config_options = Arc::new(ConfigOptions::default()); - c.bench_function(&format!("to_hex i32 array: {size}"), |b| { - b.iter(|| { - let args_cloned = i32_args.clone(); - black_box( - hex.invoke_with_args(ScalarFunctionArgs { - args: args_cloned, - arg_fields: vec![Field::new("a", DataType::Int32, false).into()], - number_rows: batch_len, - return_field: Field::new("f", DataType::Utf8, true).into(), - config_options: Arc::clone(&config_options), - }) - .unwrap(), - ) - }) - }); - let i64_array = Arc::new(create_primitive_array::(size, 0.2)); - let batch_len = i64_array.len(); - let i64_args = vec![ColumnarValue::Array(i64_array)]; - c.bench_function(&format!("to_hex i64 array: {size}"), |b| { - b.iter(|| { - let args_cloned = i64_args.clone(); - black_box( - hex.invoke_with_args(ScalarFunctionArgs { - args: args_cloned, - arg_fields: vec![Field::new("a", DataType::Int64, false).into()], - number_rows: batch_len, - return_field: Field::new("f", DataType::Utf8, true).into(), - config_options: Arc::clone(&config_options), + for size in [1024, 4096, 8192] { + let mut group = c.benchmark_group(format!("to_hex size={size}")); + group.sampling_mode(SamplingMode::Flat); + group.sample_size(10); + group.measurement_time(Duration::from_secs(10)); + + // i32 array with random values + let i32_array = Arc::new(create_primitive_array::(size, 0.1)); + let batch_len = i32_array.len(); + let i32_args = vec![ColumnarValue::Array(i32_array)]; + + group.bench_function("i32_random", |b| { + b.iter(|| { + let args_cloned = i32_args.clone(); + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: vec![Field::new("a", DataType::Int32, true).into()], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + + // i64 array with random values (produces longer hex strings) + let i64_array = Arc::new(create_primitive_array::(size, 0.1)); + let batch_len = i64_array.len(); + let i64_args = vec![ColumnarValue::Array(i64_array)]; + + group.bench_function("i64_random", |b| { + b.iter(|| { + let args_cloned = i64_args.clone(); + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: vec![Field::new("a", DataType::Int64, true).into()], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + + // i64 array with large values (max length hex strings) + let i64_large_array = Arc::new(Int64Array::from( + (0..size) + .map(|i| { + if i % 10 == 0 { + None + } else { + Some(i64::MAX - i as i64) + } }) - .unwrap(), - ) - }) - }); + .collect::>(), + )); + let batch_len = i64_large_array.len(); + let i64_large_args = vec![ColumnarValue::Array(i64_large_array)]; + + group.bench_function("i64_large_values", |b| { + b.iter(|| { + let args_cloned = i64_large_args.clone(); + black_box( + hex.invoke_with_args(ScalarFunctionArgs { + args: args_cloned, + arg_fields: vec![Field::new("a", DataType::Int64, true).into()], + number_rows: batch_len, + return_field: Field::new("f", DataType::Utf8, true).into(), + config_options: Arc::clone(&config_options), + }) + .unwrap(), + ) + }) + }); + + group.finish(); + } } criterion_group!(benches, criterion_benchmark); diff --git a/datafusion/functions/src/string/to_hex.rs b/datafusion/functions/src/string/to_hex.rs index fb34c96ad83a..dd4f4174266f 100644 --- a/datafusion/functions/src/string/to_hex.rs +++ b/datafusion/functions/src/string/to_hex.rs @@ -16,11 +16,11 @@ // under the License. use std::any::Any; -use std::fmt::Write; use std::sync::Arc; use crate::utils::make_scalar_function; -use arrow::array::{ArrayRef, GenericStringBuilder}; +use arrow::array::{Array, ArrayRef, StringArray}; +use arrow::buffer::{Buffer, OffsetBuffer}; use arrow::datatypes::DataType::{ Int8, Int16, Int32, Int64, UInt8, UInt16, UInt32, UInt64, Utf8, }; @@ -37,42 +37,142 @@ use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl, Signature, Volatility}; use datafusion_expr_common::signature::TypeSignature::Exact; use datafusion_macros::user_doc; +/// Hex lookup table for fast conversion +const HEX_CHARS: &[u8; 16] = b"0123456789abcdef"; + /// Converts the number to its equivalent hexadecimal representation. /// to_hex(2147483647) = '7fffffff' fn to_hex(args: &[ArrayRef]) -> Result where - T::Native: std::fmt::LowerHex, + T::Native: ToHex, { let integer_array = as_primitive_array::(&args[0])?; + let len = integer_array.len(); - let mut result = GenericStringBuilder::::with_capacity( - integer_array.len(), - // * 8 to convert to bits, / 4 bits per hex char - integer_array.len() * (T::Native::get_byte_width() * 8 / 4), - ); + // Max hex string length: 16 chars for u64/i64 + let max_hex_len = T::Native::get_byte_width() * 2; - for integer in integer_array { - if let Some(value) = integer { - if let Some(value_usize) = value.to_usize() { - write!(result, "{value_usize:x}")?; - } else if let Some(value_isize) = value.to_isize() { - write!(result, "{value_isize:x}")?; - } else { - return exec_err!( - "Unsupported data type {integer:?} for function to_hex" - ); - } - result.append_value(""); - } else { - result.append_null(); - } + // Pre-allocate buffers - avoid the builder API overhead + let mut offsets: Vec = Vec::with_capacity(len + 1); + let mut values: Vec = Vec::with_capacity(len * max_hex_len); + + // Reusable buffer for hex conversion + let mut hex_buffer = [0u8; 16]; + + // Start with offset 0 + offsets.push(0); + + // Process all values directly (including null slots - we write empty strings for nulls) + // The null bitmap will mark which entries are actually null + for value in integer_array.values() { + let hex_len = value.write_hex_to_buffer(&mut hex_buffer); + values.extend_from_slice(&hex_buffer[16 - hex_len..]); + offsets.push(values.len() as i32); } - let result = result.finish(); + // Copy null bitmap from input (nulls pass through unchanged) + let nulls = integer_array.nulls().cloned(); + + // SAFETY: offsets are valid (monotonically increasing, last value equals values.len()) + // and values contains valid UTF-8 (only ASCII hex digits) + let offsets = + unsafe { OffsetBuffer::new_unchecked(Buffer::from_vec(offsets).into()) }; + let result = StringArray::new(offsets, Buffer::from_vec(values), nulls); Ok(Arc::new(result) as ArrayRef) } +/// Trait for converting integer types to hexadecimal in a buffer +trait ToHex: ArrowNativeType { + /// Write hex representation to buffer and return the number of hex digits written. + /// The hex digits are written right-aligned in the buffer (starting from position 16 - len). + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize; +} + +/// Write unsigned value to hex buffer and return the number of digits written. +/// Digits are written right-aligned in the buffer. +#[inline] +fn write_unsigned_hex_to_buffer(value: u64, buffer: &mut [u8; 16]) -> usize { + if value == 0 { + buffer[15] = b'0'; + return 1; + } + + // Write hex digits from right to left + let mut pos = 16; + let mut v = value; + while v > 0 { + pos -= 1; + buffer[pos] = HEX_CHARS[(v & 0xf) as usize]; + v >>= 4; + } + + 16 - pos +} + +/// Write signed value to hex buffer (two's complement for negative) and return digit count +#[inline] +fn write_signed_hex_to_buffer(value: i64, buffer: &mut [u8; 16]) -> usize { + // For negative values, use two's complement representation (same as casting to u64) + write_unsigned_hex_to_buffer(value as u64, buffer) +} + +impl ToHex for i8 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_signed_hex_to_buffer(self as i64, buffer) + } +} + +impl ToHex for i16 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_signed_hex_to_buffer(self as i64, buffer) + } +} + +impl ToHex for i32 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_signed_hex_to_buffer(self as i64, buffer) + } +} + +impl ToHex for i64 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_signed_hex_to_buffer(self, buffer) + } +} + +impl ToHex for u8 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_unsigned_hex_to_buffer(self as u64, buffer) + } +} + +impl ToHex for u16 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_unsigned_hex_to_buffer(self as u64, buffer) + } +} + +impl ToHex for u32 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_unsigned_hex_to_buffer(self as u64, buffer) + } +} + +impl ToHex for u64 { + #[inline] + fn write_hex_to_buffer(self, buffer: &mut [u8; 16]) -> usize { + write_unsigned_hex_to_buffer(self, buffer) + } +} + #[user_doc( doc_section(label = "String Functions"), description = "Converts an integer to a hexadecimal string.",