From 3e75327e808498177c19db932cdfac2c5507ee7d Mon Sep 17 00:00:00 2001 From: Hyukjin Kwon Date: Fri, 12 Dec 2025 10:27:58 +0900 Subject: [PATCH] GH-48672, GH-48465: [Python] Add an option for truncating intraday milliseconds in Date64 --- python/pyarrow/array.pxi | 36 ++- python/pyarrow/includes/libarrow_python.pxd | 4 + python/pyarrow/pandas_compat.py | 5 +- python/pyarrow/scalar.pxi | 9 +- .../src/arrow/python/arrow_to_pandas.cc | 40 +++- .../src/arrow/python/arrow_to_pandas.h | 5 + .../src/arrow/python/numpy_to_arrow.cc | 15 +- .../pyarrow/src/arrow/python/numpy_to_arrow.h | 4 +- .../src/arrow/python/python_to_arrow.cc | 9 +- .../src/arrow/python/python_to_arrow.h | 5 + python/pyarrow/src/arrow/python/type_traits.h | 1 + python/pyarrow/table.pxi | 19 +- python/pyarrow/tests/test_array.py | 210 ++++++++++++++++++ 13 files changed, 331 insertions(+), 31 deletions(-) diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 575b628db3a..670a75fb202 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -27,7 +27,8 @@ cdef extern from "" namespace "std": T get[T](...) 
cdef _sequence_to_array(object sequence, object mask, object size, - DataType type, CMemoryPool* pool, c_bool from_pandas): + DataType type, CMemoryPool* pool, c_bool from_pandas, + bint truncate_date64_time): cdef: int64_t c_size PyConversionOptions options @@ -41,6 +42,7 @@ cdef _sequence_to_array(object sequence, object mask, object size, options.from_pandas = from_pandas options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) + options.truncate_date64_time = truncate_date64_time with nogil: chunked = GetResultValue( @@ -81,7 +83,8 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values, cdef _ndarray_to_array(object values, object mask, DataType type, - c_bool from_pandas, c_bool safe, CMemoryPool* pool): + c_bool from_pandas, c_bool safe, CMemoryPool* pool, + bint truncate_date64_time): cdef: shared_ptr[CChunkedArray] chunked_out shared_ptr[CDataType] c_type = _ndarray_to_type(values, type) @@ -89,7 +92,7 @@ cdef _ndarray_to_array(object values, object mask, DataType type, with nogil: check_status(NdarrayToArrow(pool, values, mask, from_pandas, - c_type, cast_options, &chunked_out)) + c_type, cast_options, truncate_date64_time, &chunked_out)) if chunked_out.get().num_chunks() > 1: return pyarrow_wrap_chunked_array(chunked_out) @@ -127,7 +130,7 @@ def _handle_arrow_array_protocol(obj, type, mask, size): def array(object obj, type=None, mask=None, size=None, from_pandas=None, - bint safe=True, MemoryPool memory_pool=None): + bint safe=True, MemoryPool memory_pool=None, bint truncate_date64_time=True): """ Create pyarrow.Array instance from a Python object. @@ -162,6 +165,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool. + truncate_date64_time : bool, default True + If True (default), truncate intraday milliseconds when converting Python + datetime objects to date64. 
+ If False, preserve the full datetime including time components. Returns ------- @@ -313,7 +320,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, elif (pandas_api.is_categorical(values) and type is not None and type.id != Type_DICTIONARY): result = _ndarray_to_array( - np.asarray(values), mask, type, c_from_pandas, safe, pool + np.asarray( + values), mask, type, c_from_pandas, safe, pool, truncate_date64_time ) elif pandas_api.is_categorical(values): if type is not None: @@ -358,21 +366,22 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None, values, obj.dtype, type) if type and type.id == _Type_RUN_END_ENCODED: arr = _ndarray_to_array( - values, mask, type.value_type, c_from_pandas, safe, pool) + values, mask, type.value_type, c_from_pandas, safe, pool, truncate_date64_time) result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, memory_pool=memory_pool) else: result = _ndarray_to_array(values, mask, type, c_from_pandas, safe, - pool) + pool, truncate_date64_time) else: if type and type.id == _Type_RUN_END_ENCODED: arr = _sequence_to_array( - obj, mask, size, type.value_type, pool, from_pandas) + obj, mask, size, type.value_type, pool, from_pandas, truncate_date64_time) result = _pc().run_end_encode(arr, run_end_type=type.run_end_type, memory_pool=memory_pool) # ConvertPySequence does strict conversion if type is explicitly passed else: - result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas) + result = _sequence_to_array( + obj, mask, size, type, pool, c_from_pandas, truncate_date64_time) if extension_type is not None: result = ExtensionArray.from_storage(extension_type, result) @@ -880,7 +889,8 @@ cdef class _PandasConvertible(_Weakrefable): bint self_destruct=False, str maps_as_pydicts=None, types_mapper=None, - bint coerce_temporal_nanoseconds=False + bint coerce_temporal_nanoseconds=False, + bint truncate_date64_time=False ): """ Convert to a pandas-compatible NumPy array or 
DataFrame, as appropriate @@ -965,6 +975,10 @@ cdef class _PandasConvertible(_Weakrefable): default behavior in pandas version 1.x. Set this option to True if you'd like to use this coercion when using pandas version >= 2.0 for backwards compatibility (not recommended otherwise). + truncate_date64_time : bool, default False + If True, truncate intraday milliseconds when converting date64 to pandas + datetime. + If False (default), preserve the full datetime including time components. Returns ------- @@ -1041,6 +1055,7 @@ cdef class _PandasConvertible(_Weakrefable): split_blocks=split_blocks, self_destruct=self_destruct, maps_as_pydicts=maps_as_pydicts, + truncate_date64_time=truncate_date64_time, coerce_temporal_nanoseconds=coerce_temporal_nanoseconds ) return self._to_pandas(options, categories=categories, @@ -1063,6 +1078,7 @@ cdef PandasOptions _convert_pandas_options(dict options): result.self_destruct = options['self_destruct'] result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds'] result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False) + result.truncate_date64_time = options['truncate_date64_time'] maps_as_pydicts = options['maps_as_pydicts'] if maps_as_pydicts is None: diff --git a/python/pyarrow/includes/libarrow_python.pxd b/python/pyarrow/includes/libarrow_python.pxd index 4724c52ccb5..ef51f90efa3 100644 --- a/python/pyarrow/includes/libarrow_python.pxd +++ b/python/pyarrow/includes/libarrow_python.pxd @@ -66,6 +66,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool from_pandas c_bool ignore_timezone c_bool strict + c_bool truncate_date64_time # TODO Some functions below are not actually "nogil" @@ -81,12 +82,14 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo, c_bool from_pandas, const shared_ptr[CDataType]& type, + c_bool truncate_date64_time, shared_ptr[CChunkedArray]* out) CStatus 
NdarrayToArrow(CMemoryPool* pool, object ao, object mo, c_bool from_pandas, const shared_ptr[CDataType]& type, const CCastOptions& cast_options, + c_bool truncate_date64_time, shared_ptr[CChunkedArray]* out) CStatus NdarrayToTensor(CMemoryPool* pool, object ao, @@ -193,6 +196,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil: c_bool coerce_temporal_nanoseconds c_bool ignore_timezone c_bool deduplicate_objects + c_bool truncate_date64_time c_bool safe_cast c_bool split_blocks c_bool self_destruct diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py index dfed76d3711..dc0d6c35bc9 100644 --- a/python/pyarrow/pandas_compat.py +++ b/python/pyarrow/pandas_compat.py @@ -597,7 +597,7 @@ def dataframe_to_types(df, preserve_index, columns=None): def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None, - safe=True): + safe=True, truncate_date64_time=True): (all_names, column_names, column_field_names, @@ -630,7 +630,8 @@ def convert_column(col, field): type_ = field.type try: - result = pa.array(col, type=type_, from_pandas=True, safe=safe) + result = pa.array(col, type=type_, from_pandas=True, safe=safe, + truncate_date64_time=truncate_date64_time) except (pa.ArrowInvalid, pa.ArrowNotImplementedError, pa.ArrowTypeError) as e: diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi index 83cabcf447d..7633dd30e45 100644 --- a/python/pyarrow/scalar.pxi +++ b/python/pyarrow/scalar.pxi @@ -1598,7 +1598,8 @@ cdef object get_scalar_class_from_type( return _scalar_classes[data_type.id()] -def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): +def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None, + bint truncate_date64_time=True): """ Create a pyarrow.Scalar instance from a Python object. 
@@ -1616,6 +1617,10 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): memory_pool : pyarrow.MemoryPool, optional If not passed, will allocate memory from the currently-set default memory pool. + truncate_date64_time : bool, default True + If True (default), truncate intraday milliseconds when converting Python + datetime objects to date64. + If False, preserve the full datetime including time components. Returns ------- @@ -1668,6 +1673,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None): else: options.from_pandas = from_pandas + options.truncate_date64_time = truncate_date64_time + value = [value] with nogil: chunked = GetResultValue(ConvertPySequence(value, None, options, pool)) diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc index ed4f394362a..52fffff9af4 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.cc +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.cc @@ -1547,6 +1547,26 @@ void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) { } } +template <int64_t SHIFT> +inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) { + for (int c = 0; c < data.num_chunks(); c++) { + const auto& arr = *data.chunk(c); + const int64_t* in_values = GetPrimitiveValues<int64_t>(arr); + for (int64_t i = 0; i < arr.length(); ++i) { + if (arr.IsNull(i)) { + *out_values++ = kPandasTimestampNull; + } else { + // Floor to midnight; C++ % truncates toward zero for pre-epoch values + int64_t rem = in_values[i] % kMillisecondsInDay; + if (rem < 0) { + rem += kMillisecondsInDay; + } + *out_values++ = (in_values[i] - rem) * SHIFT; + } + } + } +} + class DatetimeDayWriter : public TypedPandasWriter { public: using TypedPandasWriter::TypedPandasWriter; @@ -1617,7 +1637,14 @@ class DatetimeMilliWriter : public DatetimeWriter { // Convert from days since epoch to datetime64[ms] ConvertDatetime(*data, out_values); } else if (type == Type::DATE64) { -
ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + // Date64Type is millisecond timestamp + if (this->options_.truncate_date64_time) { + // Truncate intraday milliseconds + ConvertDatetimeWithTruncation<1L>(*data, out_values); + } else { + // Preserve time components + ConvertNumericNullable(*data, kPandasTimestampNull, out_values); + } } else { const auto& ts_type = checked_cast(*data->type()); ARROW_DCHECK_EQ(TimeUnit::MILLI, ts_type.unit()) @@ -1652,9 +1679,14 @@ class DatetimeNanoWriter : public DatetimeWriter { // Convert from days since epoch to datetime64[ns] ConvertDatetime(*data, out_values); } else if (type == Type::DATE64) { - // Date64Type is millisecond timestamp stored as int64_t - // TODO(wesm): Do we want to make sure to zero out the milliseconds? - ConvertDatetime(*data, out_values); + // Date64Type is millisecond timestamp; convert to nanoseconds + if (this->options_.truncate_date64_time) { + // Truncate intraday milliseconds and convert to nanoseconds + ConvertDatetimeWithTruncation<1000000L>(*data, out_values); + } else { + // Preserve time components and convert to nanoseconds + ConvertDatetime(*data, out_values); + } } else if (type == Type::TIMESTAMP) { const auto& ts_type = checked_cast(*data->type()); diff --git a/python/pyarrow/src/arrow/python/arrow_to_pandas.h b/python/pyarrow/src/arrow/python/arrow_to_pandas.h index b4e91e6cf5a..c7dc65b84e1 100644 --- a/python/pyarrow/src/arrow/python/arrow_to_pandas.h +++ b/python/pyarrow/src/arrow/python/arrow_to_pandas.h @@ -89,6 +89,11 @@ struct PandasOptions { /// objects bool deduplicate_objects = false; + /// If true, truncate intraday milliseconds when converting date64 to pandas + /// datetime (default false to preserve time components). + /// If false, preserve the full datetime including time components. + bool truncate_date64_time = false; + /// \brief For certain data types, a cast is needed in order to store the /// data in a pandas DataFrame or Series (e.g. 
timestamps are always stored /// as nanoseconds in pandas). This option controls whether it is a safe diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc index 5647e895d0f..50d28a4e342 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.cc @@ -183,7 +183,7 @@ class NumPyConverter { public: NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, const std::shared_ptr& type, bool from_pandas, - const compute::CastOptions& cast_options = compute::CastOptions()) + const compute::CastOptions& cast_options, bool truncate_date64_time) : pool_(pool), type_(type), arr_(reinterpret_cast(arr)), @@ -191,6 +191,7 @@ class NumPyConverter { mask_(nullptr), from_pandas_(from_pandas), cast_options_(cast_options), + truncate_date64_time_(truncate_date64_time), null_bitmap_data_(nullptr), null_count_(0) { if (mo != nullptr && mo != Py_None) { @@ -311,6 +312,7 @@ class NumPyConverter { bool from_pandas_; compute::CastOptions cast_options_; + bool truncate_date64_time_; // Used in visitor pattern ArrayVector out_arrays_; @@ -330,6 +332,7 @@ Status NumPyConverter::Convert() { PyConversionOptions py_options; py_options.type = type_; py_options.from_pandas = from_pandas_; + py_options.truncate_date64_time = truncate_date64_time_; ARROW_ASSIGN_OR_RAISE( auto chunked_array, ConvertPySequence(reinterpret_cast(arr_), @@ -845,7 +848,7 @@ Status NumPyConverter::Visit(const StructType& type) { RETURN_IF_PYERROR(); sub_arrays.emplace_back(sub_array); sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(), - from_pandas_); + from_pandas_, cast_options_, truncate_date64_time_); } } @@ -916,7 +919,7 @@ Status NumPyConverter::Visit(const StructType& type) { Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, - const compute::CastOptions& cast_options, + const compute::CastOptions& 
cast_options, bool truncate_date64_time, std::shared_ptr* out) { if (!PyArray_Check(ao)) { // This code path cannot be reached by Python unit tests currently so this @@ -927,7 +930,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa return Status::Invalid("only handle 1-dimensional arrays"); } - NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); + NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options, + truncate_date64_time); RETURN_NOT_OK(converter.Convert()); const auto& output_arrays = converter.result(); ARROW_DCHECK_GT(output_arrays.size(), 0); @@ -938,7 +942,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, std::shared_ptr* out) { - return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out); + return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), + /*truncate_date64_time=*/true, out); } } // namespace py diff --git a/python/pyarrow/src/arrow/python/numpy_to_arrow.h b/python/pyarrow/src/arrow/python/numpy_to_arrow.h index b6cd093e554..315fc6d535f 100644 --- a/python/pyarrow/src/arrow/python/numpy_to_arrow.h +++ b/python/pyarrow/src/arrow/python/numpy_to_arrow.h @@ -46,11 +46,13 @@ namespace py { /// whether values are null /// \param[in] type a specific type to cast to, may be null /// \param[in] cast_options casting options +/// \param[in] truncate_date64_time If true, truncate intraday milliseconds when +/// converting Python datetime objects to date64; if false, keep the time of day /// \param[out] out a ChunkedArray, to accommodate chunked output ARROW_PYTHON_EXPORT Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, - const compute::CastOptions& cast_options, + const compute::CastOptions& cast_options, bool truncate_date64_time, std::shared_ptr* out); /// Safely
convert NumPy arrays to Arrow. If target data type is not known, diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 139eb1d7f4f..3e883832c83 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -303,14 +303,15 @@ class PyValue { return value; } - static Result Convert(const Date64Type*, const O&, I obj) { + static Result Convert(const Date64Type*, const O& options, I obj) { int64_t value; if (PyDateTime_Check(obj)) { auto pydate = reinterpret_cast(obj); value = internal::PyDateTime_to_ms(pydate); - // Truncate any intraday milliseconds - // TODO: introduce an option for this - value -= value % 86400000LL; + // Optionally floor to midnight (C++ % truncates toward zero pre-epoch) + if (options.truncate_date64_time) { + value -= ((value % 86400000LL) + 86400000LL) % 86400000LL; + } } else if (PyDate_Check(obj)) { auto pydate = reinterpret_cast(obj); value = internal::PyDate_to_ms(pydate); diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.h b/python/pyarrow/src/arrow/python/python_to_arrow.h index d167996ba8d..aa61bbe81fa 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.h +++ b/python/pyarrow/src/arrow/python/python_to_arrow.h @@ -59,6 +59,11 @@ struct PyConversionOptions { bool ignore_timezone = false; bool strict = false; + + /// If true, truncate intraday milliseconds when converting Python datetime + /// objects to date64 (default true for backwards compatibility). + /// If false, preserve the full datetime including time components.
+ bool truncate_date64_time = true; }; /// \brief Convert sequence (list, generator, NumPy array with dtype object) of diff --git a/python/pyarrow/src/arrow/python/type_traits.h b/python/pyarrow/src/arrow/python/type_traits.h index 865e1af4276..a467dcd408b 100644 --- a/python/pyarrow/src/arrow/python/type_traits.h +++ b/python/pyarrow/src/arrow/python/type_traits.h @@ -34,6 +34,7 @@ namespace arrow { namespace py { static constexpr int64_t kPandasTimestampNull = std::numeric_limits::min(); +constexpr int64_t kMillisecondsInDay = 86400000LL; constexpr int64_t kNanosecondsInDay = 86400000000000LL; namespace internal { diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi index 9136f252980..2037c11edaa 100644 --- a/python/pyarrow/table.pxi +++ b/python/pyarrow/table.pxi @@ -3366,7 +3366,7 @@ cdef class RecordBatch(_Tabular): @classmethod def from_pandas(cls, df, Schema schema=None, preserve_index=None, - nthreads=None, columns=None): + nthreads=None, columns=None, bint truncate_date64_time=True): """ Convert pandas.DataFrame to an Arrow RecordBatch @@ -3392,6 +3392,10 @@ cdef class RecordBatch(_Tabular): :func:`pyarrow.cpu_count` (may use up to system CPU count threads). columns : list, optional List of column to be converted. If None, use all columns. + truncate_date64_time : bool, default True + If True (default), truncate intraday milliseconds when converting Python + datetime objects to date64. + If False, preserve the full datetime including time components. 
Returns ------- @@ -3448,7 +3452,8 @@ cdef class RecordBatch(_Tabular): """ from pyarrow.pandas_compat import dataframe_to_arrays arrays, schema, n_rows = dataframe_to_arrays( - df, schema, preserve_index, nthreads=nthreads, columns=columns + df, schema, preserve_index, nthreads=nthreads, columns=columns, + truncate_date64_time=truncate_date64_time ) # If df is empty but row index is not, create empty RecordBatch with rows >0 @@ -4732,7 +4737,8 @@ cdef class Table(_Tabular): @classmethod def from_pandas(cls, df, Schema schema=None, preserve_index=None, - nthreads=None, columns=None, bint safe=True): + nthreads=None, columns=None, bint safe=True, + bint truncate_date64_time=True): """ Convert pandas.DataFrame to an Arrow Table. @@ -4773,6 +4779,10 @@ cdef class Table(_Tabular): List of column to be converted. If None, use all columns. safe : bool, default True Check for overflows or other unsafe conversions. + truncate_date64_time : bool, default True + If True (default), truncate intraday milliseconds when converting Python + datetime objects to date64. + If False, preserve the full datetime including time components. 
Returns ------- @@ -4799,7 +4809,8 @@ cdef class Table(_Tabular): preserve_index=preserve_index, nthreads=nthreads, columns=columns, - safe=safe + safe=safe, + truncate_date64_time=truncate_date64_time ) # If df is empty but row index is not, create empty Table with rows >0 diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index ec361159c5f..923843989e6 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -2216,6 +2216,216 @@ def test_date64_from_builtin_datetime(): assert as_i8[0].as_py() == as_i8[1].as_py() +def test_date64_truncate_date64_time_option(): + # Test default behavior: truncate intraday milliseconds + dt_with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) + dt_date_only = datetime.datetime(2000, 1, 1) + + # Default behavior (truncate_date64_time=True) + result_default = pa.array([dt_with_time], type='date64') + result_date_only = pa.array([dt_date_only], type='date64') + + # Both should be equal when truncated + assert result_default.equals(result_date_only) + + # Verify the underlying int64 values are the same + as_i8_default = result_default.view('int64') + as_i8_date_only = result_date_only.view('int64') + assert as_i8_default[0].as_py() == as_i8_date_only[0].as_py() + + # Test with truncate_date64_time=False: preserve time components + result_preserve = pa.array([dt_with_time], type='date64', + truncate_date64_time=False) + result_preserve_date_only = pa.array( + [dt_date_only], type='date64', truncate_date64_time=False) + + # These should not be equal when time is preserved + assert not result_preserve.equals(result_preserve_date_only) + + # Verify the underlying int64 values are different + as_i8_preserve = result_preserve.view('int64') + as_i8_preserve_date_only = result_preserve_date_only.view('int64') + assert as_i8_preserve[0].as_py() != as_i8_preserve_date_only[0].as_py() + + # The preserved time should have more milliseconds + assert 
as_i8_preserve[0].as_py() > as_i8_preserve_date_only[0].as_py() + + +def test_scalar_date64_truncate_date64_time_option(): + # Test scalar with default behavior + dt_with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) + dt_date_only = datetime.datetime(2000, 1, 1) + + # Default behavior (truncate_date64_time=True) + scalar_default = pa.scalar(dt_with_time, type=pa.date64()) + scalar_date_only = pa.scalar(dt_date_only, type=pa.date64()) + + # Both should be equal when truncated + assert scalar_default.equals(scalar_date_only) + + # Test with truncate_date64_time=False: preserve time components + scalar_preserve = pa.scalar( + dt_with_time, type=pa.date64(), truncate_date64_time=False) + scalar_preserve_date_only = pa.scalar( + dt_date_only, type=pa.date64(), truncate_date64_time=False) + + # These should not be equal when time is preserved + assert not scalar_preserve.equals(scalar_preserve_date_only) + + +@pytest.mark.pandas +def test_date64_from_pandas_with_truncate_date64_time(): + pd = pytest.importorskip("pandas") + + # Create pandas Series with Python native datetime objects (object dtype) + dt_with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) + dt_date_only = datetime.datetime(2000, 1, 1) + + series_with_time = pd.Series([dt_with_time], dtype=object) + series_date_only = pd.Series([dt_date_only], dtype=object) + + # Test default behavior: truncate time + # (from_pandas=True, default truncate_date64_time=True) + arr_with_time_default = pa.array(series_with_time, type=pa.date64(), + from_pandas=True) + arr_date_only_default = pa.array(series_date_only, type=pa.date64(), + from_pandas=True) + + # Both should be equal when truncated + assert arr_with_time_default.equals(arr_date_only_default) + + # Verify underlying int64 values are the same + as_i8_with_time = arr_with_time_default.view('int64') + as_i8_date_only = arr_date_only_default.view('int64') + assert as_i8_with_time[0].as_py() == as_i8_date_only[0].as_py() + + # Test with 
truncate_date64_time=False: preserve time components + # This verifies that from_pandas and truncate_date64_time work together correctly + arr_with_time_preserve = pa.array(series_with_time, type=pa.date64(), + from_pandas=True, truncate_date64_time=False) + arr_date_only_preserve = pa.array(series_date_only, type=pa.date64(), + from_pandas=True, truncate_date64_time=False) + + # These should not be equal when time is preserved + assert not arr_with_time_preserve.equals(arr_date_only_preserve) + + # Verify underlying int64 values are different + as_i8_with_time_preserve = arr_with_time_preserve.view('int64') + as_i8_date_only_preserve = arr_date_only_preserve.view('int64') + assert as_i8_with_time_preserve[0].as_py() != as_i8_date_only_preserve[0].as_py() + + # The preserved time should have more milliseconds + assert as_i8_with_time_preserve[0].as_py() > as_i8_date_only_preserve[0].as_py() + + # Test that from_pandas=True doesn't interfere with truncate_date64_time behavior + # Compare with from_pandas=False to ensure consistent behavior + arr_with_time_no_pandas = pa.array([dt_with_time], type=pa.date64(), + from_pandas=False, truncate_date64_time=False) + arr_with_time_pandas = pa.array(series_with_time, type=pa.date64(), + from_pandas=True, truncate_date64_time=False) + + # Both should produce the same result when truncate_date64_time=False + assert arr_with_time_no_pandas.equals(arr_with_time_pandas) + + +def test_date64_numpy_array_truncate_date64_time_option(): + np = pytest.importorskip("numpy") + + # Create NumPy array with object dtype containing Python datetime objects + dt_with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456) + dt_date_only = datetime.datetime(2000, 1, 1) + + arr_with_time = np.array([dt_with_time], dtype=object) + arr_date_only = np.array([dt_date_only], dtype=object) + + # Test default behavior: NumPy arrays truncate by default + # (since array() defaults to True) + arr_with_time_default = pa.array(arr_with_time, 
type=pa.date64()) + arr_date_only_default = pa.array(arr_date_only, type=pa.date64()) + + # These should be equal because NumPy arrays truncate by default + assert arr_with_time_default.equals(arr_date_only_default) + + # Verify underlying int64 values are the same (truncated) + as_i8_with_time = arr_with_time_default.view('int64') + as_i8_date_only = arr_date_only_default.view('int64') + assert as_i8_with_time[0].as_py() == as_i8_date_only[0].as_py() + + # Test explicit truncate_date64_time=False: should preserve time + arr_with_time_preserve = pa.array(arr_with_time, type=pa.date64(), + truncate_date64_time=False) + arr_date_only_preserve = pa.array(arr_date_only, type=pa.date64(), + truncate_date64_time=False) + + # These should not be equal when time is preserved + assert not arr_with_time_preserve.equals(arr_date_only_preserve) + + # Verify underlying int64 values are different when time is preserved + as_i8_with_time_preserve = arr_with_time_preserve.view('int64') + as_i8_date_only_preserve = arr_date_only_preserve.view('int64') + assert as_i8_with_time_preserve[0].as_py() != as_i8_date_only_preserve[0].as_py() + assert as_i8_with_time_preserve[0].as_py() > as_i8_date_only_preserve[0].as_py() + + +@pytest.mark.pandas +def test_date64_to_pandas_truncate_date64_time_option(): + pd = pytest.importorskip("pandas") + + # Create date64 array with time components + # 2018-05-10 00:00:00 + milliseconds_at_midnight = 1525910400000 + # 2018-05-10 00:02:03.456 + milliseconds_with_time = milliseconds_at_midnight + 123456 + + arr = pa.array([milliseconds_at_midnight, milliseconds_with_time], + type=pa.date64()) + + # Test default behavior: preserve time components (truncate_date64_time=False) + result_default = arr.to_pandas(date_as_object=False) + expected_default = pd.Series([ + pd.Timestamp('2018-05-10 00:00:00'), + pd.Timestamp('2018-05-10 00:02:03.456'), + ], dtype='datetime64[ms]') + pd.testing.assert_series_equal(result_default, expected_default) + + # Test with 
truncate_date64_time=True: truncate time components + result_truncated = arr.to_pandas(date_as_object=False, truncate_date64_time=True) + expected_truncated = pd.Series([ + pd.Timestamp('2018-05-10 00:00:00'), + pd.Timestamp('2018-05-10 00:00:00'), + ], dtype='datetime64[ms]') + pd.testing.assert_series_equal(result_truncated, expected_truncated) + + # Test with datetime64[ns] conversion + result_ns_default = arr.to_pandas(date_as_object=False, + coerce_temporal_nanoseconds=True) + expected_ns_default = pd.Series([ + pd.Timestamp('2018-05-10 00:00:00'), + pd.Timestamp('2018-05-10 00:02:03.456'), + ], dtype='datetime64[ns]') + pd.testing.assert_series_equal(result_ns_default, expected_ns_default) + + result_ns_truncated = arr.to_pandas(date_as_object=False, + coerce_temporal_nanoseconds=True, + truncate_date64_time=True) + expected_ns_truncated = pd.Series([ + pd.Timestamp('2018-05-10 00:00:00'), + pd.Timestamp('2018-05-10 00:00:00'), + ], dtype='datetime64[ns]') + pd.testing.assert_series_equal(result_ns_truncated, expected_ns_truncated) + + # Test with ChunkedArray + chunked = pa.chunked_array([[milliseconds_at_midnight], + [milliseconds_with_time]], + type=pa.date64()) + result_chunked_default = chunked.to_pandas(date_as_object=False) + pd.testing.assert_series_equal(result_chunked_default, expected_default) + + result_chunked_truncated = chunked.to_pandas(date_as_object=False, + truncate_date64_time=True) + pd.testing.assert_series_equal(result_chunked_truncated, expected_truncated) + + @pytest.mark.parametrize(('ty', 'values'), [ ('bool', [True, False, True]), ('uint8', range(0, 255)),