36 changes: 26 additions & 10 deletions python/pyarrow/array.pxi
@@ -27,7 +27,8 @@ cdef extern from "<variant>" namespace "std":
T get[T](...)

cdef _sequence_to_array(object sequence, object mask, object size,
DataType type, CMemoryPool* pool, c_bool from_pandas):
DataType type, CMemoryPool* pool, c_bool from_pandas,
bint truncate_date64_time):
cdef:
int64_t c_size
PyConversionOptions options
@@ -41,6 +42,7 @@ cdef _sequence_to_array(object sequence, object mask, object size,

options.from_pandas = from_pandas
options.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
options.truncate_date64_time = truncate_date64_time

with nogil:
chunked = GetResultValue(
@@ -81,15 +83,16 @@ cdef shared_ptr[CDataType] _ndarray_to_type(object values,


cdef _ndarray_to_array(object values, object mask, DataType type,
c_bool from_pandas, c_bool safe, CMemoryPool* pool):
c_bool from_pandas, c_bool safe, CMemoryPool* pool,
bint truncate_date64_time):
cdef:
shared_ptr[CChunkedArray] chunked_out
shared_ptr[CDataType] c_type = _ndarray_to_type(values, type)
CCastOptions cast_options = CCastOptions(safe)

with nogil:
check_status(NdarrayToArrow(pool, values, mask, from_pandas,
c_type, cast_options, &chunked_out))
c_type, cast_options, truncate_date64_time, &chunked_out))

if chunked_out.get().num_chunks() > 1:
return pyarrow_wrap_chunked_array(chunked_out)
@@ -127,7 +130,7 @@ def _handle_arrow_array_protocol(obj, type, mask, size):


def array(object obj, type=None, mask=None, size=None, from_pandas=None,
bint safe=True, MemoryPool memory_pool=None):
bint safe=True, MemoryPool memory_pool=None, bint truncate_date64_time=True):
"""
Create pyarrow.Array instance from a Python object.

@@ -162,6 +165,10 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool.
truncate_date64_time : bool, default True
If True, truncate the intraday time component (milliseconds since
midnight) when converting Python datetime objects to date64.
If False, preserve the full datetime, including its time components.

Returns
-------
@@ -313,7 +320,8 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
elif (pandas_api.is_categorical(values) and
type is not None and type.id != Type_DICTIONARY):
result = _ndarray_to_array(
np.asarray(values), mask, type, c_from_pandas, safe, pool
np.asarray(values), mask, type, c_from_pandas, safe, pool,
truncate_date64_time
)
elif pandas_api.is_categorical(values):
if type is not None:
@@ -358,21 +366,22 @@ def array(object obj, type=None, mask=None, size=None, from_pandas=None,
values, obj.dtype, type)
if type and type.id == _Type_RUN_END_ENCODED:
arr = _ndarray_to_array(
values, mask, type.value_type, c_from_pandas, safe, pool)
values, mask, type.value_type, c_from_pandas, safe, pool, truncate_date64_time)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
else:
result = _ndarray_to_array(values, mask, type, c_from_pandas, safe,
pool)
pool, truncate_date64_time)
else:
if type and type.id == _Type_RUN_END_ENCODED:
arr = _sequence_to_array(
obj, mask, size, type.value_type, pool, from_pandas)
obj, mask, size, type.value_type, pool, from_pandas, truncate_date64_time)
result = _pc().run_end_encode(arr, run_end_type=type.run_end_type,
memory_pool=memory_pool)
# ConvertPySequence does strict conversion if type is explicitly passed
else:
result = _sequence_to_array(obj, mask, size, type, pool, c_from_pandas)
result = _sequence_to_array(
obj, mask, size, type, pool, c_from_pandas, truncate_date64_time)

if extension_type is not None:
result = ExtensionArray.from_storage(extension_type, result)
@@ -880,7 +889,8 @@ cdef class _PandasConvertible(_Weakrefable):
bint self_destruct=False,
str maps_as_pydicts=None,
types_mapper=None,
bint coerce_temporal_nanoseconds=False
bint coerce_temporal_nanoseconds=False,
bint truncate_date64_time=False
):
"""
Convert to a pandas-compatible NumPy array or DataFrame, as appropriate
@@ -965,6 +975,10 @@ cdef class _PandasConvertible(_Weakrefable):
default behavior in pandas version 1.x. Set this option to True if
you'd like to use this coercion when using pandas version >= 2.0
for backwards compatibility (not recommended otherwise).
truncate_date64_time : bool, default False
If True, truncate the intraday time component (milliseconds since
midnight) when converting date64 values to pandas datetimes.
If False, preserve the full datetime, including its time components.

Returns
-------
@@ -1041,6 +1055,7 @@ cdef class _PandasConvertible(_Weakrefable):
split_blocks=split_blocks,
self_destruct=self_destruct,
maps_as_pydicts=maps_as_pydicts,
truncate_date64_time=truncate_date64_time,
coerce_temporal_nanoseconds=coerce_temporal_nanoseconds
)
return self._to_pandas(options, categories=categories,
@@ -1063,6 +1078,7 @@ cdef PandasOptions _convert_pandas_options(dict options):
result.self_destruct = options['self_destruct']
result.coerce_temporal_nanoseconds = options['coerce_temporal_nanoseconds']
result.ignore_timezone = os.environ.get('PYARROW_IGNORE_TIMEZONE', False)
result.truncate_date64_time = options['truncate_date64_time']

maps_as_pydicts = options['maps_as_pydicts']
if maps_as_pydicts is None:
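That concludes the array.pxi changes. To make the new `pa.array` flag concrete, here is a minimal usage sketch, assuming a pyarrow build that includes this patch; the commented values are what the new conversion paths should produce (2024-01-01T00:00:00Z is 1704067200000 ms since the Unix epoch).

```python
import datetime
import pyarrow as pa

dt = datetime.datetime(2024, 1, 1, 12, 30, 45)

# Default (truncate_date64_time=True): the intraday time is dropped,
# so the stored date64 value is midnight of that day.
a = pa.array([dt], type=pa.date64())

# Opt out: the full millisecond timestamp, including 12:30:45, is kept.
b = pa.array([dt], type=pa.date64(), truncate_date64_time=False)

# Inspect the raw int64 milliseconds to see the difference.
print(a.cast(pa.int64()))  # expected: [1704067200000]
print(b.cast(pa.int64()))  # expected: [1704112245000]
```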
4 changes: 4 additions & 0 deletions python/pyarrow/includes/libarrow_python.pxd
@@ -66,6 +66,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool from_pandas
c_bool ignore_timezone
c_bool strict
c_bool truncate_date64_time

# TODO Some functions below are not actually "nogil"

@@ -81,12 +82,14 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
c_bool from_pandas,
const shared_ptr[CDataType]& type,
c_bool truncate_date64_time,
shared_ptr[CChunkedArray]* out)

CStatus NdarrayToArrow(CMemoryPool* pool, object ao, object mo,
c_bool from_pandas,
const shared_ptr[CDataType]& type,
const CCastOptions& cast_options,
c_bool truncate_date64_time,
shared_ptr[CChunkedArray]* out)

CStatus NdarrayToTensor(CMemoryPool* pool, object ao,
@@ -193,6 +196,7 @@ cdef extern from "arrow/python/api.h" namespace "arrow::py" nogil:
c_bool coerce_temporal_nanoseconds
c_bool ignore_timezone
c_bool deduplicate_objects
c_bool truncate_date64_time
c_bool safe_cast
c_bool split_blocks
c_bool self_destruct
5 changes: 3 additions & 2 deletions python/pyarrow/pandas_compat.py
@@ -597,7 +597,7 @@ def dataframe_to_types(df, preserve_index, columns=None):


def dataframe_to_arrays(df, schema, preserve_index, nthreads=1, columns=None,
safe=True):
safe=True, truncate_date64_time=True):
(all_names,
column_names,
column_field_names,
@@ -630,7 +630,8 @@ def convert_column(col, field):
type_ = field.type

try:
result = pa.array(col, type=type_, from_pandas=True, safe=safe)
result = pa.array(col, type=type_, from_pandas=True, safe=safe,
truncate_date64_time=truncate_date64_time)
except (pa.ArrowInvalid,
pa.ArrowNotImplementedError,
pa.ArrowTypeError) as e:
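For context, `convert_column` funnels each pandas column through `pa.array` with `from_pandas=True`, so the new keyword simply propagates per column. A sketch of the equivalent call, under the same patched-build assumption:

```python
import datetime
import pandas as pd
import pyarrow as pa

col = pd.Series([datetime.datetime(2024, 1, 1, 12, 30, 45)])

# Equivalent to what convert_column now does for a date64 field:
arr = pa.array(col, type=pa.date64(), from_pandas=True,
               truncate_date64_time=True)
```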
9 changes: 8 additions & 1 deletion python/pyarrow/scalar.pxi
@@ -1598,7 +1598,8 @@ cdef object get_scalar_class_from_type(
return _scalar_classes[data_type.id()]


def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None,
bint truncate_date64_time=True):
"""
Create a pyarrow.Scalar instance from a Python object.

@@ -1616,6 +1617,10 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
memory_pool : pyarrow.MemoryPool, optional
If not passed, will allocate memory from the currently-set default
memory pool.
truncate_date64_time : bool, default True
If True, truncate the intraday time component (milliseconds since
midnight) when converting Python datetime objects to date64.
If False, preserve the full datetime, including its time components.

Returns
-------
@@ -1668,6 +1673,8 @@ def scalar(value, type=None, *, from_pandas=None, MemoryPool memory_pool=None):
else:
options.from_pandas = from_pandas

options.truncate_date64_time = truncate_date64_time

value = [value]
with nogil:
chunked = GetResultValue(ConvertPySequence(value, None, options, pool))
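The scalar path mirrors `pa.array`, including the default of `True`. A short sketch, again assuming a build with this patch:

```python
import datetime
import pyarrow as pa

dt = datetime.datetime(2024, 1, 1, 12, 30, 45)

s_trunc = pa.scalar(dt, type=pa.date64())  # default: truncated to midnight
s_full = pa.scalar(dt, type=pa.date64(), truncate_date64_time=False)

print(s_trunc.cast(pa.int64()))  # expected: 1704067200000
print(s_full.cast(pa.int64()))   # expected: 1704112245000
```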
40 changes: 36 additions & 4 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.cc
@@ -1547,6 +1547,26 @@ void ConvertDatesShift(const ChunkedArray& data, int64_t* out_values) {
}
}

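// Truncate each date64 millisecond value to midnight, then scale by SHIFT to
// the target unit (SHIFT == 1 keeps milliseconds; SHIFT == 1000000 yields
// nanoseconds).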
template <int64_t SHIFT>
inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
for (int c = 0; c < data.num_chunks(); c++) {
const auto& arr = *data.chunk(c);
const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
for (int64_t i = 0; i < arr.length(); ++i) {
if (arr.IsNull(i)) {
*out_values++ = kPandasTimestampNull;
} else {
int64_t truncated = in_values[i] - in_values[i] % kMillisecondsInDay;
if constexpr (SHIFT == 1) {
*out_values++ = truncated;
} else {
*out_values++ = truncated * SHIFT;
}
}
}
}
}

class DatetimeDayWriter : public TypedPandasWriter<NPY_DATETIME> {
public:
using TypedPandasWriter<NPY_DATETIME>::TypedPandasWriter;
@@ -1617,7 +1637,14 @@ class DatetimeMilliWriter : public DatetimeWriter<TimeUnit::MILLI> {
// Convert from days since epoch to datetime64[ms]
ConvertDatetime<int32_t, 86400000L>(*data, out_values);
} else if (type == Type::DATE64) {
ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
// Date64Type is millisecond timestamp
if (this->options_.truncate_date64_time) {
// Truncate intraday milliseconds
ConvertDatetimeWithTruncation<1L>(*data, out_values);
Collaborator: Can we avoid computing the `... * 1L` for each value in the array when `SHIFT == 1`? Or will the compiler optimize this away?

Member (author): I believe the compiler would optimize it away as a no-op, but to be sure I changed the code to use `if constexpr` for the `SHIFT == 1` case. That branch is resolved at compile time, so the multiplication is never emitted.
} else {
// Preserve time components
ConvertNumericNullable<int64_t>(*data, kPandasTimestampNull, out_values);
}
} else {
const auto& ts_type = checked_cast<const TimestampType&>(*data->type());
ARROW_DCHECK_EQ(TimeUnit::MILLI, ts_type.unit())
@@ -1652,9 +1679,14 @@ class DatetimeNanoWriter : public DatetimeWriter<TimeUnit::NANO> {
// Convert from days since epoch to datetime64[ns]
ConvertDatetime<int32_t, kNanosecondsInDay>(*data, out_values);
} else if (type == Type::DATE64) {
// Date64Type is millisecond timestamp stored as int64_t
// TODO(wesm): Do we want to make sure to zero out the milliseconds?
ConvertDatetime<int64_t, 1000000L>(*data, out_values);
// Date64Type is millisecond timestamp; convert to nanoseconds
if (this->options_.truncate_date64_time) {
// Truncate intraday milliseconds and convert to nanoseconds
ConvertDatetimeWithTruncation<1000000L>(*data, out_values);
} else {
// Preserve time components and convert to nanoseconds
ConvertDatetime<int64_t, 1000000L>(*data, out_values);
}
} else if (type == Type::TIMESTAMP) {
const auto& ts_type = checked_cast<const TimestampType&>(*data->type());

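On the pandas side the default is inverted (`False`), so existing `to_pandas` round trips keep their time components unless the caller opts in. A sketch of both behaviors, assuming a patched build; `date_as_object=False` routes date64 through the datetime writers changed above:

```python
import datetime
import pyarrow as pa

dt = datetime.datetime(2024, 1, 1, 12, 30, 45)
table = pa.table({
    "d": pa.array([dt], type=pa.date64(), truncate_date64_time=False)
})

# Default (truncate_date64_time=False): the stored time of day survives.
print(table.to_pandas(date_as_object=False)["d"])
# expected: 2024-01-01 12:30:45

# Opt in: ConvertDatetimeWithTruncation zeroes the intraday part.
print(table.to_pandas(date_as_object=False, truncate_date64_time=True)["d"])
# expected: 2024-01-01 00:00:00
```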
5 changes: 5 additions & 0 deletions python/pyarrow/src/arrow/python/arrow_to_pandas.h
@@ -89,6 +89,11 @@ struct PandasOptions {
/// objects
bool deduplicate_objects = false;

/// If true, truncate the intraday time component (milliseconds since
/// midnight) when converting date64 to pandas datetime.
/// If false (the default), preserve the full datetime, including its time
/// components.
bool truncate_date64_time = false;

/// \brief For certain data types, a cast is needed in order to store the
/// data in a pandas DataFrame or Series (e.g. timestamps are always stored
/// as nanoseconds in pandas). This option controls whether it is a safe
15 changes: 10 additions & 5 deletions python/pyarrow/src/arrow/python/numpy_to_arrow.cc
@@ -183,14 +183,15 @@ class NumPyConverter {
public:
NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo,
const std::shared_ptr<DataType>& type, bool from_pandas,
const compute::CastOptions& cast_options = compute::CastOptions())
const compute::CastOptions& cast_options, bool truncate_date64_time)
: pool_(pool),
type_(type),
arr_(reinterpret_cast<PyArrayObject*>(arr)),
dtype_(PyArray_DESCR(arr_)),
mask_(nullptr),
from_pandas_(from_pandas),
cast_options_(cast_options),
truncate_date64_time_(truncate_date64_time),
null_bitmap_data_(nullptr),
null_count_(0) {
if (mo != nullptr && mo != Py_None) {
@@ -311,6 +312,7 @@ class NumPyConverter {

bool from_pandas_;
compute::CastOptions cast_options_;
bool truncate_date64_time_;

// Used in visitor pattern
ArrayVector out_arrays_;
@@ -330,6 +332,7 @@ Status NumPyConverter::Convert() {
PyConversionOptions py_options;
py_options.type = type_;
py_options.from_pandas = from_pandas_;
py_options.truncate_date64_time = truncate_date64_time_;
ARROW_ASSIGN_OR_RAISE(
auto chunked_array,
ConvertPySequence(reinterpret_cast<PyObject*>(arr_),
@@ -845,7 +848,7 @@ Status NumPyConverter::Visit(const StructType& type) {
RETURN_IF_PYERROR();
sub_arrays.emplace_back(sub_array);
sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(),
from_pandas_);
from_pandas_, cast_options_, truncate_date64_time_);
}
}

@@ -916,7 +919,7 @@

Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
const compute::CastOptions& cast_options,
const compute::CastOptions& cast_options, bool truncate_date64_time,
std::shared_ptr<ChunkedArray>* out) {
if (!PyArray_Check(ao)) {
// This code path cannot be reached by Python unit tests currently so this
@@ -927,7 +930,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
return Status::Invalid("only handle 1-dimensional arrays");
}

NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options);
NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options,
truncate_date64_time);
RETURN_NOT_OK(converter.Convert());
const auto& output_arrays = converter.result();
ARROW_DCHECK_GT(output_arrays.size(), 0);
@@ -938,7 +942,8 @@ Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pa
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
std::shared_ptr<ChunkedArray>* out) {
return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out);
return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), false,
out);
}

} // namespace py
4 changes: 3 additions & 1 deletion python/pyarrow/src/arrow/python/numpy_to_arrow.h
@@ -46,11 +46,13 @@ namespace py {
/// whether values are null
/// \param[in] type a specific type to cast to, may be null
/// \param[in] cast_options casting options
/// \param[in] truncate_date64_time if true, truncate the intraday time
/// component (milliseconds since midnight) when converting values to date64
/// \param[out] out a ChunkedArray, to accommodate chunked output
ARROW_PYTHON_EXPORT
Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas,
const std::shared_ptr<DataType>& type,
const compute::CastOptions& cast_options,
const compute::CastOptions& cast_options, bool truncate_date64_time,
std::shared_ptr<ChunkedArray>* out);

/// Safely convert NumPy arrays to Arrow. If target data type is not known,
Expand Down
9 changes: 5 additions & 4 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -303,14 +303,15 @@ class PyValue {
return value;
}

static Result<int64_t> Convert(const Date64Type*, const O&, I obj) {
static Result<int64_t> Convert(const Date64Type*, const O& options, I obj) {
int64_t value;
if (PyDateTime_Check(obj)) {
auto pydate = reinterpret_cast<PyDateTime_DateTime*>(obj);
value = internal::PyDateTime_to_ms(pydate);
// Truncate any intraday milliseconds
// TODO: introduce an option for this
value -= value % 86400000LL;
// Truncate any intraday milliseconds if the option is enabled
if (options.truncate_date64_time) {
value -= value % 86400000LL;
}
} else if (PyDate_Check(obj)) {
auto pydate = reinterpret_cast<PyDateTime_Date*>(obj);
value = internal::PyDate_to_ms(pydate);