Skip to content

Conversation

@HyukjinKwon
Copy link
Member

@HyukjinKwon HyukjinKwon commented Dec 12, 2025

Rationale for this change

// Date64Type is millisecond timestamp stored as int64_t
// TODO(wesm): Do we want to make sure to zero out the milliseconds?

// TODO: introduce an option for this

What changes are included in this PR?

This PR adds an option for truncating intraday milliseconds in Date64, which is disabled by default for pandas conversion, and enabled by default for Python conversion to avoid breaking changes.

Are these changes tested?

Yes, unit tests were added, and they were run as below:

pytest pyarrow/tests/test_pandas.py

Are there any user-facing changes?

Not by default. It adds a new option, `truncate_date64_time`.

(Generated by ChatGPT)

| Conversion Type | Default Behavior | With Explicit Option | Option Value | Result |
|---|---|---|---|---|
| Python sequences → Arrow (`pa.array()`) | Truncates time | Preserves time | `truncate_date64_time=False` | int64: 946684800000 (truncated) → 946730096123 (preserved) |
| NumPy arrays → Arrow (`pa.array()`) | Truncates time | Preserves time | `truncate_date64_time=False` | int64: 946684800000 (truncated) → 946730096123 (preserved) |
| Pandas Series → Arrow (`pa.array()` with `from_pandas=True`) | Truncates time | Preserves time | `truncate_date64_time=False` | int64: 946684800000 (truncated) → 946730096123 (preserved) |
| Arrow → Pandas (`to_pandas()`) | Preserves time | Truncates time | `truncate_date64_time=True` | 2018-05-10 00:02:03.456000 (preserved) → 2018-05-10 00:00:00 (truncated) |
import datetime
import pyarrow as pa

# Two inputs on the same calendar day: one carrying a time-of-day component and
# one at exactly midnight. Every section converts both to date64 and compares.
dt_with_time = datetime.datetime(2000, 1, 1, 12, 34, 56, 123456)
dt_date_only = datetime.datetime(2000, 1, 1)


def _report(heading, with_time, date_only):
    """Print the first element of each date64 array as a raw int64, then
    whether the two arrays compare equal."""
    print(heading)
    print(f"  int64: {with_time.view('int64')[0].as_py()}")
    print(f"  int64: {date_only.view('int64')[0].as_py()}")
    print(f"  Equal? {with_time.equals(date_only)}")


# ============================================================================
# 1. Python sequences (lists)
# ============================================================================

# Default: the time of day is dropped.
# Expected: 946684800000, 946684800000, Equal? True
_report(
    "Python sequences - BEFORE (default):",
    pa.array([dt_with_time], type=pa.date64()),
    pa.array([dt_date_only], type=pa.date64()),
)

# Explicit truncate_date64_time=False: the time of day is kept.
# Expected: 946730096123, 946684800000, Equal? False
_report(
    "Python sequences - AFTER (truncate_date64_time=False):",
    pa.array([dt_with_time], type=pa.date64(), truncate_date64_time=False),
    pa.array([dt_date_only], type=pa.date64(), truncate_date64_time=False),
)

# ============================================================================
# 2. NumPy arrays
# ============================================================================

import numpy as np

arr_numpy = np.array([dt_with_time], dtype=object)
arr_numpy_date_only = np.array([dt_date_only], dtype=object)

# Default (array() truncates unless told otherwise).
# Expected: 946684800000, 946684800000, Equal? True
_report(
    "\nNumPy arrays - BEFORE (default):",
    pa.array(arr_numpy, type=pa.date64()),
    pa.array(arr_numpy_date_only, type=pa.date64()),
)

# Explicit truncate_date64_time=False: the time of day is kept.
# Expected: 946730096123, 946684800000, Equal? False
_report(
    "NumPy arrays - AFTER (truncate_date64_time=False):",
    pa.array(arr_numpy, type=pa.date64(), truncate_date64_time=False),
    pa.array(arr_numpy_date_only, type=pa.date64(), truncate_date64_time=False),
)

# ============================================================================
# 3. Pandas Series
# ============================================================================

import pandas as pd

series_pandas = pd.Series([dt_with_time], dtype=object)
series_pandas_date_only = pd.Series([dt_date_only], dtype=object)

# Default (array() truncates unless told otherwise).
# Expected: 946684800000, 946684800000, Equal? True
_report(
    "\nPandas Series - BEFORE (default):",
    pa.array(series_pandas, type=pa.date64(), from_pandas=True),
    pa.array(series_pandas_date_only, type=pa.date64(), from_pandas=True),
)

# Explicit truncate_date64_time=False: the time of day is kept.
# Expected: 946730096123, 946684800000, Equal? False
_report(
    "Pandas Series - AFTER (truncate_date64_time=False):",
    pa.array(series_pandas, type=pa.date64(), from_pandas=True, truncate_date64_time=False),
    pa.array(series_pandas_date_only, type=pa.date64(), from_pandas=True, truncate_date64_time=False),
)

# ============================================================================
# 4. Arrow to Pandas conversion (to_pandas)
# ============================================================================

milliseconds_at_midnight = 1525910400000  # 2018-05-10 00:00:00
milliseconds_with_time = milliseconds_at_midnight + 123456  # 2018-05-10 00:02:03.456

arr_arrow = pa.array(
    [milliseconds_at_midnight, milliseconds_with_time],
    type=pa.date64(),
)

# Default: to_pandas() keeps the intraday part.
# Expected: 2018-05-10 00:00:00 and 2018-05-10 00:02:03.456000
result_before = arr_arrow.to_pandas(date_as_object=False)
print("\nArrow to Pandas - BEFORE (default):")
print(f"  arr.to_pandas(date_as_object=False)[0] = {result_before[0]}")
print(f"  arr.to_pandas(date_as_object=False)[1] = {result_before[1]}")

# Explicit truncate_date64_time=True: both values collapse to midnight.
# Expected: 2018-05-10 00:00:00 twice
result_after = arr_arrow.to_pandas(date_as_object=False, truncate_date64_time=True)
print("Arrow to Pandas - AFTER (truncate_date64_time=True):")
print(f"  arr.to_pandas(date_as_object=False, truncate_date64_time=True)[0] = {result_after[0]}")
print(f"  arr.to_pandas(date_as_object=False, truncate_date64_time=True)[1] = {result_after[1]}")
Python sequences - BEFORE (default):
  int64: 946684800000
  int64: 946684800000
  Equal? True
Python sequences - AFTER (truncate_date64_time=False):
  int64: 946730096123
  int64: 946684800000
  Equal? False

NumPy arrays - BEFORE (default):
  int64: 946684800000
  int64: 946684800000
  Equal? True
NumPy arrays - AFTER (truncate_date64_time=False):
  int64: 946730096123
  int64: 946684800000
  Equal? False

Pandas Series - BEFORE (default):
  int64: 946684800000
  int64: 946684800000
  Equal? True
Pandas Series - AFTER (truncate_date64_time=False):
  int64: 946730096123
  int64: 946684800000
  Equal? False

Arrow to Pandas - BEFORE (default):
  arr.to_pandas(date_as_object=False)[0] = 2018-05-10 00:00:00
  arr.to_pandas(date_as_object=False)[1] = 2018-05-10 00:02:03.456000
Arrow to Pandas - AFTER (truncate_date64_time=True):
  arr.to_pandas(date_as_object=False, truncate_date64_time=True)[0] = 2018-05-10 00:00:00
  arr.to_pandas(date_as_object=False, truncate_date64_time=True)[1] = 2018-05-10 00:00:00

@github-actions
Copy link

⚠️ GitHub issue #48465 has been automatically assigned in GitHub to PR creator.

@alippai
Copy link
Contributor

alippai commented Dec 15, 2025

By spec Date64 should be limited to full day values in arrow

@alippai
Copy link
Contributor

alippai commented Dec 15, 2025

Interesting, the arr.to_pandas(date_as_object=False) docs also say it should be the appropriate time unit (which is D in this case, not ms).

Overall I'm not a fan of introducing a slower conversion for managing a case violating the spec.

@HyukjinKwon
Copy link
Member Author

@alippai Thanks for reviewing this. I am fine with keeping the original behaviour as is and adding a switch. That is actually another TODO for the Python conversion at:

// TODO: introduce an option for this

If that's preferred, I can add a switch on both the Python and Arrow conversion sides, and keep the original behaviour as the default (True for the Python conversion, and False for the Arrow conversion).

Otherwise, we can also simply just remove this todo as well.

@AlenkaF
Copy link
Member

AlenkaF commented Dec 23, 2025

cc @rok pinging in case you have any opinions on this topic.

@rok
Copy link
Member

rok commented Dec 23, 2025

I don't have a strong opinion on this either way. Avoiding a performance regression by making this non-default behavior seems like a good idea at this point.

@HyukjinKwon
Copy link
Member Author

Yeah let me work on it 👍

@HyukjinKwon HyukjinKwon marked this pull request as draft December 23, 2025 11:07
@HyukjinKwon HyukjinKwon changed the title GH-48465: [Python] Truncate intraday milliseconds in Date64 to pandas conversion GH-48672, GH-48465: [Python] Add an option for truncating intraday milliseconds in Date64 Dec 29, 2025
@github-actions
Copy link

⚠️ GitHub issue #48672 has been automatically assigned in GitHub to PR creator.

@HyukjinKwon HyukjinKwon force-pushed the truncate-millies branch 5 times, most recently from 2ffa9a0 to 7e8eb86 Compare December 29, 2025 07:23
@HyukjinKwon HyukjinKwon marked this pull request as ready for review December 29, 2025 07:25
@HyukjinKwon
Copy link
Member Author

This PR should be ready for a look.

@alippai
Copy link
Contributor

alippai commented Dec 29, 2025

Looks good, thanks for the change

// Date64Type is millisecond timestamp
if (this->options_.truncate_date64_time) {
// Truncate intraday milliseconds
ConvertDatetimeWithTruncation<1L>(*data, out_values);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we avoid computing the ... * 1L for each value in the array when SHIFT == 1? Or will the compiler optimize this away?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From my understanding, the compiler will optimize the multiplication away as a no-op, but to make sure, I changed the code slightly to use `if constexpr` for the factor-of-1 case. The branch is then resolved at compile time, as documented for the C++ language, so no multiplication is emitted for that case.

Comment on lines 1550 to 1568
/// \brief Copy the int64 values of every chunk into out_values, dropping the
/// intraday part of each value and then multiplying by SHIFT.
///
/// Each non-null value is floored to a multiple of kMillisecondsInDay, so a
/// value before the epoch maps to the start of its own day rather than to the
/// start of the following day. Null slots are written as kPandasTimestampNull.
///
/// \tparam SHIFT multiplier applied to the truncated value (a scale factor,
///         not a bit shift)
/// \param[in] data chunked array whose chunks are read as int64 values
/// \param[out] out_values destination buffer; one value is written per input
///             element and the pointer is advanced as it goes, so the caller
///             must provide room for every element of every chunk
template <int64_t SHIFT>
inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
  for (int c = 0; c < data.num_chunks(); c++) {
    const auto& arr = *data.chunk(c);
    const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
    for (int64_t i = 0; i < arr.length(); ++i) {
      if (arr.IsNull(i)) {
        *out_values++ = kPandasTimestampNull;
        continue;
      }
      // C++ `%` truncates toward zero, so the remainder is negative for
      // negative inputs. Shift it into [0, kMillisecondsInDay) so that the
      // subtraction below floors instead of rounding toward zero.
      int64_t intraday = in_values[i] % kMillisecondsInDay;
      if (intraday < 0) {
        intraday += kMillisecondsInDay;
      }
      *out_values++ = (in_values[i] - intraday) * SHIFT;
    }
  }
}
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The name SHIFT sounds like we are bit-shifting, whereas this is really a multiplication factor.

Suggested change
/// \brief Copy the int64 values of every chunk into out_values, dropping the
/// intraday part of each value and then multiplying by SHIFT.
///
/// Each non-null value is floored to a multiple of kMillisecondsInDay, so a
/// value before the epoch maps to the start of its own day rather than to the
/// start of the following day. Null slots are written as kPandasTimestampNull.
///
/// \tparam SHIFT multiplier applied to the truncated value (a scale factor,
///         not a bit shift)
/// \param[in] data chunked array whose chunks are read as int64 values
/// \param[out] out_values destination buffer; one value is written per input
///             element and the pointer is advanced as it goes, so the caller
///             must provide room for every element of every chunk
template <int64_t SHIFT>
inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
  for (int c = 0; c < data.num_chunks(); c++) {
    const auto& arr = *data.chunk(c);
    const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
    for (int64_t i = 0; i < arr.length(); ++i) {
      if (arr.IsNull(i)) {
        *out_values++ = kPandasTimestampNull;
        continue;
      }
      // C++ `%` truncates toward zero, so the remainder is negative for
      // negative inputs. Shift it into [0, kMillisecondsInDay) so that the
      // subtraction below floors instead of rounding toward zero.
      int64_t intraday = in_values[i] % kMillisecondsInDay;
      if (intraday < 0) {
        intraday += kMillisecondsInDay;
      }
      *out_values++ = (in_values[i] - intraday) * SHIFT;
    }
  }
}
/// \brief Copy the int64 values of every chunk into out_values, dropping the
/// intraday part of each value and then multiplying by FACTOR.
///
/// Each non-null value is floored to a multiple of kMillisecondsInDay, so a
/// value before the epoch maps to the start of its own day rather than to the
/// start of the following day. Null slots are written as kPandasTimestampNull.
///
/// \tparam FACTOR multiplier applied to the truncated value
/// \param[in] data chunked array whose chunks are read as int64 values
/// \param[out] out_values destination buffer; one value is written per input
///             element and the pointer is advanced as it goes, so the caller
///             must provide room for every element of every chunk
template <int64_t FACTOR>
inline void ConvertDatetimeWithTruncation(const ChunkedArray& data, int64_t* out_values) {
  for (int c = 0; c < data.num_chunks(); c++) {
    const auto& arr = *data.chunk(c);
    const int64_t* in_values = GetPrimitiveValues<int64_t>(arr);
    for (int64_t i = 0; i < arr.length(); ++i) {
      if (arr.IsNull(i)) {
        *out_values++ = kPandasTimestampNull;
        continue;
      }
      // C++ `%` truncates toward zero, so the remainder is negative for
      // negative inputs. Shift it into [0, kMillisecondsInDay) so that the
      // subtraction below floors instead of rounding toward zero.
      int64_t intraday = in_values[i] % kMillisecondsInDay;
      if (intraday < 0) {
        intraday += kMillisecondsInDay;
      }
      *out_values++ = (in_values[i] - intraday) * FACTOR;
    }
  }
}

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this naming exists in ConvertDatetime as well :-(.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah .. let me just keep it consistent for now

@github-actions github-actions bot added awaiting committer review Awaiting committer review and removed awaiting review Awaiting review labels Jan 6, 2026
Copy link
Collaborator

@EnricoMi EnricoMi left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM!

@HyukjinKwon
Copy link
Member Author

@AlenkaF do you mind taking a look when you find some time? I believe I resolved all comments. Now it does not change any default behaviour 🫡

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

Projects

None yet

Development

Successfully merging this pull request may close these issues.

5 participants