From 3e889fd4c9775bf6726c930b81d314a775e9f12a Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Mon, 5 Jan 2026 17:05:31 +0100 Subject: [PATCH 1/2] Initial commit --- python/pyarrow/src/arrow/python/common.h | 10 ++++ python/pyarrow/src/arrow/python/helpers.cc | 57 +++++++++++++++++++ python/pyarrow/src/arrow/python/helpers.h | 9 +++ python/pyarrow/src/arrow/python/inference.cc | 8 +++ .../src/arrow/python/python_to_arrow.cc | 25 +++++++- python/pyarrow/tests/test_extension_type.py | 52 +++++++++++++++++ 6 files changed, 158 insertions(+), 3 deletions(-) diff --git a/python/pyarrow/src/arrow/python/common.h b/python/pyarrow/src/arrow/python/common.h index affefe2859b..e2de621e66a 100644 --- a/python/pyarrow/src/arrow/python/common.h +++ b/python/pyarrow/src/arrow/python/common.h @@ -419,6 +419,16 @@ struct PyBytesView { return Status::OK(); } + // Parse bytes from a uuid.UUID object: fetches obj's `bytes` attribute and keeps it in `ref` so the viewed data stays alive. NOTE(review): the attribute is read with the unchecked PyBytes_AS_STRING/PyBytes_GET_SIZE macros -- confirm callers only pass real uuid.UUID instances. + Status ParseUuid(PyObject* obj) { + ref.reset(PyObject_GetAttrString(obj, "bytes")); + RETURN_IF_PYERROR(); + bytes = PyBytes_AS_STRING(ref.obj()); + size = PyBytes_GET_SIZE(ref.obj()); + is_utf8 = false; + return Status::OK(); + } + protected: OwnedRef ref; }; diff --git a/python/pyarrow/src/arrow/python/helpers.cc b/python/pyarrow/src/arrow/python/helpers.cc index 0a24b259310..b74ca04e494 100644 --- a/python/pyarrow/src/arrow/python/helpers.cc +++ b/python/pyarrow/src/arrow/python/helpers.cc @@ -296,6 +296,63 @@ bool PyFloat_IsNaN(PyObject* obj) { namespace { +// UUID module static data - lazily initialized on first use +// Uses a conditional initialization strategy: std::once_flag when the GIL is +// disabled, or a simple boolean flag when the GIL is enabled. +// See the Pandas static data section below and ARROW-10519 for more details. 
+#ifdef Py_GIL_DISABLED
+static std::once_flag uuid_static_initialized;
+#else
+static bool uuid_static_initialized = false;
+#endif
+// Cached pointer to the uuid.UUID class; stays nullptr when the lookup fails.
+// NOTE(review): only the raw pointer is kept once the local OwnedRef goes away --
+// presumably the `uuid` module keeps the class alive; confirm.
+static PyObject* uuid_UUID = nullptr;
+
+// Import the `uuid` module and cache its `UUID` attribute in uuid_UUID.
+// Failures are not reported: uuid_UUID simply stays nullptr.
+void GetUuidStaticSymbols() {
+  OwnedRef uuid_module;
+
+  // Import uuid module
+  Status s = ImportModule("uuid", &uuid_module);
+  if (!s.ok()) {
+    return;
+  }
+
+#ifndef Py_GIL_DISABLED
+  // Re-check after the import. NOTE(review): presumably the import can release
+  // the GIL and let another thread finish the initialization first -- confirm.
+  if (uuid_static_initialized) {
+    return;
+  }
+#endif
+
+  OwnedRef ref;
+  if (ImportFromModule(uuid_module.obj(), "UUID", &ref).ok()) {
+    uuid_UUID = ref.obj();
+  }
+}
+
+// Run GetUuidStaticSymbols() once: through std::call_once when the GIL is
+// disabled, through the plain boolean flag otherwise.
+#ifdef Py_GIL_DISABLED
+void InitUuidStaticData() {
+  std::call_once(uuid_static_initialized, GetUuidStaticSymbols);
+}
+#else
+void InitUuidStaticData() {
+  if (uuid_static_initialized) {
+    return;
+  }
+  GetUuidStaticSymbols();
+  uuid_static_initialized = true;
+}
+#endif
+
+}  // namespace
+
+// Return true only when obj is an instance of uuid.UUID.
+// Returns false when uuid.UUID could not be loaded or when the instance check
+// itself fails; in the latter case the pending Python error is cleared, since a
+// bool return value cannot carry it to the caller.
+bool IsPyUuid(PyObject* obj) {
+  InitUuidStaticData();
+  if (uuid_UUID == nullptr) {
+    return false;
+  }
+  // PyObject_IsInstance returns -1 (with an exception set) on failure; converting
+  // that straight to bool would wrongly report "is a UUID".
+  const int is_instance = PyObject_IsInstance(obj, uuid_UUID);
+  if (is_instance < 0) {
+    PyErr_Clear();
+    return false;
+  }
+  return is_instance == 1;
+}
+
+// Return obj's `bytes` attribute as obtained from PyObject_GetAttrString: a new
+// reference owned by the caller, or nullptr with a Python error set on failure.
+PyObject* GetUuidBytes(PyObject* obj) { return PyObject_GetAttrString(obj, "bytes"); }
+
+namespace {
+
 // This needs a conditional, because using std::once_flag could introduce
 // a deadlock when the GIL is enabled.
See // https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for diff --git a/python/pyarrow/src/arrow/python/helpers.h b/python/pyarrow/src/arrow/python/helpers.h index b0cf1010289..630e7663d03 100644 --- a/python/pyarrow/src/arrow/python/helpers.h +++ b/python/pyarrow/src/arrow/python/helpers.h @@ -92,6 +92,15 @@ PyObject* BorrowPandasDataOffsetType(); ARROW_PYTHON_EXPORT bool PyFloat_IsNaN(PyObject* obj); +// \brief Check whether obj is a uuid.UUID instance +ARROW_PYTHON_EXPORT +bool IsPyUuid(PyObject* obj); + +// \brief Get bytes from a uuid.UUID instance +// Returns a new reference to a 16-byte bytes object; the caller must release it +ARROW_PYTHON_EXPORT +PyObject* GetUuidBytes(PyObject* obj); + inline bool IsPyBinary(PyObject* obj) { return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj); } diff --git a/python/pyarrow/src/arrow/python/inference.cc b/python/pyarrow/src/arrow/python/inference.cc index 1aa7915ba1e..601f67a7eb8 100644 --- a/python/pyarrow/src/arrow/python/inference.cc +++ b/python/pyarrow/src/arrow/python/inference.cc @@ -27,6 +27,7 @@ #include #include +#include "arrow/extension/uuid.h" #include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/decimal.h" @@ -344,6 +345,7 @@ class TypeInferrer { arrow_scalar_count_(0), numpy_dtype_count_(0), interval_count_(0), + uuid_count_(0), max_decimal_metadata_(std::numeric_limits::min(), std::numeric_limits::min()), decimal_type_() { @@ -412,6 +414,9 @@ class TypeInferrer { ++decimal_count_; } else if (PyObject_IsInstance(obj, interval_types_.obj())) { ++interval_count_; + } else if (internal::IsPyUuid(obj)) { + ++uuid_count_; + *keep_going = make_unions_; } else { return internal::InvalidValue(obj, "did not recognize Python value type when inferring " @@ -541,6 +546,8 @@ class TypeInferrer { *out = utf8(); } else if (interval_count_) { *out = month_day_nano_interval(); + } else if (uuid_count_) { + *out = extension::uuid(); } else if (arrow_scalar_count_) { *out 
= scalar_type_; } else { @@ -698,6 +705,7 @@ class TypeInferrer { int64_t arrow_scalar_count_; int64_t numpy_dtype_count_; int64_t interval_count_; + int64_t uuid_count_; std::unique_ptr list_inferrer_; std::map struct_inferrers_; std::shared_ptr scalar_type_; diff --git a/python/pyarrow/src/arrow/python/python_to_arrow.cc b/python/pyarrow/src/arrow/python/python_to_arrow.cc index 139eb1d7f4f..f2e69b18b78 100644 --- a/python/pyarrow/src/arrow/python/python_to_arrow.cc +++ b/python/pyarrow/src/arrow/python/python_to_arrow.cc @@ -36,6 +36,7 @@ #include "arrow/array/builder_primitive.h" #include "arrow/array/builder_time.h" #include "arrow/chunked_array.h" +#include "arrow/extension_type.h" #include "arrow/result.h" #include "arrow/scalar.h" #include "arrow/status.h" @@ -512,7 +513,12 @@ class PyValue { static Status Convert(const FixedSizeBinaryType* type, const O&, I obj, PyBytesView& view) { - ARROW_RETURN_NOT_OK(view.ParseString(obj)); + // Check if obj is a uuid.UUID instance + if (type->byte_width() == 16 && internal::IsPyUuid(obj)) { + ARROW_RETURN_NOT_OK(view.ParseUuid(obj)); + } else { + ARROW_RETURN_NOT_OK(view.ParseString(obj)); + } if (view.size != type->byte_width()) { std::stringstream ss; ss << "expected to be length " << type->byte_width() << " was " << view.size; @@ -1268,9 +1274,16 @@ Result> ConvertPySequence(PyObject* obj, PyObject* // In some cases, type inference may be "loose", like strings. If the user // passed pa.string(), then we will error if we encounter any non-UTF8 // value. 
If not, then we will allow the result to be a BinaryArray + std::shared_ptr extension_type; if (options.type == nullptr) { ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas)); options.strict = false; + // If type inference returned an extension type, convert using + // the storage type and then wrap the result as an extension array + if (options.type->id() == Type::EXTENSION) { + extension_type = options.type; + options.type = checked_cast(*options.type).storage_type(); + } } else { options.strict = true; } @@ -1278,6 +1291,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter( options.type, options, pool))); + std::shared_ptr result; if (converter->may_overflow()) { // The converter hierarchy contains binary- or list-like builders which can overflow // depending on the input values. Wrap the converter with a chunker which detects @@ -1288,7 +1302,7 @@ Result> ConvertPySequence(PyObject* obj, PyObject* } else { RETURN_NOT_OK(chunked_converter->Extend(seq, size)); } - return chunked_converter->ToChunkedArray(); + ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray()); } else { // If the converter can't overflow spare the capacity error checking on the hot-path, // this improves the performance roughly by ~10% for primitive types. 
@@ -1297,8 +1311,13 @@ Result> ConvertPySequence(PyObject* obj, PyObject* } else { RETURN_NOT_OK(converter->Extend(seq, size)); } - return converter->ToChunkedArray(); + ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray()); + } + // If we inferred an extension type, wrap as an extension array + if (extension_type != nullptr) { + return ExtensionType::WrapArray(extension_type, result); } + return result; } } // namespace py diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index ebac37e862b..498d131c56e 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1399,6 +1399,58 @@ def test_uuid_extension(): assert isinstance(array[0], pa.UuidScalar) +def test_uuid_scalar_from_python(): + import uuid + + # Test with explicit type + py_uuid = uuid.uuid4() + scalar = pa.scalar(py_uuid, type=pa.uuid()) + assert isinstance(scalar, pa.UuidScalar) + assert scalar.type == pa.uuid() + assert scalar.as_py() == py_uuid + + # Test with specific UUID value + specific_uuid = UUID("12345678-1234-5678-1234-567812345678") + scalar = pa.scalar(specific_uuid, type=pa.uuid()) + assert scalar.as_py() == specific_uuid + assert scalar.value.as_py() == specific_uuid.bytes + + scalar = pa.scalar(None, type=pa.uuid()) + assert scalar.is_valid is False + assert scalar.as_py() is None + + # Test type inference from uuid.UUID + py_uuid = uuid.uuid4() + scalar = pa.scalar(py_uuid) + assert isinstance(scalar, pa.UuidScalar) + assert scalar.type == pa.uuid() + assert scalar.as_py() == py_uuid + + +def test_uuid_array_from_python(): + import uuid + + # Test array with explicit type + uuids = [uuid.uuid4() for _ in range(3)] + uuids.append(None) + + arr = pa.array(uuids, type=pa.uuid()) + assert arr.type == pa.uuid() + assert len(arr) == 4 + assert arr.null_count == 1 + for i, u in enumerate(uuids): + if u is None: + assert arr[i].as_py() is None + else: + assert arr[i].as_py() == u + + # 
Test type inference for arrays + arr = pa.array(uuids) + assert arr.type == pa.uuid() + for i, u in enumerate(uuids): + assert arr[i].as_py() == u + + def test_tensor_type(): tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3]) assert tensor_type.extension_name == "arrow.fixed_shape_tensor" From a4ac8bf7577cf97a9b2d059784b5856677b118aa Mon Sep 17 00:00:00 2001 From: Tadeja Kadunc Date: Thu, 8 Jan 2026 20:41:18 +0100 Subject: [PATCH 2/2] Add UUID docs section --- docs/source/python/extending_types.rst | 44 +++++++++++++++++++-- python/pyarrow/tests/test_extension_type.py | 5 +-- 2 files changed, 42 insertions(+), 7 deletions(-) diff --git a/docs/source/python/extending_types.rst b/docs/source/python/extending_types.rst index 29f0ed55d03..dc538a36582 100644 --- a/docs/source/python/extending_types.rst +++ b/docs/source/python/extending_types.rst @@ -458,8 +458,8 @@ You can find the official list of canonical extension types in the :ref:`format_canonical_extensions` section. Here we add examples on how to use them in PyArrow. -Fixed size tensor -""""""""""""""""" +Fixed shape tensor +"""""""""""""""""" To create an array of tensors with equal shape (fixed shape tensor array) we first need to define a fixed shape tensor extension type with value type @@ -469,7 +469,7 @@ and shape: >>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2)) -Then we need the storage array with :func:`pyarrow.list_` type where ``value_type``` +Then we need the storage array with :func:`pyarrow.list_` type where ``value_type`` is the fixed shape tensor value type and list size is a product of ``tensor_type`` shape elements. Then we can create an array of tensors with ``pa.ExtensionArray.from_storage()`` method: @@ -609,3 +609,41 @@ for ``NCHW`` format where: * C: number of channels of the image * H: height of the image * W: width of the image + +UUID +"""" + +The UUID extension type (``arrow.uuid``) represents universally unique +identifiers as 16-byte fixed-size binary values. 
PyArrow provides integration +with Python's built-in :mod:`uuid` module, including automatic type inference. + +Creating UUID scalars and arrays +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +PyArrow infers the UUID type from Python's ``uuid.UUID`` objects, +so you can pass them directly to :func:`pyarrow.scalar` and :func:`pyarrow.array`: + +.. code-block:: python + + >>> import uuid + >>> import pyarrow as pa + + >>> pa.scalar(uuid.uuid4()) + <pyarrow.UuidScalar: UUID('...')> + + >>> uuids = [uuid.uuid4() for _ in range(3)] + >>> arr = pa.array(uuids) + >>> arr.type + UuidType(extension<arrow.uuid>) + +You can also explicitly specify the UUID type using :func:`pyarrow.uuid`: + +.. code-block:: python + + >>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid()) + <pyarrow.lib.UuidArray object at ...> + [ + 77C17B9296554636A54C6A7EF37A70E4, + B71D2BF764374A60A1DEECB102A77B16 + ] + diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index 498d131c56e..52e0aa5063a 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -1439,10 +1439,7 @@ def test_uuid_array_from_python(): assert len(arr) == 4 assert arr.null_count == 1 for i, u in enumerate(uuids): - if u is None: - assert arr[i].as_py() is None - else: - assert arr[i].as_py() == u + assert arr[i].as_py() == u # Test type inference for arrays arr = pa.array(uuids)