Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 41 additions & 3 deletions docs/source/python/extending_types.rst
Original file line number Diff line number Diff line change
Expand Up @@ -458,8 +458,8 @@ You can find the official list of canonical extension types in the
:ref:`format_canonical_extensions` section. Here we add examples on how to
use them in PyArrow.

Fixed size tensor
"""""""""""""""""
Fixed shape tensor
""""""""""""""""""

To create an array of tensors with equal shape (fixed shape tensor array) we
first need to define a fixed shape tensor extension type with value type
Expand All @@ -469,7 +469,7 @@ and shape:

>>> tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 2))

Then we need the storage array with :func:`pyarrow.list_` type where ``value_type```
Then we need the storage array with :func:`pyarrow.list_` type where ``value_type``
is the fixed shape tensor value type and list size is a product of ``tensor_type``
shape elements. Then we can create an array of tensors with
``pa.ExtensionArray.from_storage()`` method:
Expand Down Expand Up @@ -609,3 +609,41 @@ for ``NCHW`` format where:
* C: number of channels of the image
* H: height of the image
* W: width of the image

UUID
""""

The UUID extension type (``arrow.uuid``) represents universally unique
identifiers as 16-byte fixed-size binary values. PyArrow provides integration
with Python's built-in :mod:`uuid` module, including automatic type inference.

Creating UUID scalars and arrays
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

PyArrow infers the UUID type from Python's ``uuid.UUID`` objects,
so you can pass them directly to :func:`pyarrow.scalar` and :func:`pyarrow.array`:

.. code-block:: python

>>> import uuid
>>> import pyarrow as pa

>>> pa.scalar(uuid.uuid4())
<pyarrow.UuidScalar: UUID('59c67eec-f171-4f6f-898b-7b4cdbd2821d')>

>>> uuids = [uuid.uuid4() for _ in range(3)]
>>> arr = pa.array(uuids)
>>> arr.type
UuidType(extension<arrow.uuid>)

You can also explicitly specify the UUID type using :func:`pyarrow.uuid`:

.. code-block:: python

>>> pa.array([uuid.uuid4(), uuid.uuid4()], type=pa.uuid())
<pyarrow.lib.UuidArray object at ...>
[
77C17B9296554636A54C6A7EF37A70E4,
B71D2BF764374A60A1DEECB102A77B16
]

10 changes: 10 additions & 0 deletions python/pyarrow/src/arrow/python/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,16 @@ struct PyBytesView {
return Status::OK();
}

// Parse bytes from a uuid.UUID object (stores reference to keep bytes alive)
//
// Looks up the ``bytes`` attribute of obj and points this view at its
// contents. The attribute object is held in ``ref`` so that the ``bytes``
// pointer stays valid for the lifetime of the view (or until the next parse).
// Returns an error Status if the attribute lookup raises, or if the attribute
// is not an actual Python bytes object.
Status ParseUuid(PyObject* obj) {
  ref.reset(PyObject_GetAttrString(obj, "bytes"));
  RETURN_IF_PYERROR();
  // PyBytes_AS_STRING / PyBytes_GET_SIZE perform no type checking, so make
  // sure we really got a bytes object before using them.
  if (!PyBytes_Check(ref.obj())) {
    return Status::TypeError("Expected 'bytes' attribute to be a bytes object, got a '",
                             Py_TYPE(ref.obj())->tp_name, "' object");
  }
  bytes = PyBytes_AS_STRING(ref.obj());
  size = PyBytes_GET_SIZE(ref.obj());
  is_utf8 = false;
  return Status::OK();
}

protected:
OwnedRef ref;
};
Expand Down
57 changes: 57 additions & 0 deletions python/pyarrow/src/arrow/python/helpers.cc
Original file line number Diff line number Diff line change
Expand Up @@ -296,6 +296,63 @@ bool PyFloat_IsNaN(PyObject* obj) {

namespace {

// UUID module static data - lazily initialized on first use
// Uses a conditional initialization strategy: std::once_flag when the GIL is
// disabled, or a simple boolean flag when the GIL is enabled.
// See the Pandas static data section below and ARROW-10519 for more details.
#ifdef Py_GIL_DISABLED
// Free-threaded build: initialization is funneled through std::call_once.
static std::once_flag uuid_static_initialized;
#else
// GIL build: plain flag, set after GetUuidStaticSymbols() has been attempted.
static bool uuid_static_initialized = false;
#endif
// Cached uuid.UUID class object; stays nullptr if the lookup did not succeed.
static PyObject* uuid_UUID = nullptr;

// Import the ``uuid`` module and cache the ``uuid.UUID`` class in uuid_UUID.
//
// Failures are not reported: if either the module import or the attribute
// lookup fails, uuid_UUID is left untouched (nullptr on first use).
void GetUuidStaticSymbols() {
  OwnedRef uuid_module;

  // Import uuid module
  Status s = ImportModule("uuid", &uuid_module);
  if (!s.ok()) {
    return;
  }

#ifndef Py_GIL_DISABLED
  // Re-check after the import.
  // NOTE(review): presumably mirrors the Pandas static data code, where the
  // import may let another thread complete initialization first - confirm.
  if (uuid_static_initialized) {
    return;
  }
#endif

  OwnedRef ref;
  if (ImportFromModule(uuid_module.obj(), "UUID", &ref).ok()) {
    // Take ownership of the reference instead of copying the raw pointer:
    // `ref` would otherwise release it when it goes out of scope, leaving the
    // process-lifetime cache holding only a borrowed pointer.
    uuid_UUID = ref.detach();
  }
}

// Ensure the UUID static data has been looked up, doing the work at most once.
// The guard mechanism depends on the build: std::call_once when the GIL is
// disabled, a plain boolean flag otherwise.
void InitUuidStaticData() {
#ifdef Py_GIL_DISABLED
  std::call_once(uuid_static_initialized, GetUuidStaticSymbols);
#else
  if (!uuid_static_initialized) {
    GetUuidStaticSymbols();
    // Marked as done even if the lookup failed, so it is not retried.
    uuid_static_initialized = true;
  }
#endif
}

} // namespace

// Return true if obj is an instance of uuid.UUID, false otherwise.
//
// Also returns false when the uuid.UUID class could not be resolved, or when
// the instance check itself raises; in the latter case the Python error is
// cleared, since this predicate has no way to report it to the caller.
bool IsPyUuid(PyObject* obj) {
  InitUuidStaticData();
  if (uuid_UUID == nullptr) {
    return false;
  }
  // PyObject_IsInstance returns 1 / 0, or -1 with an exception set. Converting
  // that directly to bool would treat the error case as "is a UUID".
  const int is_instance = PyObject_IsInstance(obj, uuid_UUID);
  if (is_instance < 0) {
    PyErr_Clear();
    return false;
  }
  return is_instance == 1;
}

// Look up the ``bytes`` attribute of obj and hand the result back unchanged.
// PyObject_GetAttrString yields a new reference on success and nullptr (with
// a Python exception set) on failure; the caller owns the returned reference.
PyObject* GetUuidBytes(PyObject* obj) {
  PyObject* uuid_bytes = PyObject_GetAttrString(obj, "bytes");
  return uuid_bytes;
}

namespace {

// This needs a conditional, because using std::once_flag could introduce
// a deadlock when the GIL is enabled. See
// https://github.com/apache/arrow/commit/f69061935e92e36e25bb891177ca8bc4f463b272 for
Expand Down
9 changes: 9 additions & 0 deletions python/pyarrow/src/arrow/python/helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,15 @@ PyObject* BorrowPandasDataOffsetType();
ARROW_PYTHON_EXPORT
bool PyFloat_IsNaN(PyObject* obj);

// \brief Check whether obj is a uuid.UUID instance
// The uuid.UUID class is resolved lazily on the first call.
ARROW_PYTHON_EXPORT
bool IsPyUuid(PyObject* obj);

// \brief Get bytes from a uuid.UUID instance
// Returns a new reference to the object's ``bytes`` attribute (expected to be
// a 16-byte bytes object), which the caller must release. Returns nullptr
// with a Python exception set if the attribute lookup fails.
ARROW_PYTHON_EXPORT
PyObject* GetUuidBytes(PyObject* obj);

inline bool IsPyBinary(PyObject* obj) {
return PyBytes_Check(obj) || PyByteArray_Check(obj) || PyMemoryView_Check(obj);
}
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/src/arrow/python/inference.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include <utility>
#include <vector>

#include "arrow/extension/uuid.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
#include "arrow/util/decimal.h"
Expand Down Expand Up @@ -344,6 +345,7 @@ class TypeInferrer {
arrow_scalar_count_(0),
numpy_dtype_count_(0),
interval_count_(0),
uuid_count_(0),
max_decimal_metadata_(std::numeric_limits<int32_t>::min(),
std::numeric_limits<int32_t>::min()),
decimal_type_() {
Expand Down Expand Up @@ -412,6 +414,9 @@ class TypeInferrer {
++decimal_count_;
} else if (PyObject_IsInstance(obj, interval_types_.obj())) {
++interval_count_;
} else if (internal::IsPyUuid(obj)) {
++uuid_count_;
*keep_going = make_unions_;
} else {
return internal::InvalidValue(obj,
"did not recognize Python value type when inferring "
Expand Down Expand Up @@ -541,6 +546,8 @@ class TypeInferrer {
*out = utf8();
} else if (interval_count_) {
*out = month_day_nano_interval();
} else if (uuid_count_) {
*out = extension::uuid();
} else if (arrow_scalar_count_) {
*out = scalar_type_;
} else {
Expand Down Expand Up @@ -698,6 +705,7 @@ class TypeInferrer {
int64_t arrow_scalar_count_;
int64_t numpy_dtype_count_;
int64_t interval_count_;
int64_t uuid_count_;
std::unique_ptr<TypeInferrer> list_inferrer_;
std::map<std::string, TypeInferrer> struct_inferrers_;
std::shared_ptr<DataType> scalar_type_;
Expand Down
25 changes: 22 additions & 3 deletions python/pyarrow/src/arrow/python/python_to_arrow.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@
#include "arrow/array/builder_primitive.h"
#include "arrow/array/builder_time.h"
#include "arrow/chunked_array.h"
#include "arrow/extension_type.h"
#include "arrow/result.h"
#include "arrow/scalar.h"
#include "arrow/status.h"
Expand Down Expand Up @@ -512,7 +513,12 @@ class PyValue {

static Status Convert(const FixedSizeBinaryType* type, const O&, I obj,
PyBytesView& view) {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
// Check if obj is a uuid.UUID instance
if (type->byte_width() == 16 && internal::IsPyUuid(obj)) {
ARROW_RETURN_NOT_OK(view.ParseUuid(obj));
} else {
ARROW_RETURN_NOT_OK(view.ParseString(obj));
}
if (view.size != type->byte_width()) {
std::stringstream ss;
ss << "expected to be length " << type->byte_width() << " was " << view.size;
Expand Down Expand Up @@ -1268,16 +1274,24 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
// In some cases, type inference may be "loose", like strings. If the user
// passed pa.string(), then we will error if we encounter any non-UTF8
// value. If not, then we will allow the result to be a BinaryArray
std::shared_ptr<DataType> extension_type;
if (options.type == nullptr) {
ARROW_ASSIGN_OR_RAISE(options.type, InferArrowType(seq, mask, options.from_pandas));
options.strict = false;
// If type inference returned an extension type, convert using
// the storage type and then wrap the result as an extension array
if (options.type->id() == Type::EXTENSION) {
extension_type = options.type;
options.type = checked_cast<const ExtensionType&>(*options.type).storage_type();
}
} else {
options.strict = true;
}
ARROW_DCHECK_GE(size, 0);

ARROW_ASSIGN_OR_RAISE(auto converter, (MakeConverter<PyConverter, PyConverterTrait>(
options.type, options, pool)));
std::shared_ptr<ChunkedArray> result;
if (converter->may_overflow()) {
// The converter hierarchy contains binary- or list-like builders which can overflow
// depending on the input values. Wrap the converter with a chunker which detects
Expand All @@ -1288,7 +1302,7 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(chunked_converter->Extend(seq, size));
}
return chunked_converter->ToChunkedArray();
ARROW_ASSIGN_OR_RAISE(result, chunked_converter->ToChunkedArray());
} else {
// If the converter can't overflow spare the capacity error checking on the hot-path,
// this improves the performance roughly by ~10% for primitive types.
Expand All @@ -1297,8 +1311,13 @@ Result<std::shared_ptr<ChunkedArray>> ConvertPySequence(PyObject* obj, PyObject*
} else {
RETURN_NOT_OK(converter->Extend(seq, size));
}
return converter->ToChunkedArray();
ARROW_ASSIGN_OR_RAISE(result, converter->ToChunkedArray());
}
// If we inferred an extension type, wrap as an extension array
if (extension_type != nullptr) {
return ExtensionType::WrapArray(extension_type, result);
}
return result;
}

} // namespace py
Expand Down
49 changes: 49 additions & 0 deletions python/pyarrow/tests/test_extension_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,6 +1399,55 @@ def test_uuid_extension():
assert isinstance(array[0], pa.UuidScalar)


def test_uuid_scalar_from_python():
    """Build UUID scalars from Python uuid.UUID objects, typed and inferred."""
    import uuid

    # Test with explicit type
    py_uuid = uuid.uuid4()
    scalar = pa.scalar(py_uuid, type=pa.uuid())
    assert isinstance(scalar, pa.UuidScalar)
    assert scalar.type == pa.uuid()
    assert scalar.as_py() == py_uuid

    # Test with specific UUID value.
    # Use the locally imported module rather than a bare ``UUID`` name so the
    # test does not depend on what the file imports at module level.
    specific_uuid = uuid.UUID("12345678-1234-5678-1234-567812345678")
    scalar = pa.scalar(specific_uuid, type=pa.uuid())
    assert scalar.as_py() == specific_uuid
    assert scalar.value.as_py() == specific_uuid.bytes

    # A null scalar of UUID type
    scalar = pa.scalar(None, type=pa.uuid())
    assert scalar.is_valid is False
    assert scalar.as_py() is None

    # Test type inference from uuid.UUID
    py_uuid = uuid.uuid4()
    scalar = pa.scalar(py_uuid)
    assert isinstance(scalar, pa.UuidScalar)
    assert scalar.type == pa.uuid()
    assert scalar.as_py() == py_uuid


def test_uuid_array_from_python():
    """Build UUID arrays from Python uuid.UUID objects, typed and inferred."""
    import uuid

    # Three random UUIDs followed by a single null entry
    expected = [uuid.uuid4() for _ in range(3)]
    expected.append(None)

    # Conversion with the extension type given explicitly
    typed = pa.array(expected, type=pa.uuid())
    assert typed.type == pa.uuid()
    assert len(typed) == 4
    assert typed.null_count == 1
    for position in range(len(expected)):
        assert typed[position].as_py() == expected[position]

    # Conversion relying on type inference
    inferred = pa.array(expected)
    assert inferred.type == pa.uuid()
    for position in range(len(expected)):
        assert inferred[position].as_py() == expected[position]


def test_tensor_type():
tensor_type = pa.fixed_shape_tensor(pa.int8(), [2, 3])
assert tensor_type.extension_name == "arrow.fixed_shape_tensor"
Expand Down
Loading