From ad4bf106316ef8611ffd9b9406f38cbeb897164d Mon Sep 17 00:00:00 2001 From: zhangstar333 Date: Mon, 29 Dec 2025 09:36:02 +0800 Subject: [PATCH] [opt](varbinary) support multi_distinct_count about varbinary and add conf mapping iceberg uuid (#59406) ### What problem does this PR solve? doc https://github.com/apache/doris-website/pull/3231 1. The iceberg uuid type now uses the enable.varbinary.mapping conf to control its mapped type; by default it is mapped to the string type. 2. Because the binary type cannot be used as a group by key, count(distinct binary) currently throws an error; this PR adds support for the binary type in the multi_distinct_count agg function. --- .../aggregate_function_uniq.cpp | 6 +++--- .../aggregate_function_uniq.h | 9 +++++---- .../datasource/iceberg/IcebergUtils.java | 2 +- .../iceberg/test_iceberg_varbinary.out | Bin 1531 -> 1572 bytes .../iceberg/test_iceberg_varbinary.groovy | 8 ++++++++ 5 files changed, 17 insertions(+), 8 deletions(-) diff --git a/be/src/vec/aggregate_functions/aggregate_function_uniq.cpp b/be/src/vec/aggregate_functions/aggregate_function_uniq.cpp index 4a6ecc42e5783e..126fec0032ef7e 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_uniq.cpp +++ b/be/src/vec/aggregate_functions/aggregate_function_uniq.cpp @@ -40,9 +40,9 @@ AggregateFunctionPtr create_aggregate_function_uniq(const std::string& name, return creator_with_type_list< TYPE_BOOLEAN, TYPE_TINYINT, TYPE_SMALLINT, TYPE_INT, TYPE_BIGINT, TYPE_LARGEINT, TYPE_DECIMAL32, TYPE_DECIMAL64, TYPE_DECIMAL128I, TYPE_DECIMAL256, TYPE_VARCHAR, - TYPE_ARRAY, TYPE_FLOAT, TYPE_DOUBLE, TYPE_DATEV2, TYPE_DATETIMEV2, - TYPE_TIMESTAMPTZ>::create(argument_types, - result_is_nullable, attr); + TYPE_ARRAY, TYPE_FLOAT, TYPE_DOUBLE, TYPE_DATEV2, TYPE_DATETIMEV2, TYPE_TIMESTAMPTZ, + TYPE_VARBINARY>::create(argument_types, result_is_nullable, + attr); } void register_aggregate_function_uniq(AggregateFunctionSimpleFactory& factory) { diff --git a/be/src/vec/aggregate_functions/aggregate_function_uniq.h 
b/be/src/vec/aggregate_functions/aggregate_function_uniq.h index f0108de4a1a64d..60d2484c3f250d 100644 --- a/be/src/vec/aggregate_functions/aggregate_function_uniq.h +++ b/be/src/vec/aggregate_functions/aggregate_function_uniq.h @@ -28,6 +28,7 @@ #include #include "common/compiler_util.h" // IWYU pragma: keep +#include "runtime/primitive_type.h" #include "vec/aggregate_functions/aggregate_function.h" #include "vec/columns/column.h" #include "vec/columns/column_vector.h" @@ -56,7 +57,7 @@ class ColumnDecimal; template struct AggregateFunctionUniqExactData { - static constexpr bool is_string_key = is_string_type(T); + static constexpr bool is_string_key = is_string_type(T) || is_varbinary(T); using Key = std::conditional_t< is_string_key, UInt128, std::conditional_t struct OneAdder { static void ALWAYS_INLINE add(Data& data, const IColumn& column, size_t row_num) { - if constexpr (is_string_type(T)) { + if constexpr (is_string_type(T) || is_varbinary(T)) { StringRef value = column.get_data_at(row_num); data.set.insert(Data::get_key(value)); } else if constexpr (T == TYPE_ARRAY) { @@ -119,7 +120,7 @@ class AggregateFunctionUniq final NotNullableAggregateFunction { public: using KeyType = - std::conditional_t::ColumnItemType>>; AggregateFunctionUniq(const DataTypes& argument_types_) @@ -138,7 +139,7 @@ class AggregateFunctionUniq final static ALWAYS_INLINE const KeyType* get_keys(std::vector& keys_container, const IColumn& column, size_t batch_size) { - if constexpr (is_string_type(T)) { + if constexpr (is_string_type(T) || is_varbinary(T)) { keys_container.resize(batch_size); for (size_t i = 0; i != batch_size; ++i) { StringRef value = column.get_data_at(i); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java index 001a9a85903df0..c6fb0143dd23cf 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/iceberg/IcebergUtils.java @@ -572,7 +572,7 @@ private static Type icebergPrimitiveTypeToDorisType(org.apache.iceberg.types.Typ case STRING: return Type.STRING; case UUID: - return ScalarType.createVarbinaryType(16); + return enableMappingVarbinary ? ScalarType.createVarbinaryType(16) : Type.STRING; case BINARY: return enableMappingVarbinary ? ScalarType.createVarbinaryType(VarBinaryType.MAX_VARBINARY_LENGTH) : Type.STRING; diff --git a/regression-test/data/external_table_p0/iceberg/test_iceberg_varbinary.out b/regression-test/data/external_table_p0/iceberg/test_iceberg_varbinary.out index 122979181ac5333013ec70392719067c8552cba5..5845c71a460b7011e9fe13eafef1a14f66978c03 100644 GIT binary patch delta 43 scmey(y@Y4OZ`R5GSh(bk3>9>Bxr{lDxwv$76%>n8b5fH_jErD{03rbk3jhEB delta 7 OcmZ3&^P79aZ&m;e=>ul~ diff --git a/regression-test/suites/external_table_p0/iceberg/test_iceberg_varbinary.groovy b/regression-test/suites/external_table_p0/iceberg/test_iceberg_varbinary.groovy index b9b82f915e51ff..9db7b5fb9348e6 100644 --- a/regression-test/suites/external_table_p0/iceberg/test_iceberg_varbinary.groovy +++ b/regression-test/suites/external_table_p0/iceberg/test_iceberg_varbinary.groovy @@ -149,4 +149,12 @@ suite("test_iceberg_varbinary", "p0,external,doris,external_docker,external_dock qt_select19 """ select * from test_ice_uuid_parquet_write_with_mapping order by id; """ + + qt_select21 """ + select multi_distinct_count(col2),multi_distinct_count(col1) from test_ice_uuid_orc; + """ + + qt_select22 """ + select multi_distinct_count(col2),multi_distinct_count(col1) from test_ice_uuid_parquet; + """ }