diff --git a/Makefile b/Makefile
index 828680f3..67298868 100644
--- a/Makefile
+++ b/Makefile
@@ -40,6 +40,10 @@ ifndef VST_ENABLED
 	VST_ENABLED := "false"
 endif
 
+ifndef ENABLE_VECTOR_INDEX
+	ENABLE_VECTOR_INDEX := "true"
+endif
+
 TESTV2PARALLEL ?= 4
 
 ORGPATH := github.com/arangodb
@@ -491,7 +495,7 @@ endif
 	@-docker rm -f -v $(TESTCONTAINER) &> /dev/null
 	@TESTCONTAINER=$(TESTCONTAINER) ARANGODB=$(ARANGODB) ALPINE_IMAGE=$(ALPINE_IMAGE) ENABLE_BACKUP=$(ENABLE_BACKUP) \
 		ARANGO_LICENSE_KEY=$(ARANGO_LICENSE_KEY) STARTER=$(STARTER) STARTERMODE=$(TEST_MODE) TMPDIR="${TMPDIR}" \
-		ENABLE_DATABASE_EXTRA_FEATURES=$(ENABLE_DATABASE_EXTRA_FEATURES) DEBUG_PORT=$(DEBUG_PORT) $(CLUSTERENV) DOCKER_NETWORK=${TEST_NET} "${ROOTDIR}/test/cluster.sh" start
+		ENABLE_DATABASE_EXTRA_FEATURES=$(ENABLE_DATABASE_EXTRA_FEATURES) ENABLE_VECTOR_INDEX=$(ENABLE_VECTOR_INDEX) DEBUG_PORT=$(DEBUG_PORT) $(CLUSTERENV) DOCKER_NETWORK=${TEST_NET} "${ROOTDIR}/test/cluster.sh" start
 endif
 
 __test_cleanup:
diff --git a/test/cluster.sh b/test/cluster.sh
index b5019941..6b430416 100755
--- a/test/cluster.sh
+++ b/test/cluster.sh
@@ -52,6 +52,9 @@ if [ "$CMD" == "start" ]; then
     if [ -n "$ENABLE_DATABASE_EXTRA_FEATURES" ]; then
         STARTERARGS="$STARTERARGS --all.database.extended-names-databases=true --args.all.http.compress-response-threshold=1 --args.all.http.handle-content-encoding-for-unauthenticated-requests=true"
     fi
+    if [ "$ENABLE_VECTOR_INDEX" == "true" ]; then
+        STARTERARGS="$STARTERARGS --args.all.experimental-vector-index=true"
+    fi
     # Use DOCKER_PLATFORM if set (e.g., from CircleCI for ARM), otherwise use macOS default
     if [ -n "$DOCKER_PLATFORM" ]; then
         DOCKERPLATFORMARG="$DOCKER_PLATFORM"
diff --git a/v2/CHANGELOG.md b/v2/CHANGELOG.md
index 34e96634..24ccae42 100644
--- a/v2/CHANGELOG.md
+++ b/v2/CHANGELOG.md
@@ -5,6 +5,7 @@
 - Add ARM Support for V2 testcases
 - Set TESTV2PARALLEL from 1 to 4
 - Disabled V8 related testcases in V1 and V2
+- Add Vector index feature
 
 ## [2.1.6](https://github.com/arangodb/go-driver/tree/v2.1.6) (2025-11-06)
 - Add missing endpoints from replication
diff --git a/v2/arangodb/collection_indexes.go b/v2/arangodb/collection_indexes.go
index 067a6609..7a3b0363 100644
--- a/v2/arangodb/collection_indexes.go
+++ b/v2/arangodb/collection_indexes.go
@@ -80,6 +80,12 @@ type CollectionIndexes interface {
 
 	// DeleteIndexByID deletes an index from the collection.
 	DeleteIndexByID(ctx context.Context, id string) error
+
+	// EnsureVectorIndex creates a vector index in the collection, if it does not already exist.
+	// The index is returned, together with a boolean indicating if the index was newly created (true) or pre-existing (false).
+	// Available in ArangoDB 3.12.4 and later.
+	// fields must name exactly one attribute; params is obligatory and must set at least Dimension, Metric and NLists.
+	EnsureVectorIndex(ctx context.Context, fields []string, params *VectorParams, options *CreateVectorIndexOptions) (IndexResponse, bool, error)
 }
 
 // IndexType represents an index type as string
@@ -118,6 +124,9 @@ const (
 
 	// InvertedIndexType can be used to speed up a broad range of AQL queries, from simple to complex, including full-text search
 	InvertedIndexType = IndexType("inverted")
+
+	// VectorIndexType is used for efficient similarity searches on high-dimensional embeddings, enabling fast and scalable AI use cases.
+	VectorIndexType = IndexType("vector")
 )
 
 // IndexResponse is the response from the Index list method
@@ -135,6 +144,9 @@ type IndexResponse struct {
 
 	// InvertedIndex is the inverted index object. It is not empty only for InvertedIndex type.
 	InvertedIndex *InvertedIndexOptions `json:"invertedIndexes"`
+
+	// VectorIndex is the vector index params. It is not empty only for VectorIndex type.
+	VectorIndex *VectorParams `json:"params,omitempty"`
 }
 
 // IndexSharedOptions contains options that are shared between all index types
@@ -310,3 +322,47 @@ type CreateMDIPrefixedIndexOptions struct {
 	// Array expansions are not allowed.
 	PrefixFields []string `json:"prefixFields,required"`
 }
+
+// CreateVectorIndexOptions contains the optional settings for creating a vector index.
+type CreateVectorIndexOptions struct {
+	// Allow writes during creation.
+	InBackground *bool `json:"inBackground,omitempty"`
+	// Optional index name.
+	Name *string `json:"name,omitempty"`
+	// Number of threads to use for index creation.
+	Parallelism *int `json:"parallelism,omitempty"`
+	// Exclude docs missing the field.
+	Sparse *bool `json:"sparse,omitempty"`
+	// Introduced in v3.12.7
+	// Up to 32 additional attributes can be stored in the index.
+	StoredValues []string `json:"storedValues,omitempty"`
+}
+
+// VectorParams contains the vector-specific parameters of a vector index.
+type VectorParams struct {
+	// Neighbors considered in search.
+	DefaultNProbe *int `json:"defaultNProbe,omitempty"`
+	// Vector length. Must be set and greater than zero when creating an index.
+	Dimension *int `json:"dimension,omitempty"`
+	// Faiss factory string.
+	Factory *string `json:"factory,omitempty"`
+	// Similarity measure. Must be set when creating an index.
+	Metric *VectorMetric `json:"metric,omitempty"`
+	// Number of centroids. Must be set and greater than zero when creating an index.
+	NLists *int `json:"nLists,omitempty"`
+	// Faiss training iterations.
+	TrainingIterations *int `json:"trainingIterations,omitempty"`
+}
+
+// VectorMetric defines the type of similarity metric for vector comparison.
+type VectorMetric string
+
+const (
+	// Cosine similarity between vectors.
+	VectorMetricCosine VectorMetric = "cosine"
+	// Introduced in v3.12.6
+	// Inner product similarity.
+	VectorMetricInnerProduct VectorMetric = "innerProduct"
+	// Euclidean (L2) distance between vectors.
+	VectorMetricL2 VectorMetric = "l2"
+)
diff --git a/v2/arangodb/collection_indexes_impl.go b/v2/arangodb/collection_indexes_impl.go
index e75a547c..0cf30bfd 100644
--- a/v2/arangodb/collection_indexes_impl.go
+++ b/v2/arangodb/collection_indexes_impl.go
@@ -306,23 +306,120 @@ func (i *IndexResponse) UnmarshalJSON(data []byte) error {
 	i.Name = respSimple.Name
 	i.Type = respSimple.Type
 
-	if respSimple.Type == InvertedIndexType {
+	switch respSimple.Type {
+	case InvertedIndexType:
 		result := responseInvertedIndex{}
 		if err := json.Unmarshal(data, &result); err != nil {
 			return err
 		}
-
 		i.IndexSharedOptions = result.IndexSharedOptions
 		i.InvertedIndex = &result.InvertedIndexOptions
-	} else {
+	case VectorIndexType:
+		result := responseVectorIndex{}
+		if err := json.Unmarshal(data, &result); err != nil {
+			return err
+		}
+		i.IndexSharedOptions = result.IndexSharedOptions
+		i.VectorIndex = result.Params
+	default:
 		result := responseIndex{}
 		if err := json.Unmarshal(data, &result); err != nil {
 			return err
 		}
-
 		i.IndexSharedOptions = result.IndexSharedOptions
 		i.RegularIndex = &result.IndexOptions
 	}
+	return nil
+}
+
+// validate checks that p carries the parameters needed to create a vector index:
+// a positive Dimension, one of the known Metric values and a positive NLists.
+// It returns an error describing the first violation found; a nil p is a violation.
+func (p *VectorParams) validate() error {
+	if p == nil {
+		return errors.New("params must be provided for vector index")
+	}
+
+	if p.Dimension == nil || *p.Dimension <= 0 {
+		return errors.New("params.Dimension must be provided and greater than zero for vector index")
+	}
+
+	if p.Metric == nil {
+		return errors.New("params.Metric must be provided for vector index")
+	}
+
+	switch *p.Metric {
+	case VectorMetricCosine, VectorMetricL2, VectorMetricInnerProduct:
+		// valid
+	default:
+		return errors.New("params.Metric must be one of 'cosine', 'l2', or 'innerProduct' for vector index")
+	}
+
+	if p.NLists == nil || *p.NLists <= 0 {
+		return errors.New("params.NLists must be provided and greater than zero for vector index")
+	}
 
 	return nil
 }
+
+// EnsureVectorIndex creates a vector index on the single attribute named in fields, if it does not already exist.
+// It returns the index together with a boolean that is true when the index was newly created.
+// An error is returned without contacting the server when fields does not hold exactly one entry or params is invalid.
+func (c *collectionIndexes) EnsureVectorIndex(
+	ctx context.Context,
+	fields []string,
+	params *VectorParams,
+	options *CreateVectorIndexOptions,
+) (IndexResponse, bool, error) {
+	if len(fields) != 1 {
+		return IndexResponse{}, false, errors.New("vector index requires exactly one field")
+	}
+	if err := params.validate(); err != nil {
+		return IndexResponse{}, false, err
+	}
+
+	// options is embedded so that its fields are serialized at the top level of the request body;
+	// encoding/json skips a nil embedded pointer, so passing nil options adds no fields.
+	reqData := struct {
+		Type   IndexType     `json:"type"`
+		Fields []string      `json:"fields"`
+		Params *VectorParams `json:"params"`
+		*CreateVectorIndexOptions
+	}{
+		Type:                     VectorIndexType,
+		Fields:                   fields,
+		Params:                   params,
+		CreateVectorIndexOptions: options,
+	}
+
+	result := responseVectorIndex{}
+	created, err := c.ensureIndex(ctx, &reqData, &result)
+	if err != nil {
+		return IndexResponse{}, false, err
+	}
+
+	return newVectorIndexResponse(&result), created, nil
+}
+
+// responseVectorIndex is the decoding target for a vector index as returned by the server.
+type responseVectorIndex struct {
+	Name               string    `json:"name,omitempty"`
+	Type               IndexType `json:"type"`
+	IndexSharedOptions `json:",inline"`
+	Params             *VectorParams `json:"params,omitempty"`
+}
+
+// newVectorIndexResponse converts a decoded vector index response into the public IndexResponse.
+// A nil res yields the zero IndexResponse.
+func newVectorIndexResponse(res *responseVectorIndex) IndexResponse {
+	if res == nil {
+		return IndexResponse{}
+	}
+
+	return IndexResponse{
+		Name:               res.Name,
+		Type:               res.Type,
+		IndexSharedOptions: res.IndexSharedOptions,
+		VectorIndex:        res.Params,
+	}
+}
diff --git a/v2/tests/database_collection_indexes_test.go b/v2/tests/database_collection_indexes_test.go
index b9763974..e3fef066 100644
--- a/v2/tests/database_collection_indexes_test.go
+++ b/v2/tests/database_collection_indexes_test.go
@@ -371,6 +371,31 @@ func Test_NamedIndexes(t *testing.T) {
 		WithDatabase(t, client, nil, func(db arangodb.Database) {
 			WithCollectionV2(t, db, nil, func(col arangodb.Collection) {
 				withContextT(t, defaultTestTimeout, func(ctx context.Context, _ testing.TB) {
+					docs := []map[string]interface{}{
+						{
+							"pername":      "persistent-name",
+							"geo":          []float64{12.9716, 77.5946},
+							"createdAt":    time.Now().Unix(),
+							"mkd":          1.23,
+							"mkd-prefixed": 4.56,
+							"prefix":       "p1",
+							"vectorfield":  []float64{0.1, 0.2, 0.3},
+							"text":         "first document",
+						},
+						{
+							"pername":      "persistent-name-2",
+							"geo":          []float64{13.0827, 80.2707},
+							"createdAt":    time.Now().Unix(),
+							"mkd":          2.34,
+							"mkd-prefixed": 5.67,
+							"prefix":       "p2",
+							"vectorfield":  []float64{0.4, 0.5, 0.6},
+							"text":         "second document",
+						},
+					}
+
+					_, err := col.CreateDocuments(ctx, docs)
+					require.NoError(t, err)
 
 					var namedIndexTestCases = []struct {
 						Name           string
@@ -444,6 +469,20 @@ func Test_NamedIndexes(t *testing.T) {
 								return idx, err
 							},
 						},
+						{
+							Name:       "Vector",
+							MinVersion: "3.12.4",
+							CreateCallback: func(col arangodb.Collection, name string) (arangodb.IndexResponse, error) {
+								params := &arangodb.VectorParams{
+									Dimension: utils.NewType(3),
+									Metric:    utils.NewType(arangodb.VectorMetricCosine),
+									NLists:    utils.NewType(1),
+								}
+								idx, _, err := col.EnsureVectorIndex(ctx, []string{"vectorfield"},
+									params, &arangodb.CreateVectorIndexOptions{Name: &name})
+								return idx, err
+							},
+						},
 					}
 
 					for _, testCase := range namedIndexTestCases {
@@ -453,9 +492,13 @@ func Test_NamedIndexes(t *testing.T) {
 							}
 
 							idx, err := testCase.CreateCallback(col, testCase.Name)
-							require.NoError(t, err)
+							require.NoError(t, err, "failed to create %s index", testCase.Name)
 							require.Equal(t, testCase.Name, idx.Name)
-
+							defer func() {
+								if idx.ID != "" {
+									_ = col.DeleteIndexByID(ctx, idx.ID) // Ignore errors in tests
+								}
+							}()
 							indexes, err := col.Indexes(ctx)
 							require.NoError(t, err)
 							require.NotNil(t, indexes)
@@ -469,3 +512,172 @@ func Test_NamedIndexes(t *testing.T) {
 		})
 	})
 }
+
+func Test_EnsureVectorIndex(t *testing.T) {
+	Wrap(t, func(t *testing.T, client arangodb.Client) {
+		WithDatabase(t, client, nil, func(db arangodb.Database) {
+			WithCollectionV2(t, db, nil, func(col arangodb.Collection) {
+				withContextT(t, defaultTestTimeout, func(ctx context.Context, _ testing.TB) {
+					skipBelowVersion(client, ctx, "3.12.4", t)
+					dimension := 3
+					metric := arangodb.VectorMetricCosine
+					nLists := 1 // or 2, but <= number of docs
+
+					params := &arangodb.VectorParams{
+						Dimension: &dimension,
+						Metric:    &metric,
+						NLists:    &nLists,
+					}
+
+					// Vector indexes require documents to be present for training
+					// Create sample documents with embeddings
+					docs := []map[string]interface{}{
+						{"embedding": []float64{0.1, 0.2, 0.3}, "text": "first document"},
+						{"embedding": []float64{0.4, 0.5, 0.6}, "text": "second document"},
+						{"embedding": []float64{0.7, 0.8, 0.9}, "text": "third document"},
+					}
+
+					_, err := col.CreateDocuments(ctx, docs)
+					require.NoError(t, err, "failed to create sample documents for vector index training")
+
+					t.Run("Create Vector Index", func(t *testing.T) {
+						idx, created, err := col.EnsureVectorIndex(
+							ctx,
+							[]string{"embedding"},
+							params,
+							&arangodb.CreateVectorIndexOptions{
+								Name: utils.NewType("my_vector_index"),
+							},
+						)
+						require.NoError(t, err)
+						require.True(t, created, "index should be created on first call")
+						require.Equal(t, arangodb.VectorIndexType, idx.Type)
+						require.NotNil(t, idx.VectorIndex)
+						require.Equal(t, dimension, *idx.VectorIndex.Dimension)
+						require.Equal(t, metric, *idx.VectorIndex.Metric)
+					})
+
+					t.Run("Create the same index again", func(t *testing.T) {
+						idx, created, err := col.EnsureVectorIndex(
+							ctx,
+							[]string{"embedding"},
+							params,
+							nil,
+						)
+						require.NoError(t, err)
+						defer func() {
+							if idx.ID != "" {
+								_ = col.DeleteIndexByID(ctx, idx.ID) // Ignore errors in cleanup
+							}
+						}()
+						require.False(t, created, "index should already exist")
+						require.Equal(t, arangodb.VectorIndexType, idx.Type)
+					})
+
+					t.Run("Invalid Vector Index Params", func(t *testing.T) {
+						invalidParams := &arangodb.VectorParams{Dimension: utils.NewType(-1)}
+						_, _, err := col.EnsureVectorIndex(ctx, []string{"embedding"}, invalidParams, nil)
+						require.Error(t, err, "Should fail with invalid dimension")
+					})
+
+					var idx arangodb.IndexResponse
+
+					t.Run("Create Vector Index with storedValues", func(t *testing.T) {
+						skipBelowVersion(client, ctx, "3.12.7", t)
+						options := &arangodb.CreateVectorIndexOptions{
+							StoredValues: []string{"text"},
+						}
+						var err error
+						idx, _, err = col.EnsureVectorIndex(ctx, []string{"embedding"}, params, options)
+						require.NoError(t, err)
+						require.Equal(t, arangodb.VectorIndexType, idx.Type)
+					})
+
+					if idx.ID == "" {
+						t.Skip("Index not created, skipping dependent tests")
+					}
+					defer func() {
+						if idx.ID != "" {
+							_ = col.DeleteIndexByID(ctx, idx.ID) // Ignore errors in cleanup
+						}
+					}()
+					// Run explain in a separate subtest
+					t.Run("storedValues_are_used_for_filter", func(t *testing.T) {
+						skipBelowVersion(client, ctx, "3.12.7", t)
+
+						query := fmt.Sprintf(
+							"FOR d IN `%s`\n"+
+								" SORT APPROX_NEAR_COSINE(d.embedding, @vector) DESC\n"+
+								" LIMIT 1\n"+
+								" FILTER d.text == @text\n"+
+								" RETURN d",
+							col.Name(),
+						)
+
+						bindVars := map[string]interface{}{
+							"text":   "first document",
+							"vector": []float64{0.1, 0.2, 0.3},
+						}
+
+						explain, err := db.ExplainQuery(ctx, query, bindVars, nil)
+						require.NoError(t, err)
+						require.Contains(t, explain.Plan.Rules, "use-vector-index")
+
+						found := false
+						for _, node := range explain.Plan.NodesRaw {
+							if nodeType, ok := node["type"].(string); ok && nodeType == "EnumerateNearVectorNode" {
+								found = true
+								break
+							}
+						}
+						if !found {
+							t.Logf("Execution plan: %+v", explain.Plan)
+						}
+						require.True(t, found, "expected EnumerateNearVectorNode in execution plan")
+					})
+
+					t.Run("vector_index_with_storedValues_and_indexHint_is_used", func(t *testing.T) {
+						skipBelowVersion(client, ctx, "3.12.7", t)
+						// indexHint and forceIndexHint for vector indexes supported by 3.12.7+
+						// Query using indexHint + forceIndexHint
+						query := `
+							FOR d IN @@col OPTIONS {
+								indexHint: [@idxName],
+								forceIndexHint: true
+							}
+							SORT APPROX_NEAR_COSINE(d.embedding, @vector) DESC
+							LIMIT 1
+							RETURN d
+						`
+						bindVars := map[string]interface{}{
+							"@col":    col.Name(),
+							"idxName": idx.Name,
+							"vector":  []float64{0.1, 0.2, 0.3},
+						}
+
+						// Explain the query instead of running it.
+						explain, err := db.ExplainQuery(ctx, query, bindVars, nil)
+						require.NoError(t, err)
+
+						// The optimizer must have applied the vector index rule.
+						require.Contains(t, explain.Plan.Rules, "use-vector-index")
+
+						// The plan must contain an EnumerateNearVectorNode.
+						found := false
+						for _, node := range explain.Plan.NodesRaw {
+							if nodeType, ok := node["type"].(string); ok && nodeType == "EnumerateNearVectorNode" {
+								found = true
+								break
+							}
+						}
+						if !found {
+							t.Logf("Execution plan: %+v", explain.Plan)
+						}
+						require.True(t, found, "expected EnumerateNearVectorNode in execution plan")
+					})
+
+				})
+			})
+		})
+	})
+}