diff --git a/src/typeagent/aitools/vectorbase.py b/src/typeagent/aitools/vectorbase.py index 267b576a..d4f84c21 100644 --- a/src/typeagent/aitools/vectorbase.py +++ b/src/typeagent/aitools/vectorbase.py @@ -1,7 +1,7 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -from collections.abc import Callable +from collections.abc import Callable, Iterable from dataclasses import dataclass import numpy as np @@ -156,35 +156,17 @@ def fuzzy_lookup_embedding( min_score = 0.0 if len(self._vectors) == 0: return [] - scores = np.dot(self._vectors, embedding) - - if predicate is None: - # Fast numpy path: filter and top-k without Python-level iteration. - indices = np.flatnonzero(scores >= min_score) - if len(indices) == 0: - return [] - filtered_scores = scores[indices] - if len(indices) <= max_hits: - order = np.argsort(filtered_scores)[::-1] - else: - # argpartition is O(n) vs O(n log n) for full sort. - top_k = np.argpartition(filtered_scores, -max_hits)[-max_hits:] - order = top_k[np.argsort(filtered_scores[top_k])[::-1]] - return [ - ScoredInt(int(indices[i]), float(filtered_scores[i])) for i in order - ] - else: - # Predicate path: pre-filter by score in numpy, then apply predicate - # only to candidates that pass the score threshold. - candidates = np.flatnonzero(scores >= min_score) - scored_ordinals = [ - ScoredInt(int(i), float(scores[i])) - for i in candidates - if predicate(int(i)) - ] - scored_ordinals.sort(key=lambda x: x.score, reverse=True) - return scored_ordinals[:max_hits] + # This line does most of the work: + scores: Iterable[float] = np.dot(self._vectors, embedding) + scored_ordinals = [ + ScoredInt(i, score) + for i, score in enumerate(scores) + if score >= min_score and (predicate is None or predicate(i)) + ] + scored_ordinals.sort(key=lambda x: x.score, reverse=True) + return scored_ordinals[:max_hits] + # TODO: Make this and fuzzy_lookup_embedding() more similar. def fuzzy_lookup_embedding_in_subset( self, embedding: NormalizedEmbedding, @@ -192,27 +174,10 @@ def fuzzy_lookup_embedding_in_subset( max_hits: int | None = None, min_score: float | None = None, ) -> list[ScoredInt]: - if max_hits is None: - max_hits = 10 - if min_score is None: - min_score = 0.0 - if not ordinals_of_subset or len(self._vectors) == 0: - return [] - # Compute dot products only for the subset instead of all vectors. - subset = np.asarray(ordinals_of_subset) - scores = np.dot(self._vectors[subset], embedding) - indices = np.flatnonzero(scores >= min_score) - if len(indices) == 0: - return [] - filtered_scores = scores[indices] - if len(indices) <= max_hits: - order = np.argsort(filtered_scores)[::-1] - else: - top_k = np.argpartition(filtered_scores, -max_hits)[-max_hits:] - order = top_k[np.argsort(filtered_scores[top_k])[::-1]] - return [ - ScoredInt(int(subset[indices[i]]), float(filtered_scores[i])) for i in order - ] + ordinals_set = set(ordinals_of_subset) + return self.fuzzy_lookup_embedding( + embedding, max_hits, min_score, lambda i: i in ordinals_set + ) async def fuzzy_lookup( self, diff --git a/tests/benchmarks/test_benchmark_vectorbase.py b/tests/benchmarks/test_benchmark_vectorbase.py deleted file mode 100644 index b61859a8..00000000 --- a/tests/benchmarks/test_benchmark_vectorbase.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. - -"""Benchmarks for VectorBase fuzzy lookup methods. - -Measures fuzzy_lookup_embedding and fuzzy_lookup_embedding_in_subset -with varying vector counts and result sizes. -""" - -import numpy as np -import pytest - -from typeagent.aitools.model_adapters import create_test_embedding_model -from typeagent.aitools.vectorbase import TextEmbeddingIndexSettings, VectorBase - -EMBEDDING_DIM = 384 # Typical small embedding model dimension - - -def make_populated_vector_base(n_vectors: int) -> tuple[VectorBase, np.ndarray]: - """Create a VectorBase with n_vectors random normalized embeddings.""" - settings = TextEmbeddingIndexSettings(create_test_embedding_model()) - vb = VectorBase(settings) - rng = np.random.default_rng(42) - embeddings = rng.standard_normal((n_vectors, EMBEDDING_DIM)).astype(np.float32) - # Normalize to unit vectors (as the real pipeline does). - norms = np.linalg.norm(embeddings, axis=1, keepdims=True) - embeddings = embeddings / norms - vb.add_embeddings(None, embeddings) - # Query vector: also normalized. - query = rng.standard_normal(EMBEDDING_DIM).astype(np.float32) - query = query / np.linalg.norm(query) - return vb, query - - -# --- fuzzy_lookup_embedding --- - - -@pytest.mark.asyncio -async def test_benchmark_fuzzy_lookup_1k(async_benchmark): - vb, query = make_populated_vector_base(1_000) - - async def target(): - vb.fuzzy_lookup_embedding(query, max_hits=10, min_score=0.0) - - await async_benchmark.pedantic(target, rounds=200, warmup_rounds=20) - - -@pytest.mark.asyncio -async def test_benchmark_fuzzy_lookup_10k(async_benchmark): - vb, query = make_populated_vector_base(10_000) - - async def target(): - vb.fuzzy_lookup_embedding(query, max_hits=10, min_score=0.0) - - await async_benchmark.pedantic(target, rounds=200, warmup_rounds=20) - - -@pytest.mark.asyncio -async def test_benchmark_fuzzy_lookup_10k_with_predicate(async_benchmark): - vb, query = make_populated_vector_base(10_000) - # Predicate that accepts ~50% of indices. - even_only = lambda i: i % 2 == 0 - - async def target(): - vb.fuzzy_lookup_embedding( - query, max_hits=10, min_score=0.0, predicate=even_only - ) - - await async_benchmark.pedantic(target, rounds=200, warmup_rounds=20) - - -# --- fuzzy_lookup_embedding_in_subset --- - - -@pytest.mark.asyncio -async def test_benchmark_fuzzy_lookup_subset_1k_of_10k(async_benchmark): - vb, query = make_populated_vector_base(10_000) - subset = list(range(0, 10_000, 10)) # 1000 indices - - async def target(): - vb.fuzzy_lookup_embedding_in_subset(query, subset, max_hits=10, min_score=0.0) - - await async_benchmark.pedantic(target, rounds=200, warmup_rounds=20)