Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 40 additions & 24 deletions src/typeagent/aitools/vectorbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,46 @@
)
from .model_adapters import create_embedding_model

DEFAULT_MIN_SCORE = 0.85

# Empirical defaults for built-in OpenAI embedding models.
# Derived from repeated runs of `tools/repeat_embedding_benchmarks.py` against
# the Adrian Tchaikovsky Episode 53 search benchmark, sweeping min_score
# exhaustively over 0.01..1.00. For each model we keep the highest min_score
# that still preserves the best benchmark metrics, which yields plateau
# boundaries of 0.16 for `text-embedding-3-small`, 0.07 for
# `text-embedding-3-large`, and 0.72 for `text-embedding-ada-002`. These are
# repository defaults for known models, not universal truths: unknown models
# fall back to the long-standing 0.85, and callers can always override
# `min_score` explicitly for their own use cases or models. `max_matches` is
# intentionally absent from this table — the benchmark still reports a best
# `max_hits` row, but the library default remains `None` unless a caller opts
# into a specific limit.
MODEL_DEFAULT_MIN_SCORES: dict[str, float] = {
    "text-embedding-3-large": 0.07,
    "text-embedding-3-small": 0.16,
    "text-embedding-ada-002": 0.72,
}


def get_default_min_score(model_name: str) -> float:
    """Return the repository default score cutoff for a known model name."""

    try:
        return MODEL_DEFAULT_MIN_SCORES[model_name]
    except KeyError:
        # Unknown or unnamed model: keep the historical fallback cutoff.
        return DEFAULT_MIN_SCORE


@dataclass
class ScoredInt:
    """Associate an integer ordinal with its similarity score."""

    # Ordinal identifying the item (e.g. a row index into the vector store).
    item: int
    # Similarity score for the item; higher means a closer match.
    # NOTE(review): elsewhere in this file scores come from an embedding dot
    # product, so values are presumably in roughly [-1.0, 1.0] — confirm.
    score: float


@dataclass
class TextEmbeddingIndexSettings:
"""Runtime settings for embedding-backed fuzzy lookup."""

embedding_model: IEmbeddingModel
min_score: float # Between 0.0 and 1.0
max_matches: int | None # >= 1; None means no limit
Expand All @@ -34,10 +65,12 @@ def __init__(
max_matches: int | None = None,
batch_size: int | None = None,
):
self.min_score = min_score if min_score is not None else 0.85
self.embedding_model = embedding_model or create_embedding_model()
model_name = getattr(self.embedding_model, "model_name", "")
default_min_score = get_default_min_score(model_name)
self.min_score = min_score if min_score is not None else default_min_score
self.max_matches = max_matches if max_matches and max_matches >= 1 else None
self.batch_size = batch_size if batch_size and batch_size >= 1 else 8
self.embedding_model = embedding_model or create_embedding_model()


class VectorBase:
Expand Down Expand Up @@ -166,27 +199,10 @@ def fuzzy_lookup_embedding_in_subset(
max_hits: int | None = None,
min_score: float | None = None,
) -> list[ScoredInt]:
if max_hits is None:
max_hits = 10
if min_score is None:
min_score = 0.0
if not ordinals_of_subset or len(self._vectors) == 0:
return []
# Compute dot products only for the subset instead of all vectors.
subset = np.asarray(ordinals_of_subset)
scores = np.dot(self._vectors[subset], embedding)
indices = np.flatnonzero(scores >= min_score)
if len(indices) == 0:
return []
filtered_scores = scores[indices]
if len(indices) <= max_hits:
order = np.argsort(filtered_scores)[::-1]
else:
top_k = np.argpartition(filtered_scores, -max_hits)[-max_hits:]
order = top_k[np.argsort(filtered_scores[top_k])[::-1]]
return [
ScoredInt(int(subset[indices[i]]), float(filtered_scores[i])) for i in order
]
ordinals_set = set(ordinals_of_subset)
return self.fuzzy_lookup_embedding(
embedding, max_hits, min_score, lambda i: i in ordinals_set
)

async def fuzzy_lookup(
self,
Expand Down Expand Up @@ -235,7 +251,7 @@ def deserialize(self, data: NormalizedEmbeddings | None) -> None:
return
if self._embedding_size == 0:
if data.ndim < 2 or data.shape[0] == 0:
# Empty data can't determine size; just clear.
# Empty data can't determine size; just clear.
self.clear()
return
self._set_embedding_size(data.shape[1])
Expand Down
103 changes: 103 additions & 0 deletions tests/test_benchmark_embeddings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

from importlib.util import module_from_spec, spec_from_file_location
from pathlib import Path

import pytest

MODULE_PATH = (
    Path(__file__).resolve().parent.parent / "tools" / "benchmark_embeddings.py"
)

# Load the benchmark script as an ad-hoc module so its helpers are testable
# without turning `tools/` into an importable package.
SPEC = spec_from_file_location("benchmark_embeddings_for_test", MODULE_PATH)
assert SPEC is not None and SPEC.loader is not None
BENCHMARK_EMBEDDINGS = module_from_spec(SPEC)
SPEC.loader.exec_module(BENCHMARK_EMBEDDINGS)

# Re-export the helpers under test so the tests below read like plain imports.
BenchmarkRow = BENCHMARK_EMBEDDINGS.BenchmarkRow
SearchMetrics = BENCHMARK_EMBEDDINGS.SearchMetrics
build_float_range = BENCHMARK_EMBEDDINGS.build_float_range
filter_min_scores_by_ceiling = BENCHMARK_EMBEDDINGS.filter_min_scores_by_ceiling
load_message_texts = BENCHMARK_EMBEDDINGS.load_message_texts
parse_float_list = BENCHMARK_EMBEDDINGS.parse_float_list
resolve_min_scores = BENCHMARK_EMBEDDINGS.resolve_min_scores
select_best_row = BENCHMARK_EMBEDDINGS.select_best_row


def make_row(
    min_score: float,
    max_hits: int,
    hit_rate: float,
    mean_reciprocal_rank: float,
) -> BenchmarkRow:
    """Construct a BenchmarkRow, hiding the nested SearchMetrics boilerplate."""

    metrics = SearchMetrics(
        hit_rate=hit_rate,
        mean_reciprocal_rank=mean_reciprocal_rank,
    )
    return BenchmarkRow(min_score=min_score, max_hits=max_hits, metrics=metrics)


def test_select_best_row_prefers_higher_min_score_on_metric_tie() -> None:
    """On identical metrics, the row with the larger min_score should win."""

    lower_cutoff = make_row(0.25, 15, 98.5, 0.7514)
    higher_cutoff = make_row(0.70, 15, 98.5, 0.7514)

    best_row = select_best_row([lower_cutoff, higher_cutoff])

    assert best_row.min_score == 0.70
    assert best_row.max_hits == 15


def test_select_best_row_prefers_lower_max_hits_on_full_tie() -> None:
    """When min_score and metrics all tie, the smaller max_hits should win."""

    best_row = select_best_row(
        [
            make_row(0.70, 20, 98.5, 0.7514),
            make_row(0.70, 15, 98.5, 0.7514),
        ]
    )

    assert best_row.min_score == 0.70
    assert best_row.max_hits == 15


def test_parse_float_list_defaults_to_tenth_point_grid() -> None:
    """A missing spec should fall back to the 0.1..0.9 tenth-point grid."""

    expected_grid = [i / 10 for i in range(1, 10)]

    assert parse_float_list(None) == expected_grid


def test_build_float_range_supports_hundredth_point_sweeps() -> None:
    """A 0.01 step should produce an inclusive hundredth-point grid."""

    values = build_float_range(0.01, 0.05, 0.01)

    assert values == [0.01, 0.02, 0.03, 0.04, 0.05]


def test_resolve_min_scores_uses_generated_range() -> None:
    """With no explicit list, the start/stop/step range is generated."""

    resolved = resolve_min_scores(None, 0.01, 0.03, 0.01)

    assert resolved == [0.01, 0.02, 0.03]


def test_resolve_min_scores_rejects_mixed_inputs() -> None:
    """Passing both an explicit list and a range spec should raise."""

    with pytest.raises(ValueError, match="Use either --min-scores"):
        resolve_min_scores("0.1,0.2", 0.01, 0.03, 0.01)


def test_filter_min_scores_by_ceiling_skips_guaranteed_zero_rows() -> None:
    """Candidate scores above the ceiling should be split out as skipped."""

    candidates = [0.01, 0.16, 0.17, 0.5]

    effective_scores, skipped_scores = filter_min_scores_by_ceiling(candidates, 0.16)

    assert effective_scores == [0.01, 0.16]
    assert skipped_scores == [0.17, 0.5]


def test_load_message_texts_returns_one_text_blob_per_message() -> None:
    """Loading the bundled dataset should yield a non-empty list of strings."""

    repo_root = Path(__file__).resolve().parent.parent
    message_texts = load_message_texts(repo_root)

    assert len(message_texts) > 0
    assert all(isinstance(text, str) for text in message_texts)
Loading