From 2e6ebd907a7d1502981af07692365139602c0233 Mon Sep 17 00:00:00 2001
From: Jack Arturo
Date: Sun, 17 May 2026 23:25:05 -0700
Subject: [PATCH 1/2] fix(benchmarks): add publication verification bundle
---
.flake8 | 4 +
Makefile | 2 +-
README.md | 7 +-
automem/api/recall.py | 5 +-
benchmarks/EXPERIMENT_LOG.md | 28 ++--
.../publication/2026-05-arxiv/README.md | 59 +++++++
.../2026-05-arxiv/artifact-manifest.json | 145 ++++++++++++++++++
.../2026-05-arxiv/benchmark-summary.md | 69 +++++++++
.../publication/2026-05-arxiv/commands.md | 47 ++++++
.../2026-05-arxiv/fresh-verification.md | 66 ++++++++
docs/COMPARISON.md | 2 +-
docs/TESTING.md | 31 ++--
pyproject.toml | 30 ++++
tests/test_api_endpoints.py | 29 ++++
14 files changed, 491 insertions(+), 33 deletions(-)
create mode 100644 benchmarks/publication/2026-05-arxiv/README.md
create mode 100644 benchmarks/publication/2026-05-arxiv/artifact-manifest.json
create mode 100644 benchmarks/publication/2026-05-arxiv/benchmark-summary.md
create mode 100644 benchmarks/publication/2026-05-arxiv/commands.md
create mode 100644 benchmarks/publication/2026-05-arxiv/fresh-verification.md
create mode 100644 pyproject.toml
diff --git a/.flake8 b/.flake8
index 026c69ae..d6c7d5ab 100644
--- a/.flake8
+++ b/.flake8
@@ -3,7 +3,11 @@ max-line-length = 100
select = E9,F63,F7,F82
show_source = True
extend-exclude =
+ .bench-venv,
+ .worktrees,
.venv,
+ benchmarks/results,
+ benchmarks/snapshots,
venv,
node_modules,
packages/graph-viewer/node_modules,
diff --git a/Makefile b/Makefile
index 338f0e7c..7d141cd2 100644
--- a/Makefile
+++ b/Makefile
@@ -161,7 +161,7 @@ bench-compare-branch:
@scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo)
bench-health:
- @python3 scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
+ @$(VENV_BIN)/python scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
bench-snapshots:
@ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo"
diff --git a/README.md b/README.md
index 525ae180..3817a249 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
-
+
@@ -27,9 +27,9 @@ Your AI forgets between sessions. RAG dumps documents that look similar. Vector
AutoMem stores typed relationships *and* embeddings. When you ask "why did we choose PostgreSQL?", recall returns not just the matching memory — but the alternatives you considered, the principle behind the choice, and the related decisions that came after.
-It scores **89.27%** on the LoCoMo long-term memory benchmark (ACL 2024) judge-off, and **87.56%** judge-on. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology and history.
+Current canonical benchmark results are **87.00%** on LongMemEval full with **97.00% recall@5**, and **84.74%** on LoCoMo full. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology, judge policy, category breakdowns, and historical runs.
-Additional LongMemEval and BEAM validation is tracked in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); BEAM is currently reported as exploratory because published comparisons are not yet apples-to-apples.
+Exploratory BEAM validation is tracked separately in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); those numbers are diagnostic and not apples-to-apples product claims against published 1M/10M BEAM results.
## Should you use AutoMem?
@@ -223,6 +223,7 @@ _Screenshots will be added once the referenced in-repo image assets are availabl
- [Research foundation](docs/RESEARCH.md) — papers and how AutoMem implements them
- [Comparison](docs/COMPARISON.md) — vs. RAG, vector DBs, building your own
- [Benchmark history](benchmarks/EXPERIMENT_LOG.md) — LoCoMo, LongMemEval, and BEAM methodology and runs
+- [Publication bundle](benchmarks/publication/2026-05-arxiv/) — arXiv claim posture, reproducibility commands, and artifact manifest
**Operations**
- [Health monitoring & backups](docs/MONITORING_AND_BACKUPS.md)
diff --git a/automem/api/recall.py b/automem/api/recall.py
index cf1b4d8f..81c86210 100644
--- a/automem/api/recall.py
+++ b/automem/api/recall.py
@@ -2117,8 +2117,11 @@ def get_related_memories(memory_id: str) -> Any:
ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC
LIMIT $limit
"""
+ # FalkorDB does not accept parameters inside variable-length relationship ranges.
+ # max_depth is parsed and clamped above, so inlining it here is safe.
+ fallback_depth = max_depth
fallback_query = f"""
- MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + '*1..$max_depth]-' if rel_pattern else '-[r*1..$max_depth]-'}(related:Memory)
+ MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + f'*1..{fallback_depth}]-' if rel_pattern else f'-[r*1..{fallback_depth}]-'}(related:Memory)
WHERE m.id <> related.id
RETURN DISTINCT related
ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC
diff --git a/benchmarks/EXPERIMENT_LOG.md b/benchmarks/EXPERIMENT_LOG.md
index b97a3122..049b8631 100644
--- a/benchmarks/EXPERIMENT_LOG.md
+++ b/benchmarks/EXPERIMENT_LOG.md
@@ -21,9 +21,14 @@ Current headline results:
| Benchmark | Scope | Score | Retrieval | Notes |
|-----------|-------|-------|-----------|-------|
-| LongMemEval full | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | Canonical `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`. |
-| LongMemEval mini | 30 questions, stratified 5 per type | **60.0% (18/30)** | recall@5 **96.67% (29/30)** | Representative canary; do not compare to legacy prefix slices. |
-| LoCoMo full | 10 conversations, 1986 questions | **83.99% (1668/1986)** | -- | Latest recorded full judge-on run from #128; cat5 scored 92.83% with 0 skips. |
+| LongMemEval full | 500 questions | **87.00% (435/500)** | recall@5 **97.00% (485/500)** | Fresh publication verification run with `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`, harness `publishable=true`. |
+| LongMemEval mini | 30 questions, stratified 5 per type | **70.00% (21/30)** | recall@5 **96.67% (29/30)** | Representative canary from the May 2026 publication verification run; do not compare to legacy prefix slices. |
+| LoCoMo full | 10 conversations, 1986 questions | **84.74% (1683/1986)** | -- | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge; 444 judge calls, 0 skips/errors. |
+
+For arXiv and release-facing claims, use the curated publication bundle at
+[`benchmarks/publication/2026-05-arxiv/`](publication/2026-05-arxiv/). It
+separates canonical, exploratory, historical, and external-reported results so
+paper text does not accidentally over-claim from diagnostic runs.
Detailed experiment history:
@@ -48,9 +53,10 @@ Detailed experiment history:
| 2026-03-11 | #74 entity expansion | exp/74-entity-expansion-precision-v1 | 89.36% (+0.0) | -- | -- | -- | Hub-node detection. Zero delta — benchmark doesn't exercise graph expansion. → [postmortem](postmortems/2026-03-11_issue74_entity_expansion_precision.md) |
| 2026-03-12 | #79 (PR #125) | exp/79-priority-ids-fetch-v1 | 89.36% (+0.0) | -- | -- | -- | Bug fix: priority_ids now fetches by ID. Merged. → [postmortem](postmortems/2026-03-12_issue79_priority_ids_fetch.md) |
| 2026-04-23 | #128 | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | **85.53% (201/235)** | -- | -- | -- | Content keyword fallback + gentler adaptive floor. Improved **+3.40pp** vs the same-day baseline (`82.13%`, `193/235`) with no sampled question-level regressions across conv-26/conv-30. |
-| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. |
+| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes; category-5 judge model was `gpt-5.1`. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. |
| 2026-04-23 | #142 | fix/142-expansion-tag-filter | -- | 77.30% (-0.07) | -- | -- | Expansion tag-filter bypass. Effectively flat vs pre-fix `77.37%` — canonical configs don't exercise `expand_relations`. Validated via scoped repro + helper/API tests. |
-| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Representative stratified mini and full canonical run. Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. |
+| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Historical canonical milestone. Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. Superseded for publication claims by the 2026-05-17 verification run. |
+| 2026-05-17 | Publication verification | feat/automem-arxiv-publication | **85.20% (259/304)** | **84.74% (1683/1986)** | **70.00% (21/30)** | **87.00% (435/500)** | Fresh local publication reruns. LoCoMo full used pinned `gpt-5.4-mini-2026-03-17` judge, 444 judge calls, 0 skips/errors, estimated judge cost `$0.7909`, artifact `benchmarks/results/locomo_baseline_20260517_193934.json`, sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716`. LongMemEval full used `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge, recall@5 **97.00% (485/500)**, `memory_ingest_failures=0`, `judge_errors=0`, `publishable=true`, artifact `benchmarks/results/longmemeval-full-publication-20260518.json`, sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75`. |
### Category Breakdown (LoCoMo-mini)
@@ -76,20 +82,20 @@ Categories 1-4 are scored by word-overlap/date matching. Category 5 uses an opt-
### Category Breakdown (LongMemEval full)
-Canonical run: `benchmarks/results/longmemeval_full_gpt5mini_20260425_231308.json`.
+Canonical run: `benchmarks/results/longmemeval-full-publication-20260518.json`.
Answerer `gpt-5-mini`; judge `gpt-5.4-mini-2026-03-17`.
| Question type | Accuracy | Recall@5 |
|---------------|----------|----------|
| knowledge-update | 88.46% (69/78) | 100.00% (78/78) |
-| multi-session | 81.20% (108/133) | 98.50% (131/133) |
+| multi-session | 84.21% (112/133) | 97.74% (130/133) |
| single-session-assistant | 98.21% (55/56) | 100.00% (56/56) |
-| single-session-preference | 60.00% (18/30) | 90.00% (27/30) |
-| single-session-user | 91.43% (64/70) | 92.86% (65/70) |
+| single-session-preference | 56.67% (17/30) | 90.00% (27/30) |
+| single-session-user | 92.86% (65/70) | 92.86% (65/70) |
| temporal-reasoning | 87.97% (117/133) | 96.99% (129/133) |
-Failure split from the result-analysis helper: 58 wrong answers had the answer
-session retrieved at recall@5; 11 were retrieval misses. This is the basis for
+Failure split from the result-analysis helper: 54 wrong answers had the answer
+session retrieved at recall@5; 11 wrong answers were retrieval misses. This is the basis for
follow-up issues #158 and #159.
## Exploratory and Historical Benchmarks
diff --git a/benchmarks/publication/2026-05-arxiv/README.md b/benchmarks/publication/2026-05-arxiv/README.md
new file mode 100644
index 00000000..242af1ae
--- /dev/null
+++ b/benchmarks/publication/2026-05-arxiv/README.md
@@ -0,0 +1,59 @@
+# AutoMem arXiv Publication Bundle
+
+This bundle collects the repository-side material for the May 2026 AutoMem
+arXiv preprint effort. It is intentionally conservative: canonical claims come
+from this repository's official benchmark log, while exploratory and external
+numbers are labeled separately.
+
+## Claim Posture
+
+AutoMem should be described as an open-source, inspectable, MCP-first
+graph-vector memory service for AI agents with transparent benchmark harnesses
+and strong canonical LoCoMo / LongMemEval results.
+
+Do not claim "best memory system", "SOTA", or "beats Mem0" from this bundle.
+Those claims require apples-to-apples reruns against the current external
+systems, judge policies, dataset versions, and scale settings.
+
+## Canonical Results
+
+| Status | Benchmark | Scope | Score | Retrieval | Source |
+|---|---:|---:|---:|---:|---|
+| canonical | LongMemEval full | 500 questions | 87.00% (435/500) | recall@5 97.00% (485/500) | Fresh publication verification run; see `fresh-verification.md` |
+| representative canary | LongMemEval mini | 30 stratified questions | 70.00% (21/30) | recall@5 96.67% (29/30) | Fresh publication verification run; see `fresh-verification.md` |
+| canonical | LoCoMo full | 10 conversations, 1,986 questions | 84.74% (1683/1986) | not reported | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge |
+
+Canonical LongMemEval model policy:
+
+- Answerer: `gpt-5-mini`
+- Judge: `gpt-5.4-mini-2026-03-17`
+- Judge errors: `0`
+- Memory ingest failures: `0`
+- Harness publishable flag: `true`
+
+## Supplemental Signals
+
+| Status | Benchmark / Evidence | Scope | Result | Caveat |
+|---|---|---:|---:|---|
+| exploratory | BEAM 100K V1 raw-dialogue shim | 20 conversations, 400 questions | 76.25% (305/400), avg 0.677 | Not comparable to published BEAM 1M/10M claims. |
+| exploratory | BEAM 100K V2 fact-extraction shim | 20 conversations, 400 questions | 73.75% (295/400), avg 0.653 | Diagnostic failure-mode signal only. |
+| exploratory | Writ drift integration | 5 drift scenarios | 100% recall accuracy, 20% update fidelity, 0% drift rate | Lives in `automem-evals`; must remain labeled supplemental until promoted. |
+| exploratory | Claude Code hook replay | fixture suite | metrics harness only | Lives in `automem-evals`; workflow-continuity signal, not a memory benchmark. |
+| external reported | Mem0 managed platform | LoCoMo / LongMemEval / BEAM | see cited Mem0 docs | Proprietary managed-platform optimizations; not directly comparable. |
+| not yet run | BEAM official 1M/10M | official BEAM scale | -- | Required before any BEAM-competitive claim. |
+| not yet run | LongMemEval-V2 | web-agent memory | -- | Required before "experienced colleague" claims. |
+| not yet run | Memora / FAMA | invalidated-memory reuse | -- | Natural fit for `INVALIDATED_BY`/`CONTRADICTS`, but not run yet. |
+
+## Bundle Files
+
+- `benchmark-summary.md` - paper-ready benchmark and limitation summary.
+- `artifact-manifest.json` - machine-readable manifest for claims, generated-artifact paths, and sha256 hashes.
+- `commands.md` - verification and reproduction command inventory.
+- `fresh-verification.md` - latest local verification notes and generated artifact hashes.
+
+## Promotion Rule
+
+Results from `../automem-evals` may inform the paper only as supplemental
+evidence until a result is reproduced or explicitly summarized in this
+repository. Official benchmark claims remain owned by
+`benchmarks/EXPERIMENT_LOG.md`.
diff --git a/benchmarks/publication/2026-05-arxiv/artifact-manifest.json b/benchmarks/publication/2026-05-arxiv/artifact-manifest.json
new file mode 100644
index 00000000..fa603679
--- /dev/null
+++ b/benchmarks/publication/2026-05-arxiv/artifact-manifest.json
@@ -0,0 +1,145 @@
+{
+ "bundle": "automem-arxiv-2026-05",
+ "generated_at_utc": "2026-05-18T06:20:58Z",
+ "automem_git_sha_at_creation": "a742602f5d6ad2dea5a4d3c387d5b49d610afe2c",
+ "git_sha_note": "This records the base HEAD before publication-bundle edits; the PR commit SHA is supplied by GitHub after commit creation.",
+ "official_claim_source": "benchmarks/EXPERIMENT_LOG.md",
+ "judge_policy": "docs/BENCHMARK_JUDGE_POLICY.md",
+ "claims": [
+ {
+ "status": "canonical",
+ "benchmark": "LongMemEval",
+ "scope": "full",
+ "questions": 500,
+ "score": "87.00% (435/500)",
+ "retrieval": "recall@5 97.00% (485/500)",
+ "answer_model": "gpt-5-mini",
+ "judge_model": "gpt-5.4-mini-2026-03-17",
+ "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication verification run 2026-05-18 UTC",
+ "generated_artifact": {
+ "path": "benchmarks/results/longmemeval-full-publication-20260518.json",
+ "gitignored": true,
+ "sha256": "ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75"
+ },
+ "hypotheses_artifact": {
+ "path": "benchmarks/results/longmemeval-full-publication-20260518.jsonl",
+ "gitignored": true,
+ "sha256": "69cd9c8171d5caec8661b1d8c2b27579decc40e2f134ffde74fcc93e70e2e7ce"
+ },
+ "memory_ingest_failures": 0,
+ "judge_errors": 0,
+ "publishable": true,
+ "elapsed_seconds": 10021.779591798782,
+ "category_breakdown": {
+ "knowledge_update": "88.46% (69/78)",
+ "multi_session": "84.21% (112/133)",
+ "single_session_assistant": "98.21% (55/56)",
+ "single_session_preference": "56.67% (17/30)",
+ "single_session_user": "92.86% (65/70)",
+ "temporal_reasoning": "87.97% (117/133)"
+ },
+ "failure_split": "65 wrong total; 54 wrong had answer session retrieved at recall@5; 11 wrong were retrieval misses; 4 correct answers were retrieval misses."
+ },
+ {
+ "status": "representative_canary",
+ "benchmark": "LongMemEval",
+ "scope": "mini stratified",
+ "questions": 30,
+ "score": "70.00% (21/30)",
+ "retrieval": "recall@5 96.67% (29/30)",
+ "answer_model": "gpt-5-mini",
+ "judge_model": "script LLM eval",
+ "source": "local run 2026-05-17; see fresh-verification.md",
+ "generated_artifact": {
+ "path": "benchmarks/results/longmemeval-mini-publication-20260517.json",
+ "gitignored": true,
+ "sha256": "7ea922b77e312a17c313bbf8c0e81f0268b48d1082080cae1db3c38e906577b8"
+ }
+ },
+ {
+ "status": "canonical",
+ "benchmark": "LoCoMo",
+ "scope": "full",
+ "questions": 1986,
+ "score": "84.74% (1683/1986)",
+ "retrieval": null,
+ "answer_model": null,
+ "judge_model": "gpt-5.4-mini-2026-03-17",
+ "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication verification run 2026-05-17",
+ "generated_artifact": {
+ "path": "benchmarks/results/locomo_baseline_20260517_193934.json",
+ "gitignored": true,
+ "sha256": "a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716"
+ },
+ "judge_calls": 444,
+ "judge_errors": 0,
+ "judge_skips": 0,
+ "estimated_cost_usd": 0.790877,
+ "category_breakdown": {
+ "single_hop": "52.13% (147/282)",
+ "temporal": "86.60% (278/321)",
+ "multi_hop": "46.88% (45/96)",
+ "open_domain": "93.58% (787/841)",
+ "complex": "95.52% (426/446)"
+ }
+ },
+ {
+ "status": "fresh_verification",
+ "benchmark": "LoCoMo",
+ "scope": "mini",
+ "questions": 304,
+ "score": "85.20% (259/304)",
+ "retrieval": null,
+ "answer_model": null,
+ "judge_model": "gpt-5.4-mini-2026-03-17",
+ "source": "local run 2026-05-17; see fresh-verification.md",
+ "generated_artifact": {
+ "path": "benchmarks/results/locomo-mini_baseline_20260517_182318.json",
+ "gitignored": true,
+ "sha256": "ba2b98b0055f92ca17de9bc36207d7f39cf90b6270c2c3d903d69b8044aa7015"
+ }
+ },
+ {
+ "status": "exploratory",
+ "benchmark": "BEAM",
+ "scope": "100K V1 raw-dialogue shim",
+ "questions": 400,
+ "score": "76.25% (305/400), avg 0.677",
+ "retrieval": "top-k 200",
+ "source": "benchmarks/EXPERIMENT_LOG.md"
+ },
+ {
+ "status": "exploratory",
+ "benchmark": "BEAM",
+ "scope": "100K V2 fact-extraction shim",
+ "questions": 400,
+ "score": "73.75% (295/400), avg 0.653",
+ "retrieval": "top-k 200",
+ "source": "benchmarks/EXPERIMENT_LOG.md"
+ },
+ {
+ "status": "exploratory",
+ "benchmark": "Writ",
+ "scope": "drift category, 5 scenarios",
+ "questions": 5,
+ "score": "100.0% recall_accuracy; 20.0% update_fidelity; 0.0% drift_rate",
+ "retrieval": null,
+ "source": "../automem-evals/docs/writ_integration.md"
+ },
+ {
+ "status": "exploratory",
+ "benchmark": "Claude Code hook replay",
+ "scope": "fixture and metrics harness",
+ "questions": null,
+ "score": "harness tests only; no publication score",
+ "retrieval": null,
+ "source": "../automem-evals/docs/session_2026-04-28_hook_replay.md"
+ }
+ ],
+ "not_yet_run": [
+ "BEAM official 1M/10M",
+ "LongMemEval-V2",
+ "Memora/FAMA",
+ "Mem0 managed-platform apples-to-apples comparison"
+ ]
+}
diff --git a/benchmarks/publication/2026-05-arxiv/benchmark-summary.md b/benchmarks/publication/2026-05-arxiv/benchmark-summary.md
new file mode 100644
index 00000000..f3b4a74d
--- /dev/null
+++ b/benchmarks/publication/2026-05-arxiv/benchmark-summary.md
@@ -0,0 +1,69 @@
+# Benchmark Summary For Paper Draft
+
+## Recommended Headline
+
+AutoMem is an open-source graph-vector memory service for AI agents that
+publishes transparent benchmark harnesses and current canonical results of
+87.00% on LongMemEval full and 84.74% on LoCoMo full.
+
+This is a reproducibility and systems claim, not a state-of-the-art claim.
+
+## Canonical Results
+
+The official source of truth is `benchmarks/EXPERIMENT_LOG.md`.
+
+| Benchmark | Scope | Status | Score | Retrieval | Models / Judge |
+|---|---:|---|---:|---:|---|
+| LongMemEval full | 500 questions | canonical | 87.00% (435/500) | recall@5 97.00% (485/500) | `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge |
+| LongMemEval mini | 30 stratified questions | representative canary | 70.00% (21/30) | recall@5 96.67% (29/30) | `gpt-5-mini` answerer, script LLM eval |
+| LoCoMo full | 10 conversations, 1,986 questions | canonical | 84.74% (1683/1986) | not reported | Pinned `gpt-5.4-mini-2026-03-17` judge, 444 judge calls, 0 skips/errors |
+
+LongMemEval failure split: 54 wrong answers had the answer session retrieved at
+recall@5, while 11 wrong answers were retrieval misses. This supports a paper discussion that
+future improvements are likely in answer synthesis, memory representation, and
+preference handling, not only first-stage retrieval.
+
+## Historical / Exploratory Context
+
+Older LoCoMo mini/full values, including the 89.27% judge-off mini and 87.56%
+March full judge-on run, remain useful trend anchors but should not be used as
+current headline claims.
+
+BEAM 100K shim results, Writ drift runs, and hook replay metrics came from
+`automem-evals` and are explicitly diagnostic. They are not comparable to
+published BEAM 1M/10M numbers or production memory benchmarks because the
+scale, adapter, extraction policy, and judge setup differ.
+
+## External Comparisons
+
+The paper may cite external reported numbers from Mem0, Zep/Graphiti, Letta,
+A-MEM, BEAM, LongMemEval-V2, and Memora/FAMA, but those rows must be labeled
+`external reported` unless rerun through an AutoMem-controlled harness.
+
+For any comparison table, include:
+
+- system and version/date
+- open-source vs managed
+- benchmark and dataset version/hash
+- scope and question count
+- ingest/extraction protocol
+- retrieval method
+- answer model and judge/evaluator
+- token/context budget
+- latency/cost if available
+- score and recall@k
+- artifact URL or repro command
+- claim status
+
+## Limitations To State
+
+- No current SOTA claim.
+- External systems use different extraction policies, judges, hosted services,
+ and token budgets.
+- BEAM 1M/10M, LongMemEval-V2, and Memora/FAMA have not yet been run as
+ canonical AutoMem benchmarks.
+- LoCoMo and LongMemEval primarily test recall and answer synthesis; they do
+ not fully measure write precision, forgetting, privacy boundaries, or
+ long-running coding-agent workflows.
+- Some detailed JSON result artifacts are local/generated and gitignored; the
+ committed experiment log is the current durable source.
diff --git a/benchmarks/publication/2026-05-arxiv/commands.md b/benchmarks/publication/2026-05-arxiv/commands.md
new file mode 100644
index 00000000..1e4a2c30
--- /dev/null
+++ b/benchmarks/publication/2026-05-arxiv/commands.md
@@ -0,0 +1,47 @@
+# Publication Verification Commands
+
+Run these from the AutoMem repository root (`/Users/jgarturo/Projects/OpenAI/automem` on the verification machine) unless noted.
+
+## Repository Checks
+
+```bash
+make test
+.venv/bin/black --check .
+.venv/bin/isort --check-only .
+make lint
+make test-integration
+make bench-health
+```
+
+## Canonical Benchmarks
+
+```bash
+BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo-mini CONFIG=baseline
+BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo CONFIG=baseline
+./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --per-type 5 --output benchmarks/results/longmemeval-mini-publication
+./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --eval-llm-model gpt-5.4-mini-2026-03-17 --output benchmarks/results/longmemeval-full-publication
+```
+
+The current 84.74% LoCoMo full result in `benchmarks/EXPERIMENT_LOG.md` was
+produced by the `BENCH=locomo` command above with the judge pinned to
+`gpt-5.4-mini-2026-03-17`.
+
+The LongMemEval full run is expensive and long-running. Use `--resume` with the
+same `--output` base if interrupted.
+
+## Supplemental Evals
+
+From the sibling `automem-evals` checkout (`/Users/jgarturo/Projects/OpenAI/automem-evals` on the verification machine):
+
+```bash
+python3 -m unittest discover -s runners -p 'test_*.py'
+python3 -m unittest discover -s scripts -p 'test_*.py'
+python3 scripts/seed_from_snapshot.py
+python3 scripts/seed_associations.py
+python3 runners/compare_rulesets.py --rulesets baseline_v1 bare_tag_1m_v2
+python3 scripts/beam_shim_smoke.py --self-spawn
+python3 runners/run_writ.py --compare automem baseline --scenarios drift
+```
+
+Supplemental outputs are not canonical publication claims until they are
+explicitly summarized in this repository or reproduced by a canonical harness.
diff --git a/benchmarks/publication/2026-05-arxiv/fresh-verification.md b/benchmarks/publication/2026-05-arxiv/fresh-verification.md
new file mode 100644
index 00000000..dddfa799
--- /dev/null
+++ b/benchmarks/publication/2026-05-arxiv/fresh-verification.md
@@ -0,0 +1,66 @@
+# Fresh Verification Notes
+
+Local verification run date: 2026-05-17 / 2026-05-18 UTC.
+
+## Repository Checks
+
+| Check | Result | Notes |
+|---|---|---|
+| `make test` | pass | 238 passed, 1 skipped, 25 deselected |
+| `.venv/bin/black --check .` | pass | Added `pyproject.toml` with the repo's documented 100-column Black configuration and generated-directory excludes. |
+| `.venv/bin/isort --check-only .` | pass | Added `pyproject.toml` isort config; excludes virtualenvs, snapshots, worktrees, showcase output, and vendored LoCoMo source. |
+| `make lint` | pass | Required `.flake8` exclude update for generated/env/worktree directories. |
+| `make test-integration` | pass | 11 passed, 253 deselected after rebuilding the Docker API image. |
+| `make bench-health` | pass | Required Makefile fix to use the venv Python; health reported `HEALTHY`. |
+
+## Benchmark Reruns
+
+| Benchmark | Command | Result | Generated artifact |
+|---|---|---|---|
+| LoCoMo mini | `BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo-mini CONFIG=baseline` | 85.20% (259/304) | `benchmarks/results/locomo-mini_baseline_20260517_182318.json`, sha256 `ba2b98b0055f92ca17de9bc36207d7f39cf90b6270c2c3d903d69b8044aa7015` |
+| LoCoMo full | `BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo CONFIG=baseline` | 84.74% (1683/1986) | `benchmarks/results/locomo_baseline_20260517_193934.json`, sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716` |
+| LongMemEval mini | `./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --per-type 5 --output benchmarks/results/longmemeval-mini-publication-20260517` | 70.00% (21/30), recall@5 96.67% (29/30) | `benchmarks/results/longmemeval-mini-publication-20260517.json`, sha256 `7ea922b77e312a17c313bbf8c0e81f0268b48d1082080cae1db3c38e906577b8` |
+| LongMemEval full | `./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --eval-llm-model gpt-5.4-mini-2026-03-17 --output benchmarks/results/longmemeval-full-publication-20260518` | 87.00% (435/500), recall@5 97.00% (485/500) | `benchmarks/results/longmemeval-full-publication-20260518.json`, sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75` |
+
+The official LongMemEval full claim is now the fresh publication verification
+artifact:
+
+- `benchmarks/results/longmemeval-full-publication-20260518.json`
+- sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75`
+- 87.00% (435/500), recall@5 97.00% (485/500)
+- `memory_ingest_failures=0`, `judge_errors=0`, `publishable=true`
+
+Console caveat: the full run logged transient `gpt-5-mini` empty-answer
+warnings and one local recall read timeout, but the harness completed and marked
+the aggregate artifact publishable. Treat those warnings as answerer/service
+stability notes rather than hidden failures.
+
+The official LoCoMo full claim is now the fresh pinned-judge publication
+verification artifact:
+
+- `benchmarks/results/locomo_baseline_20260517_193934.json`
+- sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716`
+- 84.74% (1683/1986)
+- Judge: `gpt-5.4-mini-2026-03-17`, 444 calls, 0 skips/errors
+
+## Supplemental Eval Repo Checks
+
+From `/Users/jgarturo/Projects/OpenAI/automem-evals`:
+
+| Check | Result |
+|---|---|
+| `python3 -m unittest discover -s runners -p 'test_*.py'` | pass, 95 tests |
+| `python3 -m unittest discover -s scripts -p 'test_*.py'` | pass, 10 tests |
+| `npm test` in `../automem-evals/third_party/writ` | pass, 72 tests |
+| `npm run build` in `../automem-evals/third_party/writ` | pass |
+
+Writ drift evidence remains exploratory and lives in
+`../automem-evals/docs/writ_integration.md`: AutoMem recall accuracy 100.0%,
+update fidelity 20.0%, drift rate 0.0% across 5 drift scenarios.
+
+## Paper Checks
+
+The paper source in `/Users/jgarturo/Projects/OpenAI/automem-paper` passed
+static checks for input-file existence and BibTeX cite-key resolution. No local
+LaTeX compiler (`pdflatex`, `latexmk`, `tectonic`, or `pandoc`) was available,
+so no PDF compilation is claimed.
diff --git a/docs/COMPARISON.md b/docs/COMPARISON.md
index 42854e7c..e7afaa39 100644
--- a/docs/COMPARISON.md
+++ b/docs/COMPARISON.md
@@ -84,7 +84,7 @@ When building your own is the right answer:
| **Confidence** | 0.05 | The memory's `confidence` score (0–1) |
| **Relevance** | 0.00 | Consolidation decay relevance — disabled by default |
-These defaults reflect the current LoCoMo baseline (89.27% judge-off, 87.56% judge-on). For a query like `GET /recall?query=database+migration&tags=decision&time_query=last+month`, the temporal-alignment and tag components dominate; for `GET /recall?query=why+postgres&expand_relations=true`, the relation component does.
+These defaults are the configuration behind the current canonical benchmark results: LongMemEval full at 87.00% with 97.00% recall@5, and LoCoMo full at 84.74%. For a query like `GET /recall?query=database+migration&tags=decision&time_query=last+month`, the temporal-alignment and tag components dominate; for `GET /recall?query=why+postgres&expand_relations=true`, the relation component does.
The Recall Quality Lab (`scripts/lab/`) lets you sweep any weight and A/B-compare configs against snapshots of production data without touching the service.
diff --git a/docs/TESTING.md b/docs/TESTING.md
index b4315b49..b89c0a7b 100644
--- a/docs/TESTING.md
+++ b/docs/TESTING.md
@@ -262,12 +262,14 @@ LoCoMo evaluates AI systems' ability to remember and reason across very long con
Historical note: older public LoCoMo references such as CORE's **88.24%** are still useful background context, but they are not AutoMem's primary comparison target because the public setups are not perfectly apples-to-apples, especially around category-5 handling.
-AutoMem currently publishes two LoCoMo baselines:
+AutoMem currently publishes the following LoCoMo baselines:
| Setup | Scope | Score | Notes |
|------|-------|-------|-------|
-| `locomo-mini`, judge off | 2 conversations, categories 1-4 only | **89.27% (208/233)** | 71 category-5 questions skipped |
-| `locomo`, judge on (`gpt-4o`) | Full 10 conversations | **87.56% (1739/1986)** | Category 5 scored at 95.74% (427/446) |
+| `locomo`, judge on | Full 10 conversations | **84.74% (1683/1986)** | Current canonical full run from the May 2026 publication verification; category 5 used pinned `gpt-5.4-mini-2026-03-17` and scored **95.52% (426/446)** with 0 skips/errors. |
+| `locomo`, judge on | Full 10 conversations | **83.99% (1668/1986)** | Historical #128 full run; category 5 used `gpt-5.1` and scored **92.83% (414/446)** with 0 skips. |
+| `locomo-mini`, judge off | 2 conversations, categories 1-4 only | **89.27% (208/233)** | Historical mini anchor after evaluator fixes; not the current headline full-run claim. |
+| `locomo`, judge on (`gpt-4o`) | Full 10 conversations | **87.56% (1739/1986)** | Historical March 2026 run; kept for trend context only. |
### Running the Benchmark
@@ -304,20 +306,16 @@ Memory usage:
Example benchmark output:
```text
📊 FINAL RESULTS
-🎯 Overall Accuracy: 87.56% (1739/1986)
-⏱️ Total Time: 3497s
+🎯 Overall Accuracy: 84.74% (1683/1986)
+⏱️ Total Time: 3164s
💾 Total Memories Stored: 5882
-📈 Category Breakdown:
- Single-hop Recall : 66.31% (187/282)
- Temporal Understanding : 87.23% (280/321)
- Multi-hop Reasoning : 45.83% ( 44/ 96)
- Open Domain : 95.24% (801/841)
- Complex Reasoning : 95.74% (427/446)
+📈 Category Breakdown excerpt:
+ Complex Reasoning : 95.52% (426/446)
```
-If you run without the judge, category 5 will show as `N/A`.
+See `benchmarks/EXPERIMENT_LOG.md` for the full per-category table. If you run without the judge, category 5 will show as `N/A`.
Current baselines and methodology notes live in `benchmarks/EXPERIMENT_LOG.md`.
@@ -343,8 +341,9 @@ Current LongMemEval results:
| Setup | Scope | Score | Retrieval | Notes |
|------|-------|-------|-----------|-------|
-| `longmemeval-mini` representative | 30 questions, stratified 5 per question type | **60.0% (18/30)** | recall@5 **96.67% (29/30)** | Canonical run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`. |
-| `longmemeval` full canonical | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | Canonical run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`. |
+| `longmemeval-mini` representative | 30 questions, stratified 5 per question type | **70.00% (21/30)** | recall@5 **96.67% (29/30)** | Fresh publication verification run: `gpt-5-mini` answerer, script LLM eval, 2 empty-answer warnings. |
+| `longmemeval` full canonical | 500 questions | **87.00% (435/500)** | recall@5 **97.00% (485/500)** | Fresh publication verification run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`, `publishable=true`. |
+| `longmemeval` full historical | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | April 2026 canonical milestone with the same answerer and judge policy; kept for trend context. |
| `longmemeval` partial legacy prefix (50q) | 50 questions, single-session-user type | **82.0% (41/50)** | recall@5 **92.0% (46/50)** | Provisional prefix run with legacy `gpt-4o` answerer; recall_limit=10, no entity/relation expansion. Not reproduced by the current stratified `bench-mini-longmemeval` target and not directly comparable to the older 35.6% / 500-question setup. |
For future published LongMemEval results, use the pinned judge policy in [`docs/BENCHMARK_JUDGE_POLICY.md`](BENCHMARK_JUDGE_POLICY.md) so runs remain comparable over time. The primary answerer is `gpt-5-mini`; `gpt-4o` is legacy continuity only and should be labeled as such. Result metadata distinguishes the answer model (`answerer_model` / `llm_model`) from the judge model (`judge_model`, currently `gpt-5.4-mini-2026-03-17` when `--llm-eval` is enabled).
@@ -385,8 +384,8 @@ AutoMem is expected to perform well due to:
- `DERIVED_FROM`, `PART_OF`
2. **Hybrid Search**: Vector + keyword + tags + importance + time
- - Better than pure semantic search
- - More reliable than vector-only systems
+ - Designed to recover both semantic matches and explicit structured context
+ - Useful for testing graph/vector tradeoffs against snapshot-based evals
3. **Background Intelligence**:
- Entity extraction for structured queries
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..3c038407
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,30 @@
+[tool.black]
+line-length = 100
+extend-exclude = '''
+/(
+ \.bench-venv
+ | \.worktrees
+ | benchmarks/results
+ | benchmarks/snapshots
+ | node_modules
+ | packages/graph-viewer/node_modules
+ | showcase
+)/
+'''
+
+[tool.isort]
+profile = "black"
+line_length = 100
+skip_gitignore = true
+extend_skip = [  # extend (not replace) isort's built-in skip list (.git, .tox, build, dist, ...)
+ ".bench-venv",
+ ".venv",
+ ".worktrees",
+ "benchmarks/results",
+ "benchmarks/snapshots",
+ "node_modules",
+ "packages/graph-viewer/node_modules",
+ "showcase",
+ "tests/benchmarks/locomo",
+ "venv",
+]
diff --git a/tests/test_api_endpoints.py b/tests/test_api_endpoints.py
index 2e7e5132..d82e5552 100644
--- a/tests/test_api_endpoints.py
+++ b/tests/test_api_endpoints.py
@@ -1800,6 +1800,35 @@ def query(self, query: str, params: dict[str, Any] | None = None) -> SimpleNames
assert "PARALLEL_CONTEXT" in query
+def test_related_memories_fallback_inlines_sanitized_depth(client, mock_state, auth_headers):
+ class Graph:  # stub graph that records every (query, params) pair it receives
+ def __init__(self) -> None:
+ self.calls: list[tuple[str, dict[str, Any]]] = []
+
+ def query(self, query: str, params: dict[str, Any] | None = None) -> SimpleNamespace:
+ self.calls.append((query, params or {}))
+ if len(self.calls) == 1:  # fail the first query so the endpoint has to fall back
+ raise RuntimeError("apoc unavailable")
+ if "$max_depth" in query:  # a fallback that still binds $max_depth must fail too
+ raise RuntimeError("FalkorDB rejects parameterized variable-length ranges")
+ return SimpleNamespace(result_set=[])
+
+ graph = Graph()
+ mock_state.memory_graph = graph
+
+ response = client.get(
+ "/memories/aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa/related?max_depth=2",
+ headers=auth_headers,
+ )
+
+ assert response.status_code == 200
+ assert len(graph.calls) == 2  # one failed first attempt plus one fallback attempt
+ fallback_query, fallback_params = graph.calls[1]
+ assert "*1..2" in fallback_query
+ assert "$max_depth" not in fallback_query
+ assert fallback_params["max_depth"] == 2
+
+
# ==================== Test Rate Limiting (if implemented) ====================
From f13fe9ef273488496d246ab3c984050b5c1170a8 Mon Sep 17 00:00:00 2001
From: Jack Arturo
Date: Fri, 22 May 2026 09:31:20 +0200
Subject: [PATCH 2/2] fix(publication): address copilot review on PR #166
---
Makefile | 2 +-
automem/api/recall.py | 3 ++-
benchmarks/publication/2026-05-arxiv/commands.md | 4 ++--
.../publication/2026-05-arxiv/fresh-verification.md | 11 ++++++-----
docs/TESTING.md | 4 ++--
pyproject.toml | 2 ++
tests/test_api_endpoints.py | 4 +++-
7 files changed, 18 insertions(+), 12 deletions(-)
diff --git a/Makefile b/Makefile
index 7d141cd2..1e28ee58 100644
--- a/Makefile
+++ b/Makefile
@@ -161,7 +161,7 @@ bench-compare-branch:
@scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo)
bench-health:
- @$(VENV_BIN)/python scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
+ @$(or $(VENV_BIN),.venv/bin)/python scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
bench-snapshots:
@ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo"
diff --git a/automem/api/recall.py b/automem/api/recall.py
index 81c86210..2cd572f8 100644
--- a/automem/api/recall.py
+++ b/automem/api/recall.py
@@ -2128,11 +2128,12 @@ def get_related_memories(memory_id: str) -> Any:
LIMIT $limit
"""
params = {"id": memory_id, "max_depth": max_depth, "limit": limit}
+ fallback_params = {"id": memory_id, "limit": limit}
try:
result = graph.query(query, params)
except Exception:
try:
- result = graph.query(fallback_query, params)
+ result = graph.query(fallback_query, fallback_params)
except Exception:
logger.exception("Failed to traverse related memories for %s", memory_id)
abort(500, description="Failed to fetch related memories")
diff --git a/benchmarks/publication/2026-05-arxiv/commands.md b/benchmarks/publication/2026-05-arxiv/commands.md
index 1e4a2c30..def32bab 100644
--- a/benchmarks/publication/2026-05-arxiv/commands.md
+++ b/benchmarks/publication/2026-05-arxiv/commands.md
@@ -1,6 +1,6 @@
# Publication Verification Commands
-Run these from `/Users/jgarturo/Projects/OpenAI/automem` unless noted.
+Run these from the repository root unless noted.
## Repository Checks
@@ -31,7 +31,7 @@ same `--output` base if interrupted.
## Supplemental Evals
-From `/Users/jgarturo/Projects/OpenAI/automem-evals`:
+From a clone of the `automem-evals` repository:
```bash
python3 -m unittest discover -s runners -p 'test_*.py'
diff --git a/benchmarks/publication/2026-05-arxiv/fresh-verification.md b/benchmarks/publication/2026-05-arxiv/fresh-verification.md
index dddfa799..2917c8fa 100644
--- a/benchmarks/publication/2026-05-arxiv/fresh-verification.md
+++ b/benchmarks/publication/2026-05-arxiv/fresh-verification.md
@@ -45,7 +45,8 @@ verification artifact:
## Supplemental Eval Repo Checks
-From `/Users/jgarturo/Projects/OpenAI/automem-evals`:
+From a clone of the `automem-evals` repository. The local verification used a
+sibling checkout referenced as `../automem-evals` below.
| Check | Result |
|---|---|
@@ -60,7 +61,7 @@ update fidelity 20.0%, drift rate 0.0% across 5 drift scenarios.
## Paper Checks
-The paper source in `/Users/jgarturo/Projects/OpenAI/automem-paper` passed
-static checks for input-file existence and BibTeX cite-key resolution. No local
-LaTeX compiler (`pdflatex`, `latexmk`, `tectonic`, or `pandoc`) was available,
-so no PDF compilation is claimed.
+The separate AutoMem paper source checkout passed static checks for input-file
+existence and BibTeX cite-key resolution. No local LaTeX compiler (`pdflatex`,
+`latexmk`, `tectonic`, or `pandoc`) was available, so no PDF compilation is
+claimed.
diff --git a/docs/TESTING.md b/docs/TESTING.md
index b89c0a7b..62ea6e3b 100644
--- a/docs/TESTING.md
+++ b/docs/TESTING.md
@@ -266,8 +266,8 @@ AutoMem currently publishes the following LoCoMo baselines:
| Setup | Scope | Score | Notes |
|------|-------|-------|-------|
-| `locomo`, judge on | Full 10 conversations | **84.74% (1683/1986)** | Current canonical full run from the May 2026 publication verification; category 5 used pinned `gpt-5.4-mini-2026-03-17` and scored **95.52% (426/446)** with 0 skips/errors. |
-| `locomo`, judge on | Full 10 conversations | **83.99% (1668/1986)** | Historical #128 full run; category 5 used `gpt-5.1` and scored **92.83% (414/446)** with 0 skips. |
+| `locomo`, canonical judge on (`gpt-5.4-mini-2026-03-17`) | Full 10 conversations | **84.74% (1683/1986)** | Current canonical full run from the May 2026 publication verification; category 5 scored **95.52% (426/446)** with 0 skips/errors. |
+| `locomo`, historical judge on (`gpt-5.1`) | Full 10 conversations | **83.99% (1668/1986)** | Historical #128 full run; category 5 scored **92.83% (414/446)** with 0 skips. |
| `locomo-mini`, judge off | 2 conversations, categories 1-4 only | **89.27% (208/233)** | Historical mini anchor after evaluator fixes; not the current headline full-run claim. |
| `locomo`, judge on (`gpt-4o`) | Full 10 conversations | **87.56% (1739/1986)** | Historical March 2026 run; kept for trend context only. |
diff --git a/pyproject.toml b/pyproject.toml
index 3c038407..e9c21764 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,12 +3,14 @@ line-length = 100
extend-exclude = '''
/(
\.bench-venv
+ | \.venv
| \.worktrees
| benchmarks/results
| benchmarks/snapshots
| node_modules
| packages/graph-viewer/node_modules
| showcase
+ | venv
)/
'''
diff --git a/tests/test_api_endpoints.py b/tests/test_api_endpoints.py
index d82e5552..aa7449fd 100644
--- a/tests/test_api_endpoints.py
+++ b/tests/test_api_endpoints.py
@@ -1826,7 +1826,9 @@ def query(self, query: str, params: dict[str, Any] | None = None) -> SimpleNames
fallback_query, fallback_params = graph.calls[1]
assert "*1..2" in fallback_query
assert "$max_depth" not in fallback_query
- assert fallback_params["max_depth"] == 2
+ assert "max_depth" not in fallback_params
+ assert fallback_params["id"] == "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
+ assert fallback_params["limit"] == 5
# ==================== Test Rate Limiting (if implemented) ====================