verygoodplugins · jack-arturo · May 22, 2026 · May 18, 2026 · May 22, 2026
diff --git a/.flake8 b/.flake8
@@ -3,7 +3,11 @@ max-line-length = 100
 select = E9,F63,F7,F82
 show_source = True
 extend-exclude =
+  .bench-venv,
+  .worktrees,
   .venv,
+  benchmarks/results,
+  benchmarks/snapshots,
   venv,
   node_modules,
   packages/graph-viewer/node_modules,

diff --git a/Makefile b/Makefile
@@ -161,7 +161,7 @@ bench-compare-branch:
 	@scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo)
 
 bench-health:
-	@python3 scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
+	@$(VENV_BIN)/python scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001)
 
 bench-snapshots:
 	@ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo"

diff --git a/README.md b/README.md
@@ -7,7 +7,7 @@
   <a href="LICENSE"><img src="https://img.shields.io/github/license/verygoodplugins/automem" alt="License" /></a>
   <a href="https://automem.ai/discord"><img src="https://img.shields.io/badge/Discord-Join-5865F2?logo=discord&logoColor=white" alt="Discord" /></a>
   <a href="https://x.com/automem_ai"><img src="https://img.shields.io/badge/X-@automem__ai-000000?logo=x&logoColor=white" alt="X" /></a>
-  <a href="benchmarks/EXPERIMENT_LOG.md"><img src="https://img.shields.io/badge/LoCoMo-89.27%25-success" alt="LoCoMo benchmark" /></a>
+  <a href="benchmarks/EXPERIMENT_LOG.md"><img src="https://img.shields.io/badge/LongMemEval-87.00%25-success" alt="LongMemEval benchmark" /></a>
   <a href="https://railway.com/deploy/automem-ai-memory-service?referralCode=VuFE6g&utm_medium=integration&utm_source=github&utm_campaign=generic"><img src="https://img.shields.io/badge/Deploy%20on-Railway-0B0D0E?logo=railway&logoColor=white" alt="Deploy on Railway" /></a>
 </p>
 
@@ -27,9 +27,9 @@ Your AI forgets between sessions. RAG dumps documents that look similar. Vector
 
 AutoMem stores typed relationships *and* embeddings. When you ask "why did we choose PostgreSQL?", recall returns not just the matching memory — but the alternatives you considered, the principle behind the choice, and the related decisions that came after.
 
-It scores **89.27%** on the LoCoMo long-term memory benchmark (ACL 2024) judge-off, and **87.56%** judge-on. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology and history.
+Current canonical benchmark results are **87.00%** accuracy on the full LongMemEval set (with **97.00% recall@5**) and **84.74%** on the full LoCoMo set. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology, judge policy, category breakdowns, and historical runs.
 
-Additional LongMemEval and BEAM validation is tracked in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); BEAM is currently reported as exploratory because published comparisons are not yet apples-to-apples.
+Exploratory BEAM validation is tracked separately in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); those numbers are diagnostic only and must not be read as product claims, because they are not an apples-to-apples comparison with published 1M/10M BEAM results.
 
 ## Should you use AutoMem?
 
@@ -223,6 +223,7 @@ _Screenshots will be added once the referenced in-repo image assets are availabl
 - [Research foundation](docs/RESEARCH.md) — papers and how AutoMem implements them
 - [Comparison](docs/COMPARISON.md) — vs. RAG, vector DBs, building your own
 - [Benchmark history](benchmarks/EXPERIMENT_LOG.md) — LoCoMo, LongMemEval, and BEAM methodology and runs
+- [Publication bundle](benchmarks/publication/2026-05-arxiv/) — arXiv claim posture, reproducibility commands, and artifact manifest
 
 **Operations**
 - [Health monitoring & backups](docs/MONITORING_AND_BACKUPS.md)

diff --git a/automem/api/recall.py b/automem/api/recall.py
@@ -2117,8 +2117,11 @@ def get_related_memories(memory_id: str) -> Any:
             ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC
             LIMIT $limit
         """
+        # FalkorDB does not accept parameters inside variable-length relationship ranges.
+        # max_depth is parsed and clamped above, so inlining it here is safe.
+        fallback_depth = max_depth
         fallback_query = f"""
-            MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + '*1..$max_depth]-' if rel_pattern else '-[r*1..$max_depth]-'}(related:Memory)
+            MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + f'*1..{fallback_depth}]-' if rel_pattern else f'-[r*1..{fallback_depth}]-'}(related:Memory)
             WHERE m.id <> related.id
             RETURN DISTINCT related
             ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC

diff --git a/benchmarks/EXPERIMENT_LOG.md b/benchmarks/EXPERIMENT_LOG.md
@@ -21,9 +21,14 @@ Current headline results:
 
 | Benchmark | Scope | Score | Retrieval | Notes |
 |-----------|-------|-------|-----------|-------|
-| LongMemEval full | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | Canonical `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`. |
-| LongMemEval mini | 30 questions, stratified 5 per type | **60.0% (18/30)** | recall@5 **96.67% (29/30)** | Representative canary; do not compare to legacy prefix slices. |
-| LoCoMo full | 10 conversations, 1986 questions | **83.99% (1668/1986)** | -- | Latest recorded full judge-on run from #128; cat5 scored 92.83% with 0 skips. |
+| LongMemEval full | 500 questions | **87.00% (435/500)** | recall@5 **97.00% (485/500)** | Fresh publication verification run with `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`, harness `publishable=true`. |
+| LongMemEval mini | 30 questions, stratified 5 per type | **70.00% (21/30)** | recall@5 **96.67% (29/30)** | Representative canary from the May 2026 publication verification run; do not compare to legacy prefix slices. |
+| LoCoMo full | 10 conversations, 1986 questions | **84.74% (1683/1986)** | -- | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge; 444 judge calls, 0 skips/errors. |
+
+For arXiv and release-facing claims, use the curated publication bundle at
+[`benchmarks/publication/2026-05-arxiv/`](publication/2026-05-arxiv/). It
+separates canonical, exploratory, historical, and external-reported results so
+paper text does not accidentally over-claim from diagnostic runs.
 
 Detailed experiment history:
 
@@ -48,9 +53,10 @@ Detailed experiment history:
 | 2026-03-11 | #74 entity expansion | exp/74-entity-expansion-precision-v1 | 89.36% (+0.0) | -- | -- | -- | Hub-node detection. Zero delta — benchmark doesn't exercise graph expansion. → [postmortem](postmortems/2026-03-11_issue74_entity_expansion_precision.md) |
 | 2026-03-12 | #79 (PR #125) | exp/79-priority-ids-fetch-v1 | 89.36% (+0.0) | -- | -- | -- | Bug fix: priority_ids now fetches by ID. Merged. → [postmortem](postmortems/2026-03-12_issue79_priority_ids_fetch.md) |
 | 2026-04-23 | #128 | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | **85.53% (201/235)** | -- | -- | -- | Content keyword fallback + gentler adaptive floor. Improved **+3.40pp** vs the same-day baseline (`82.13%`, `193/235`) with no sampled question-level regressions across conv-26/conv-30. |
-| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. |
+| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes; category-5 judge model was `gpt-5.1`. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. |
 | 2026-04-23 | #142 | fix/142-expansion-tag-filter | -- | 77.30% (-0.07) | -- | -- | Expansion tag-filter bypass. Effectively flat vs pre-fix `77.37%` — canonical configs don't exercise `expand_relations`. Validated via scoped repro + helper/API tests. |
-| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Representative stratified mini and full canonical run. Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. |
+| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Historical canonical milestone. Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. Superseded for publication claims by the 2026-05-17 verification run. |
+| 2026-05-17 | Publication verification | feat/automem-arxiv-publication | **85.20% (259/304)** | **84.74% (1683/1986)** | **70.00% (21/30)** | **87.00% (435/500)** | Fresh local publication reruns. LoCoMo full used pinned `gpt-5.4-mini-2026-03-17` judge, 444 judge calls, 0 skips/errors, estimated judge cost `$0.7909`, artifact `benchmarks/results/locomo_baseline_20260517_193934.json`, sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716`. LongMemEval full used `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge, recall@5 **97.00% (485/500)**, `memory_ingest_failures=0`, `judge_errors=0`, `publishable=true`, artifact `benchmarks/results/longmemeval-full-publication-20260518.json`, sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75`. |
 
 ### Category Breakdown (LoCoMo-mini)
 
@@ -76,20 +82,20 @@ Categories 1-4 are scored by word-overlap/date matching. Category 5 uses an opt-
 
 ### Category Breakdown (LongMemEval full)
 
-Canonical run: `benchmarks/results/longmemeval_full_gpt5mini_20260425_231308.json`.
+Canonical run: `benchmarks/results/longmemeval-full-publication-20260518.json`.
 Answerer `gpt-5-mini`; judge `gpt-5.4-mini-2026-03-17`.
 
 | Question type | Accuracy | Recall@5 |
 |---------------|----------|----------|
 | knowledge-update | 88.46% (69/78) | 100.00% (78/78) |
-| multi-session | 81.20% (108/133) | 98.50% (131/133) |
+| multi-session | 84.21% (112/133) | 97.74% (130/133) |
 | single-session-assistant | 98.21% (55/56) | 100.00% (56/56) |
-| single-session-preference | 60.00% (18/30) | 90.00% (27/30) |
-| single-session-user | 91.43% (64/70) | 92.86% (65/70) |
+| single-session-preference | 56.67% (17/30) | 90.00% (27/30) |
+| single-session-user | 92.86% (65/70) | 92.86% (65/70) |
 | temporal-reasoning | 87.97% (117/133) | 96.99% (129/133) |
 
-Failure split from the result-analysis helper: 58 wrong answers had the answer
-session retrieved at recall@5; 11 were retrieval misses. This is the basis for
+Failure split from the result-analysis helper: of the 65 wrong answers, 54 had the answer
+session retrieved at recall@5 and 11 were retrieval misses. This is the basis for
 follow-up issues #158 and #159.
 
 ## Exploratory and Historical Benchmarks

diff --git a/benchmarks/publication/2026-05-arxiv/README.md b/benchmarks/publication/2026-05-arxiv/README.md
@@ -0,0 +1,59 @@
+# AutoMem arXiv Publication Bundle
+
+This bundle collects the repository-side material for the May 2026 AutoMem
+arXiv preprint effort. It is intentionally conservative: canonical claims come
+from this repository's official benchmark log, while exploratory and external
+numbers are labeled separately.
+
+## Claim Posture
+
+AutoMem should be described as an open-source, inspectable, MCP-first
+graph-vector memory service for AI agents with transparent benchmark harnesses
+and strong canonical LoCoMo / LongMemEval results.
+
+Do not claim "best memory system", "SOTA", or "beats Mem0" from this bundle.
+Those claims require apples-to-apples reruns against the current external
+systems, judge policies, dataset versions, and scale settings.
+
+## Canonical Results
+
+| Status | Benchmark | Scope | Score | Retrieval | Source |
+|---|---|---:|---:|---:|---|
+| canonical | LongMemEval full | 500 questions | 87.00% (435/500) | recall@5 97.00% (485/500) | Fresh publication verification run; see `fresh-verification.md` |
+| representative canary | LongMemEval mini | 30 stratified questions | 70.00% (21/30) | recall@5 96.67% (29/30) | Fresh publication verification run; see `fresh-verification.md` |
+| canonical | LoCoMo full | 10 conversations, 1,986 questions | 84.74% (1683/1986) | not reported | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge |
+
+Canonical LongMemEval run configuration and health checks:
+
+- Answerer: `gpt-5-mini`
+- Judge: `gpt-5.4-mini-2026-03-17`
+- Judge errors: `0`
+- Memory ingest failures: `0`
+- Harness publishable flag: `true`
+
+## Supplemental Signals
+
+| Status | Benchmark / Evidence | Scope | Result | Caveat |
+|---|---|---:|---:|---|
+| exploratory | BEAM 100K V1 raw-dialogue shim | 20 conversations, 400 questions | 76.25% (305/400), avg 0.677 | Not comparable to published BEAM 1M/10M claims. |
+| exploratory | BEAM 100K V2 fact-extraction shim | 20 conversations, 400 questions | 73.75% (295/400), avg 0.653 | Diagnostic failure-mode signal only. |
+| exploratory | Writ drift integration | 5 drift scenarios | 100% recall accuracy, 20% update fidelity, 0% drift rate | Lives in `automem-evals`; must remain labeled supplemental until promoted. |
+| exploratory | Claude Code hook replay | fixture suite | metrics harness only | Lives in `automem-evals`; workflow-continuity signal, not a memory benchmark. |
+| external reported | Mem0 managed platform | LoCoMo / LongMemEval / BEAM | see cited Mem0 docs | Proprietary managed-platform optimizations; not directly comparable. |
+| not yet run | BEAM official 1M/10M | official BEAM scale | -- | Required before any BEAM-competitive claim. |
+| not yet run | LongMemEval-V2 | web-agent memory | -- | Required before "experienced colleague" claims. |
+| not yet run | Memora / FAMA | invalidated-memory reuse | -- | Natural fit for `INVALIDATED_BY`/`CONTRADICTS`, but not run yet. |
+
+## Bundle Files
+
+- `benchmark-summary.md` - paper-ready benchmark and limitation summary.
+- `artifact-manifest.json` - machine-readable manifest for claims, generated-artifact paths, and commands.
+- `commands.md` - verification and reproduction command inventory.
+- `fresh-verification.md` - latest local verification notes and generated artifact hashes.
+
+## Promotion Rule
+
+Results from `../automem-evals` may inform the paper only as supplemental
+evidence until a result is reproduced or explicitly summarized in this
+repository. Official benchmark claims remain owned by
+`benchmarks/EXPERIMENT_LOG.md`.
diff --git a/benchmarks/publication/2026-05-arxiv/artifact-manifest.json b/benchmarks/publication/2026-05-arxiv/artifact-manifest.json
@@ -0,0 +1,145 @@
+{
+  "bundle": "automem-arxiv-2026-05",
+  "generated_at_utc": "2026-05-18T06:20:58Z",
+  "automem_git_sha_at_creation": "a742602f5d6ad2dea5a4d3c387d5b49d610afe2c",
+  "git_sha_note": "This records the base HEAD before publication-bundle edits; the PR commit SHA is supplied by GitHub after commit creation.",
+  "official_claim_source": "benchmarks/EXPERIMENT_LOG.md",
+  "judge_policy": "docs/BENCHMARK_JUDGE_POLICY.md",
+  "claims": [
+    {
+      "status": "canonical",
+      "benchmark": "LongMemEval",
+      "scope": "full",
+      "questions": 500,
+      "score": "87.00% (435/500)",
+      "retrieval": "recall@5 97.00% (485/500)",
+      "answer_model": "gpt-5-mini",
+      "judge_model": "gpt-5.4-mini-2026-03-17",
+      "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication verification run 2026-05-18 UTC",
+      "generated_artifact": {
+        "path": "benchmarks/results/longmemeval-full-publication-20260518.json",
+        "gitignored": true,
+        "sha256": "ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75"
+      },
+      "hypotheses_artifact": {
+        "path": "benchmarks/results/longmemeval-full-publication-20260518.jsonl",
+        "gitignored": true,
+        "sha256": "69cd9c8171d5caec8661b1d8c2b27579decc40e2f134ffde74fcc93e70e2e7ce"
+      },
+      "memory_ingest_failures": 0,
+      "judge_errors": 0,
+      "publishable": true,
+      "elapsed_seconds": 10021.779591798782,
+      "category_breakdown": {
+        "knowledge_update": "88.46% (69/78)",
+        "multi_session": "84.21% (112/133)",
+        "single_session_assistant": "98.21% (55/56)",
+        "single_session_preference": "56.67% (17/30)",
+        "single_session_user": "92.86% (65/70)",
+        "temporal_reasoning": "87.97% (117/133)"
+      },
+      "failure_split": "65 wrong total; 54 wrong had answer session retrieved at recall@5; 11 wrong were retrieval misses; 4 correct answers were retrieval misses."
+    },
+    {
+      "status": "representative_canary",
+      "benchmark": "LongMemEval",
+      "scope": "mini stratified",
+      "questions": 30,
+      "score": "70.00% (21/30)",
+      "retrieval": "recall@5 96.67% (29/30)",
+      "answer_model": "gpt-5-mini",
+      "judge_model": "script LLM eval",
+      "source": "local run 2026-05-17; see fresh-verification.md",
+      "generated_artifact": {
+        "path": "benchmarks/results/longmemeval-mini-publication-20260517.json",
+        "gitignored": true,
+        "sha256": "7ea922b77e312a17c313bbf8c0e81f0268b48d1082080cae1db3c38e906577b8"
+      }
+    },
+    {
+      "status": "canonical",
+      "benchmark": "LoCoMo",
+      "scope": "full",
+      "questions": 1986,
+      "score": "84.74% (1683/1986)",
+      "retrieval": null,
+      "answer_model": null,
+      "judge_model": "gpt-5.4-mini-2026-03-17",
+      "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication verification run 2026-05-17",
+      "generated_artifact": {
+        "path": "benchmarks/results/locomo_baseline_20260517_193934.json",
+        "gitignored": true,
+        "sha256": "a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716"
+      },
+      "judge_calls": 444,
+      "judge_errors": 0,
+      "judge_skips": 0,
+      "estimated_cost_usd": 0.790877,
+      "category_breakdown": {
+        "single_hop": "52.13% (147/282)",
+        "temporal": "86.60% (278/321)",
+        "multi_hop": "46.88% (45/96)",
+        "open_domain": "93.58% (787/841)",
+        "complex": "95.52% (426/446)"
+      }
+    },
+    {
+      "status": "fresh_verification",
+      "benchmark": "LoCoMo",
+      "scope": "mini",
+      "questions": 304,
+      "score": "85.20% (259/304)",
+      "retrieval": null,
+      "answer_model": null,
+      "judge_model": "gpt-5.4-mini-2026-03-17",
+      "source": "local run 2026-05-17; see fresh-verification.md",
+      "generated_artifact": {
+        "path": "benchmarks/results/locomo-mini_baseline_20260517_182318.json",
+        "gitignored": true,
+        "sha256": "ba2b98b0055f92ca17de9bc36207d7f39cf90b6270c2c3d903d69b8044aa7015"
+      }
+    },
+    {
+      "status": "exploratory",
+      "benchmark": "BEAM",
+      "scope": "100K V1 raw-dialogue shim",
+      "questions": 400,
+      "score": "76.25% (305/400), avg 0.677",
+      "retrieval": "top-k 200",
+      "source": "benchmarks/EXPERIMENT_LOG.md"
+    },
+    {
+      "status": "exploratory",
+      "benchmark": "BEAM",
+      "scope": "100K V2 fact-extraction shim",
+      "questions": 400,
+      "score": "73.75% (295/400), avg 0.653",
+      "retrieval": "top-k 200",
+      "source": "benchmarks/EXPERIMENT_LOG.md"
+    },
+    {
+      "status": "exploratory",
+      "benchmark": "Writ",
+      "scope": "drift category, 5 scenarios",
+      "questions": 5,
+      "score": "100.0% recall_accuracy; 20.0% update_fidelity; 0.0% drift_rate",
+      "retrieval": null,
+      "source": "../automem-evals/docs/writ_integration.md"
+    },
+    {
+      "status": "exploratory",
+      "benchmark": "Claude Code hook replay",
+      "scope": "fixture and metrics harness",
+      "questions": null,
+      "score": "harness tests only; no publication score",
+      "retrieval": null,
+      "source": "../automem-evals/docs/session_2026-04-28_hook_replay.md"
+    }
+  ],
+  "not_yet_run": [
+    "BEAM official 1M/10M",
+    "LongMemEval-V2",
+    "Memora/FAMA",
+    "Mem0 managed-platform apples-to-apples comparison"
+  ]
+}