diff --git a/.flake8 b/.flake8 index 026c69a..d6c7d5a 100644 --- a/.flake8 +++ b/.flake8 @@ -3,7 +3,11 @@ max-line-length = 100 select = E9,F63,F7,F82 show_source = True extend-exclude = + .bench-venv, + .worktrees, .venv, + benchmarks/results, + benchmarks/snapshots, venv, node_modules, packages/graph-viewer/node_modules, diff --git a/Makefile b/Makefile index 338f0e7..1e28ee5 100644 --- a/Makefile +++ b/Makefile @@ -161,7 +161,7 @@ bench-compare-branch: @scripts/bench/compare_branch.sh $(BRANCH) $(or $(CONFIG),baseline) $(or $(BENCH),locomo) bench-health: - @python3 scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001) + @$(or $(VENV_BIN),.venv/bin)/python scripts/bench/health_check.py --base-url $(or $(BASE_URL),http://localhost:8001) bench-snapshots: @ls -la benchmarks/snapshots/ 2>/dev/null || echo "No snapshots yet. Run: make bench-ingest BENCH=locomo" diff --git a/README.md b/README.md index 525ae18..3817a24 100644 --- a/README.md +++ b/README.md @@ -7,7 +7,7 @@ License Discord X - LoCoMo benchmark + LongMemEval benchmark Deploy on Railway

@@ -27,9 +27,9 @@ Your AI forgets between sessions. RAG dumps documents that look similar. Vector AutoMem stores typed relationships *and* embeddings. When you ask "why did we choose PostgreSQL?", recall returns not just the matching memory — but the alternatives you considered, the principle behind the choice, and the related decisions that came after. -It scores **89.27%** on the LoCoMo long-term memory benchmark (ACL 2024) judge-off, and **87.56%** judge-on. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology and history. +Current canonical benchmark results are **87.00%** on LongMemEval full with **97.00% recall@5**, and **84.74%** on LoCoMo full. See [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md) for methodology, judge policy, category breakdowns, and historical runs. -Additional LongMemEval and BEAM validation is tracked in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); BEAM is currently reported as exploratory because published comparisons are not yet apples-to-apples. +Exploratory BEAM validation is tracked separately in [`benchmarks/EXPERIMENT_LOG.md`](benchmarks/EXPERIMENT_LOG.md); those numbers are diagnostic and not apples-to-apples product claims against published 1M/10M BEAM results. ## Should you use AutoMem? @@ -223,6 +223,7 @@ _Screenshots will be added once the referenced in-repo image assets are availabl - [Research foundation](docs/RESEARCH.md) — papers and how AutoMem implements them - [Comparison](docs/COMPARISON.md) — vs. 
RAG, vector DBs, building your own - [Benchmark history](benchmarks/EXPERIMENT_LOG.md) — LoCoMo, LongMemEval, and BEAM methodology and runs +- [Publication bundle](benchmarks/publication/2026-05-arxiv/) — arXiv claim posture, reproducibility commands, and artifact manifest **Operations** - [Health monitoring & backups](docs/MONITORING_AND_BACKUPS.md) diff --git a/automem/api/recall.py b/automem/api/recall.py index cf1b4d8..2cd572f 100644 --- a/automem/api/recall.py +++ b/automem/api/recall.py @@ -2117,19 +2117,23 @@ def get_related_memories(memory_id: str) -> Any: ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC LIMIT $limit """ + # FalkorDB does not accept parameters inside variable-length relationship ranges. + # max_depth is parsed and clamped above, so inlining it here is safe. + fallback_depth = max_depth fallback_query = f""" - MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + '*1..$max_depth]-' if rel_pattern else '-[r*1..$max_depth]-'}(related:Memory) + MATCH (m:Memory {{id: $id}}){'-[r' + rel_pattern + f'*1..{fallback_depth}]-' if rel_pattern else f'-[r*1..{fallback_depth}]-'}(related:Memory) WHERE m.id <> related.id RETURN DISTINCT related ORDER BY coalesce(related.importance, 0.0) DESC, coalesce(related.timestamp, '') DESC LIMIT $limit """ params = {"id": memory_id, "max_depth": max_depth, "limit": limit} + fallback_params = {"id": memory_id, "limit": limit} try: result = graph.query(query, params) except Exception: try: - result = graph.query(fallback_query, params) + result = graph.query(fallback_query, fallback_params) except Exception: logger.exception("Failed to traverse related memories for %s", memory_id) abort(500, description="Failed to fetch related memories") diff --git a/benchmarks/EXPERIMENT_LOG.md b/benchmarks/EXPERIMENT_LOG.md index b97a312..049b863 100644 --- a/benchmarks/EXPERIMENT_LOG.md +++ b/benchmarks/EXPERIMENT_LOG.md @@ -21,9 +21,14 @@ Current headline results: | Benchmark | Scope | Score | 
Retrieval | Notes | |-----------|-------|-------|-----------|-------| -| LongMemEval full | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | Canonical `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`. | -| LongMemEval mini | 30 questions, stratified 5 per type | **60.0% (18/30)** | recall@5 **96.67% (29/30)** | Representative canary; do not compare to legacy prefix slices. | -| LoCoMo full | 10 conversations, 1986 questions | **83.99% (1668/1986)** | -- | Latest recorded full judge-on run from #128; cat5 scored 92.83% with 0 skips. | +| LongMemEval full | 500 questions | **87.00% (435/500)** | recall@5 **97.00% (485/500)** | Fresh publication verification run with `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge; `judge_errors=0`, `memory_ingest_failures=0`, harness `publishable=true`. | +| LongMemEval mini | 30 questions, stratified 5 per type | **70.00% (21/30)** | recall@5 **96.67% (29/30)** | Representative canary from the May 2026 publication verification run; do not compare to legacy prefix slices. | +| LoCoMo full | 10 conversations, 1986 questions | **84.74% (1683/1986)** | -- | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge; 444 judge calls, 0 skips/errors. | + +For arXiv and release-facing claims, use the curated publication bundle at +[`benchmarks/publication/2026-05-arxiv/`](publication/2026-05-arxiv/). It +separates canonical, exploratory, historical, and external-reported results so +paper text does not accidentally over-claim from diagnostic runs. Detailed experiment history: @@ -48,9 +53,10 @@ Detailed experiment history: | 2026-03-11 | #74 entity expansion | exp/74-entity-expansion-precision-v1 | 89.36% (+0.0) | -- | -- | -- | Hub-node detection. Zero delta — benchmark doesn't exercise graph expansion. 
→ [postmortem](postmortems/2026-03-11_issue74_entity_expansion_precision.md) | | 2026-03-12 | #79 (PR #125) | exp/79-priority-ids-fetch-v1 | 89.36% (+0.0) | -- | -- | -- | Bug fix: priority_ids now fetches by ID. Merged. → [postmortem](postmortems/2026-03-12_issue79_priority_ids_fetch.md) | | 2026-04-23 | #128 | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | **85.53% (201/235)** | -- | -- | -- | Content keyword fallback + gentler adaptive floor. Improved **+3.40pp** vs the same-day baseline (`82.13%`, `193/235`) with no sampled question-level regressions across conv-26/conv-30. | -| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. | +| 2026-04-23 | #128 full judge | fix/128-recall-keyword-scoring-dead-for-vector-results-adaptive-floor-too-aggressive | -- | **83.99% (1668/1986)** | -- | -- | Full judge-on rerun after harness fixes; category-5 judge model was `gpt-5.1`. Judge preflight passed and cat-5 scored **92.83% (414/446)** with **0 skips**. Improves **+3.93pp** vs full baseline (`80.06%`, `1590/1986`), so #128 is strong enough to move forward to broader validation. | | 2026-04-23 | #142 | fix/142-expansion-tag-filter | -- | 77.30% (-0.07) | -- | -- | Expansion tag-filter bypass. Effectively flat vs pre-fix `77.37%` — canonical configs don't exercise `expand_relations`. Validated via scoped repro + helper/API tests. | -| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Representative stratified mini and full canonical run. 
Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. | +| 2026-04-26 | LongMemEval harness | fix/longmemeval-harness-resume-and-stratified-mini | -- | -- | **60.0% (18/30)** | **86.20% (431/500)** | Historical canonical milestone. Full recall@5 **97.20% (486/500)**; `judge_errors=0`, `memory_ingest_failures=0`. Superseded for publication claims by the 2026-05-17 verification run. | +| 2026-05-17 | Publication verification | feat/automem-arxiv-publication | **85.20% (259/304)** | **84.74% (1683/1986)** | **70.00% (21/30)** | **87.00% (435/500)** | Fresh local publication reruns. LoCoMo full used pinned `gpt-5.4-mini-2026-03-17` judge, 444 judge calls, 0 skips/errors, estimated judge cost `$0.7909`, artifact `benchmarks/results/locomo_baseline_20260517_193934.json`, sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716`. LongMemEval full used `gpt-5-mini` answerer + `gpt-5.4-mini-2026-03-17` judge, recall@5 **97.00% (485/500)**, `memory_ingest_failures=0`, `judge_errors=0`, `publishable=true`, artifact `benchmarks/results/longmemeval-full-publication-20260518.json`, sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75`. | ### Category Breakdown (LoCoMo-mini) @@ -76,20 +82,20 @@ Categories 1-4 are scored by word-overlap/date matching. Category 5 uses an opt- ### Category Breakdown (LongMemEval full) -Canonical run: `benchmarks/results/longmemeval_full_gpt5mini_20260425_231308.json`. +Canonical run: `benchmarks/results/longmemeval-full-publication-20260518.json`. Answerer `gpt-5-mini`; judge `gpt-5.4-mini-2026-03-17`. 
| Question type | Accuracy | Recall@5 | |---------------|----------|----------| | knowledge-update | 88.46% (69/78) | 100.00% (78/78) | -| multi-session | 81.20% (108/133) | 98.50% (131/133) | +| multi-session | 84.21% (112/133) | 97.74% (130/133) | | single-session-assistant | 98.21% (55/56) | 100.00% (56/56) | -| single-session-preference | 60.00% (18/30) | 90.00% (27/30) | -| single-session-user | 91.43% (64/70) | 92.86% (65/70) | +| single-session-preference | 56.67% (17/30) | 90.00% (27/30) | +| single-session-user | 92.86% (65/70) | 92.86% (65/70) | | temporal-reasoning | 87.97% (117/133) | 96.99% (129/133) | -Failure split from the result-analysis helper: 58 wrong answers had the answer -session retrieved at recall@5; 11 were retrieval misses. This is the basis for +Failure split from the result-analysis helper: 54 wrong answers had the answer +session retrieved at recall@5; 11 wrong answers were retrieval misses. This is the basis for follow-up issues #158 and #159. ## Exploratory and Historical Benchmarks diff --git a/benchmarks/publication/2026-05-arxiv/README.md b/benchmarks/publication/2026-05-arxiv/README.md new file mode 100644 index 0000000..242af1a --- /dev/null +++ b/benchmarks/publication/2026-05-arxiv/README.md @@ -0,0 +1,59 @@ +# AutoMem arXiv Publication Bundle + +This bundle collects the repository-side material for the May 2026 AutoMem +arXiv preprint effort. It is intentionally conservative: canonical claims come +from this repository's official benchmark log, while exploratory and external +numbers are labeled separately. + +## Claim Posture + +AutoMem should be described as an open-source, inspectable, MCP-first +graph-vector memory service for AI agents with transparent benchmark harnesses +and strong canonical LoCoMo / LongMemEval results. + +Do not claim "best memory system", "SOTA", or "beats Mem0" from this bundle. 
+Those claims require apples-to-apples reruns against the current external +systems, judge policies, dataset versions, and scale settings. + +## Canonical Results + +| Status | Benchmark | Scope | Score | Retrieval | Source | +|---|---:|---:|---:|---:|---| +| canonical | LongMemEval full | 500 questions | 87.00% (435/500) | recall@5 97.00% (485/500) | Fresh publication verification run; see `fresh-verification.md` | +| representative canary | LongMemEval mini | 30 stratified questions | 70.00% (21/30) | recall@5 96.67% (29/30) | Fresh publication verification run; see `fresh-verification.md` | +| canonical | LoCoMo full | 10 conversations, 1,986 questions | 84.74% (1683/1986) | not reported | Fresh publication verification run with pinned `gpt-5.4-mini-2026-03-17` judge | + +Canonical LongMemEval model policy: + +- Answerer: `gpt-5-mini` +- Judge: `gpt-5.4-mini-2026-03-17` +- Judge errors: `0` +- Memory ingest failures: `0` +- Harness publishable flag: `true` + +## Supplemental Signals + +| Status | Benchmark / Evidence | Scope | Result | Caveat | +|---|---|---:|---:|---| +| exploratory | BEAM 100K V1 raw-dialogue shim | 20 conversations, 400 questions | 76.25% (305/400), avg 0.677 | Not comparable to published BEAM 1M/10M claims. | +| exploratory | BEAM 100K V2 fact-extraction shim | 20 conversations, 400 questions | 73.75% (295/400), avg 0.653 | Diagnostic failure-mode signal only. | +| exploratory | Writ drift integration | 5 drift scenarios | 100% recall accuracy, 20% update fidelity, 0% drift rate | Lives in `automem-evals`; must remain labeled supplemental until promoted. | +| exploratory | Claude Code hook replay | fixture suite | metrics harness only | Lives in `automem-evals`; workflow-continuity signal, not a memory benchmark. | +| external reported | Mem0 managed platform | LoCoMo / LongMemEval / BEAM | see cited Mem0 docs | Proprietary managed-platform optimizations; not directly comparable. 
| +| not yet run | BEAM official 1M/10M | official BEAM scale | -- | Required before any BEAM-competitive claim. | +| not yet run | LongMemEval-V2 | web-agent memory | -- | Required before "experienced colleague" claims. | +| not yet run | Memora / FAMA | invalidated-memory reuse | -- | Natural fit for `INVALIDATED_BY`/`CONTRADICTS`, but not run yet. | + +## Bundle Files + +- `benchmark-summary.md` - paper-ready benchmark and limitation summary. +- `artifact-manifest.json` - machine-readable manifest for claims, generated-artifact paths, and commands. +- `commands.md` - verification and reproduction command inventory. +- `fresh-verification.md` - latest local verification notes and generated artifact hashes. + +## Promotion Rule + +Results from `../automem-evals` may inform the paper only as supplemental +evidence until a result is reproduced or explicitly summarized in this +repository. Official benchmark claims remain owned by +`benchmarks/EXPERIMENT_LOG.md`. diff --git a/benchmarks/publication/2026-05-arxiv/artifact-manifest.json b/benchmarks/publication/2026-05-arxiv/artifact-manifest.json new file mode 100644 index 0000000..fa60367 --- /dev/null +++ b/benchmarks/publication/2026-05-arxiv/artifact-manifest.json @@ -0,0 +1,145 @@ +{ + "bundle": "automem-arxiv-2026-05", + "generated_at_utc": "2026-05-18T06:20:58Z", + "automem_git_sha_at_creation": "a742602f5d6ad2dea5a4d3c387d5b49d610afe2c", + "git_sha_note": "This records the base HEAD before publication-bundle edits; the PR commit SHA is supplied by GitHub after commit creation.", + "official_claim_source": "benchmarks/EXPERIMENT_LOG.md", + "judge_policy": "docs/BENCHMARK_JUDGE_POLICY.md", + "claims": [ + { + "status": "canonical", + "benchmark": "LongMemEval", + "scope": "full", + "questions": 500, + "score": "87.00% (435/500)", + "retrieval": "recall@5 97.00% (485/500)", + "answer_model": "gpt-5-mini", + "judge_model": "gpt-5.4-mini-2026-03-17", + "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication 
verification run 2026-05-18 UTC", + "generated_artifact": { + "path": "benchmarks/results/longmemeval-full-publication-20260518.json", + "gitignored": true, + "sha256": "ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75" + }, + "hypotheses_artifact": { + "path": "benchmarks/results/longmemeval-full-publication-20260518.jsonl", + "gitignored": true, + "sha256": "69cd9c8171d5caec8661b1d8c2b27579decc40e2f134ffde74fcc93e70e2e7ce" + }, + "memory_ingest_failures": 0, + "judge_errors": 0, + "publishable": true, + "elapsed_seconds": 10021.779591798782, + "category_breakdown": { + "knowledge_update": "88.46% (69/78)", + "multi_session": "84.21% (112/133)", + "single_session_assistant": "98.21% (55/56)", + "single_session_preference": "56.67% (17/30)", + "single_session_user": "92.86% (65/70)", + "temporal_reasoning": "87.97% (117/133)" + }, + "failure_split": "65 wrong total; 54 wrong had answer session retrieved at recall@5; 11 wrong were retrieval misses; 4 correct answers were retrieval misses." 
+ }, + { + "status": "representative_canary", + "benchmark": "LongMemEval", + "scope": "mini stratified", + "questions": 30, + "score": "70.00% (21/30)", + "retrieval": "recall@5 96.67% (29/30)", + "answer_model": "gpt-5-mini", + "judge_model": "script LLM eval", + "source": "local run 2026-05-17; see fresh-verification.md", + "generated_artifact": { + "path": "benchmarks/results/longmemeval-mini-publication-20260517.json", + "gitignored": true, + "sha256": "7ea922b77e312a17c313bbf8c0e81f0268b48d1082080cae1db3c38e906577b8" + } + }, + { + "status": "canonical", + "benchmark": "LoCoMo", + "scope": "full", + "questions": 1986, + "score": "84.74% (1683/1986)", + "retrieval": null, + "answer_model": null, + "judge_model": "gpt-5.4-mini-2026-03-17", + "source": "benchmarks/EXPERIMENT_LOG.md; fresh publication verification run 2026-05-17", + "generated_artifact": { + "path": "benchmarks/results/locomo_baseline_20260517_193934.json", + "gitignored": true, + "sha256": "a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716" + }, + "judge_calls": 444, + "judge_errors": 0, + "judge_skips": 0, + "estimated_cost_usd": 0.790877, + "category_breakdown": { + "single_hop": "52.13% (147/282)", + "temporal": "86.60% (278/321)", + "multi_hop": "46.88% (45/96)", + "open_domain": "93.58% (787/841)", + "complex": "95.52% (426/446)" + } + }, + { + "status": "fresh_verification", + "benchmark": "LoCoMo", + "scope": "mini", + "questions": 304, + "score": "85.20% (259/304)", + "retrieval": null, + "answer_model": null, + "judge_model": "gpt-5.4-mini-2026-03-17", + "source": "local run 2026-05-17; see fresh-verification.md", + "generated_artifact": { + "path": "benchmarks/results/locomo-mini_baseline_20260517_182318.json", + "gitignored": true, + "sha256": "ba2b98b0055f92ca17de9bc36207d7f39cf90b6270c2c3d903d69b8044aa7015" + } + }, + { + "status": "exploratory", + "benchmark": "BEAM", + "scope": "100K V1 raw-dialogue shim", + "questions": 400, + "score": "76.25% (305/400), avg 
0.677", + "retrieval": "top-k 200", + "source": "benchmarks/EXPERIMENT_LOG.md" + }, + { + "status": "exploratory", + "benchmark": "BEAM", + "scope": "100K V2 fact-extraction shim", + "questions": 400, + "score": "73.75% (295/400), avg 0.653", + "retrieval": "top-k 200", + "source": "benchmarks/EXPERIMENT_LOG.md" + }, + { + "status": "exploratory", + "benchmark": "Writ", + "scope": "drift category, 5 scenarios", + "questions": 5, + "score": "100.0% recall_accuracy; 20.0% update_fidelity; 0.0% drift_rate", + "retrieval": null, + "source": "../automem-evals/docs/writ_integration.md" + }, + { + "status": "exploratory", + "benchmark": "Claude Code hook replay", + "scope": "fixture and metrics harness", + "questions": null, + "score": "harness tests only; no publication score", + "retrieval": null, + "source": "../automem-evals/docs/session_2026-04-28_hook_replay.md" + } + ], + "not_yet_run": [ + "BEAM official 1M/10M", + "LongMemEval-V2", + "Memora/FAMA", + "Mem0 managed-platform apples-to-apples comparison" + ] +} diff --git a/benchmarks/publication/2026-05-arxiv/benchmark-summary.md b/benchmarks/publication/2026-05-arxiv/benchmark-summary.md new file mode 100644 index 0000000..f3b4a74 --- /dev/null +++ b/benchmarks/publication/2026-05-arxiv/benchmark-summary.md @@ -0,0 +1,69 @@ +# Benchmark Summary For Paper Draft + +## Recommended Headline + +AutoMem is an open-source graph-vector memory service for AI agents that +publishes transparent benchmark harnesses and current canonical results of +87.00% on LongMemEval full and 84.74% on LoCoMo full. + +This is a reproducibility and systems claim, not a state-of-the-art claim. + +## Canonical Results + +The official source of truth is `benchmarks/EXPERIMENT_LOG.md`. 
+ +| Benchmark | Scope | Status | Score | Retrieval | Models / Judge | +|---|---:|---|---:|---:|---| +| LongMemEval full | 500 questions | canonical | 87.00% (435/500) | recall@5 97.00% (485/500) | `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge | +| LongMemEval mini | 30 stratified questions | representative canary | 70.00% (21/30) | recall@5 96.67% (29/30) | `gpt-5-mini` answerer, script LLM eval | +| LoCoMo full | 10 conversations, 1,986 questions | canonical | 84.74% (1683/1986) | not reported | Pinned `gpt-5.4-mini-2026-03-17` judge, 444 judge calls, 0 skips/errors | + +LongMemEval failure split: 54 wrong answers had the answer session retrieved at +recall@5, while 11 wrong answers were retrieval misses. This supports a paper discussion that +future improvements are likely in answer synthesis, memory representation, and +preference handling, not only first-stage retrieval. + +## Historical / Exploratory Context + +Older LoCoMo mini/full values, including the 89.27% judge-off mini and 87.56% +March full judge-on run, remain useful trend anchors but should not be used as +current headline claims. + +BEAM 100K shim results, Writ drift runs, and hook replay metrics came from +`automem-evals` and are explicitly diagnostic. They are not comparable to +published BEAM 1M/10M numbers or production memory benchmarks because the +scale, adapter, extraction policy, and judge setup differ. + +## External Comparisons + +The paper may cite external reported numbers from Mem0, Zep/Graphiti, Letta, +A-MEM, BEAM, LongMemEval-V2, and Memora/FAMA, but those rows must be labeled +`external reported` unless rerun through an AutoMem-controlled harness. 
+ +For any comparison table, include: + +- system and version/date +- open-source vs managed +- benchmark and dataset version/hash +- scope and question count +- ingest/extraction protocol +- retrieval method +- answer model and judge/evaluator +- token/context budget +- latency/cost if available +- score and recall@k +- artifact URL or repro command +- claim status + +## Limitations To State + +- No current SOTA claim. +- External systems use different extraction policies, judges, hosted services, + and token budgets. +- BEAM 1M/10M, LongMemEval-V2, and Memora/FAMA have not yet been run as + canonical AutoMem benchmarks. +- LoCoMo and LongMemEval primarily test recall and answer synthesis; they do + not fully measure write precision, forgetting, privacy boundaries, or + long-running coding-agent workflows. +- Some detailed JSON result artifacts are local/generated and gitignored; the + committed experiment log is the current durable source. diff --git a/benchmarks/publication/2026-05-arxiv/commands.md b/benchmarks/publication/2026-05-arxiv/commands.md new file mode 100644 index 0000000..def32ba --- /dev/null +++ b/benchmarks/publication/2026-05-arxiv/commands.md @@ -0,0 +1,47 @@ +# Publication Verification Commands + +Run these from the repository root unless noted. + +## Repository Checks + +```bash +make test +.venv/bin/black --check . +.venv/bin/isort --check-only . 
+make lint +make test-integration +make bench-health +``` + +## Canonical Benchmarks + +```bash +BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo-mini CONFIG=baseline +BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo CONFIG=baseline +./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --per-type 5 --output benchmarks/results/longmemeval-mini-publication +./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --eval-llm-model gpt-5.4-mini-2026-03-17 --output benchmarks/results/longmemeval-full-publication +``` + +The current 84.74% LoCoMo full result in `benchmarks/EXPERIMENT_LOG.md` is the +fresh publication verification artifact produced by the pinned +`gpt-5.4-mini-2026-03-17` command above. + +The LongMemEval full run is expensive and long-running. Use `--resume` with the +same `--output` base if interrupted. + +## Supplemental Evals + +From a clone of the `automem-evals` repository: + +```bash +python3 -m unittest discover -s runners -p 'test_*.py' +python3 -m unittest discover -s scripts -p 'test_*.py' +python3 scripts/seed_from_snapshot.py +python3 scripts/seed_associations.py +python3 runners/compare_rulesets.py --rulesets baseline_v1 bare_tag_1m_v2 +python3 scripts/beam_shim_smoke.py --self-spawn +python3 runners/run_writ.py --compare automem baseline --scenarios drift +``` + +Supplemental outputs are not canonical publication claims until they are +summarized here or reproduced by a canonical harness. diff --git a/benchmarks/publication/2026-05-arxiv/fresh-verification.md b/benchmarks/publication/2026-05-arxiv/fresh-verification.md new file mode 100644 index 0000000..2917c8f --- /dev/null +++ b/benchmarks/publication/2026-05-arxiv/fresh-verification.md @@ -0,0 +1,67 @@ +# Fresh Verification Notes + +Local verification run date: 2026-05-17 / 2026-05-18 UTC. 
+ +## Repository Checks + +| Check | Result | Notes | +|---|---|---| +| `make test` | pass | 238 passed, 1 skipped, 25 deselected | +| `.venv/bin/black --check .` | pass | Added `pyproject.toml` with the repo's documented 100-column Black configuration and generated-directory excludes. | +| `.venv/bin/isort --check-only .` | pass | Added `pyproject.toml` isort config; excludes virtualenvs, snapshots, worktrees, showcase output, and vendored LoCoMo source. | +| `make lint` | pass | Required `.flake8` exclude update for generated/env/worktree directories. | +| `make test-integration` | pass | 11 passed, 253 deselected after rebuilding the Docker API image. | +| `make bench-health` | pass | Required Makefile fix to use the venv Python; health reported `HEALTHY`. | + +## Benchmark Reruns + +| Benchmark | Command | Result | Generated artifact | +|---|---|---|---| +| LoCoMo mini | `BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo-mini CONFIG=baseline` | 85.20% (259/304) | `benchmarks/results/locomo-mini_baseline_20260517_182318.json`, sha256 `ba2b98b0055f92ca17de9bc36207d7f39cf90b6270c2c3d903d69b8044aa7015` | +| LoCoMo full | `BENCH_JUDGE_MODEL=gpt-5.4-mini-2026-03-17 make bench-eval BENCH=locomo CONFIG=baseline` | 84.74% (1683/1986) | `benchmarks/results/locomo_baseline_20260517_193934.json`, sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716` | +| LongMemEval mini | `./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --per-type 5 --output benchmarks/results/longmemeval-mini-publication-20260517` | 70.00% (21/30), recall@5 96.67% (29/30) | `benchmarks/results/longmemeval-mini-publication-20260517.json`, sha256 `7ea922b77e312a17c313bbf8c0e81f0268b48d1082080cae1db3c38e906577b8` | +| LongMemEval full | `./test-longmemeval-benchmark.sh --llm-eval --llm-model gpt-5-mini --eval-llm-model gpt-5.4-mini-2026-03-17 --output benchmarks/results/longmemeval-full-publication-20260518` | 87.00% (435/500), recall@5 97.00% 
(485/500) | `benchmarks/results/longmemeval-full-publication-20260518.json`, sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75` | + +The official LongMemEval full claim is now the fresh publication verification +artifact: + +- `benchmarks/results/longmemeval-full-publication-20260518.json` +- sha256 `ed6f7cf69b7be6fa0050536ec2b0f947f5510afd8c2a374b3fafb9cde009da75` +- 87.00% (435/500), recall@5 97.00% (485/500) +- `memory_ingest_failures=0`, `judge_errors=0`, `publishable=true` + +Console caveat: the full run logged transient `gpt-5-mini` empty-answer +warnings and one local recall read timeout, but the harness completed and marked +the aggregate artifact publishable. Treat those warnings as answerer/service +stability notes rather than hidden failures. + +The official LoCoMo full claim is now the fresh pinned-judge publication +verification artifact: + +- `benchmarks/results/locomo_baseline_20260517_193934.json` +- sha256 `a75816e9a6d3302c22b34852b75ac19a9d9f5cb27d1a109e0af7e49359330716` +- 84.74% (1683/1986) +- Judge: `gpt-5.4-mini-2026-03-17`, 444 calls, 0 skips/errors + +## Supplemental Eval Repo Checks + +From a clone of the `automem-evals` repository. The local verification used a +sibling checkout referenced as `../automem-evals` below. + +| Check | Result | +|---|---| +| `python3 -m unittest discover -s runners -p 'test_*.py'` | pass, 95 tests | +| `python3 -m unittest discover -s scripts -p 'test_*.py'` | pass, 10 tests | +| `npm test` in `../automem-evals/third_party/writ` | pass, 72 tests | +| `npm run build` in `../automem-evals/third_party/writ` | pass | + +Writ drift evidence remains exploratory and lives in +`../automem-evals/docs/writ_integration.md`: AutoMem recall accuracy 100.0%, +update fidelity 20.0%, drift rate 0.0% across 5 drift scenarios. + +## Paper Checks + +The separate AutoMem paper source checkout passed static checks for input-file +existence and BibTeX cite-key resolution. 
No local LaTeX compiler (`pdflatex`, +`latexmk`, `tectonic`, or `pandoc`) was available, so no PDF compilation is +claimed. diff --git a/docs/COMPARISON.md b/docs/COMPARISON.md index 42854e7..e7afaa3 100644 --- a/docs/COMPARISON.md +++ b/docs/COMPARISON.md @@ -84,7 +84,7 @@ When building your own is the right answer: | **Confidence** | 0.05 | The memory's `confidence` score (0–1) | | **Relevance** | 0.00 | Consolidation decay relevance — disabled by default | -These defaults reflect the current LoCoMo baseline (89.27% judge-off, 87.56% judge-on). For a query like `GET /recall?query=database+migration&tags=decision&time_query=last+month`, the temporal-alignment and tag components dominate; for `GET /recall?query=why+postgres&expand_relations=true`, the relation component does. +These defaults reflect the current canonical benchmark posture: LongMemEval full at 87.00% with 97.00% recall@5, and LoCoMo full at 84.74%. For a query like `GET /recall?query=database+migration&tags=decision&time_query=last+month`, the temporal-alignment and tag components dominate; for `GET /recall?query=why+postgres&expand_relations=true`, the relation component does. The Recall Quality Lab (`scripts/lab/`) lets you sweep any weight and A/B-compare configs against snapshots of production data without touching the service. diff --git a/docs/TESTING.md b/docs/TESTING.md index b4315b4..62ea6e3 100644 --- a/docs/TESTING.md +++ b/docs/TESTING.md @@ -262,12 +262,14 @@ LoCoMo evaluates AI systems' ability to remember and reason across very long con Historical note: older public LoCoMo references such as CORE's **88.24%** are still useful background context, but they are not AutoMem's primary comparison target because the public setups are not perfectly apples-to-apples, especially around category-5 handling. 
-AutoMem currently publishes two LoCoMo baselines: +AutoMem currently publishes the following LoCoMo baselines: | Setup | Scope | Score | Notes | |------|-------|-------|-------| -| `locomo-mini`, judge off | 2 conversations, categories 1-4 only | **89.27% (208/233)** | 71 category-5 questions skipped | -| `locomo`, judge on (`gpt-4o`) | Full 10 conversations | **87.56% (1739/1986)** | Category 5 scored at 95.74% (427/446) | +| `locomo`, canonical judge on (`gpt-5.4-mini-2026-03-17`) | Full 10 conversations | **84.74% (1683/1986)** | Current canonical full run from the May 2026 publication verification; category 5 scored **95.52% (426/446)** with 0 skips/errors. | +| `locomo`, historical judge on (`gpt-5.1`) | Full 10 conversations | **83.99% (1668/1986)** | Historical #128 full run; category 5 scored **92.83% (414/446)** with 0 skips. | +| `locomo-mini`, judge off | 2 conversations, categories 1-4 only | **89.27% (208/233)** | Historical mini anchor after evaluator fixes; not the current headline full-run claim. | +| `locomo`, judge on (`gpt-4o`) | Full 10 conversations | **87.56% (1739/1986)** | Historical March 2026 run; kept for trend context only. | ### Running the Benchmark @@ -304,20 +306,16 @@ Memory usage: Example benchmark output: ```text 📊 FINAL RESULTS -🎯 Overall Accuracy: 87.56% (1739/1986) -⏱️ Total Time: 3497s +🎯 Overall Accuracy: 84.74% (1683/1986) +⏱️ Total Time: 3164s 💾 Total Memories Stored: 5882 -📈 Category Breakdown: - Single-hop Recall : 66.31% (187/282) - Temporal Understanding : 87.23% (280/321) - Multi-hop Reasoning : 45.83% ( 44/ 96) - Open Domain : 95.24% (801/841) - Complex Reasoning : 95.74% (427/446) +📈 Category Breakdown excerpt: + Complex Reasoning : 95.52% (426/446) ``` -If you run without the judge, category 5 will show as `N/A`. +See `benchmarks/EXPERIMENT_LOG.md` for the full per-category table. If you run without the judge, category 5 will show as `N/A`. 
Current baselines and methodology notes live in `benchmarks/EXPERIMENT_LOG.md`. @@ -343,8 +341,9 @@ Current LongMemEval results: | Setup | Scope | Score | Retrieval | Notes | |------|-------|-------|-----------|-------| -| `longmemeval-mini` representative | 30 questions, stratified 5 per question type | **60.0% (18/30)** | recall@5 **96.67% (29/30)** | Canonical run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`. | -| `longmemeval` full canonical | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | Canonical run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`. | +| `longmemeval-mini` representative | 30 questions, stratified 5 per question type | **70.00% (21/30)** | recall@5 **96.67% (29/30)** | Fresh publication verification run: `gpt-5-mini` answerer, script LLM eval, 2 empty-answer warnings. | +| `longmemeval` full canonical | 500 questions | **87.00% (435/500)** | recall@5 **97.00% (485/500)** | Fresh publication verification run: `gpt-5-mini` answerer, `gpt-5.4-mini-2026-03-17` judge, `judge_errors=0`, `memory_ingest_failures=0`, `publishable=true`. | +| `longmemeval` full historical | 500 questions | **86.20% (431/500)** | recall@5 **97.20% (486/500)** | April 2026 canonical milestone with the same answerer and judge policy; kept for trend context. | | `longmemeval` partial legacy prefix (50q) | 50 questions, single-session-user type | **82.0% (41/50)** | recall@5 **92.0% (46/50)** | Provisional prefix run with legacy `gpt-4o` answerer; recall_limit=10, no entity/relation expansion. Not reproduced by the current stratified `bench-mini-longmemeval` target and not directly comparable to the older 35.6% / 500-question setup. | For future published LongMemEval results, use the pinned judge policy in [`docs/BENCHMARK_JUDGE_POLICY.md`](BENCHMARK_JUDGE_POLICY.md) so runs remain comparable over time. 
The primary answerer is `gpt-5-mini`; `gpt-4o` is legacy continuity only and should be labeled as such. Result metadata distinguishes the answer model (`answerer_model` / `llm_model`) from the judge model (`judge_model`, currently `gpt-5.4-mini-2026-03-17` when `--llm-eval` is enabled). @@ -385,8 +384,8 @@ AutoMem is expected to perform well due to: - `DERIVED_FROM`, `PART_OF` 2. **Hybrid Search**: Vector + keyword + tags + importance + time - - Better than pure semantic search - - More reliable than vector-only systems + - Designed to recover both semantic matches and explicit structured context + - Useful for testing graph/vector tradeoffs against snapshot-based evals 3. **Background Intelligence**: - Entity extraction for structured queries diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 0000000..e9c2176 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,32 @@ +[tool.black] +line-length = 100 +extend-exclude = ''' +/( + \.bench-venv + | \.venv + | \.worktrees + | benchmarks/results + | benchmarks/snapshots + | node_modules + | packages/graph-viewer/node_modules + | showcase + | venv +)/ +''' + +[tool.isort] +profile = "black" +line_length = 100 +skip_gitignore = true +skip = [ + ".bench-venv", + ".venv", + ".worktrees", + "benchmarks/results", + "benchmarks/snapshots", + "node_modules", + "packages/graph-viewer/node_modules", + "showcase", + "tests/benchmarks/locomo", + "venv", +] diff --git a/tests/test_api_endpoints.py b/tests/test_api_endpoints.py index 2e7e513..aa7449f 100644 --- a/tests/test_api_endpoints.py +++ b/tests/test_api_endpoints.py @@ -1800,6 +1800,37 @@ def query(self, query: str, params: dict[str, Any] | None = None) -> SimpleNames assert "PARALLEL_CONTEXT" in query +def test_related_memories_fallback_inlines_sanitized_depth(client, mock_state, auth_headers): + class Graph: + def __init__(self) -> None: + self.calls: list[tuple[str, dict[str, Any]]] = [] + + def query(self, query: str, params: dict[str, Any] | None = None) -> 
SimpleNamespace: + self.calls.append((query, params or {})) + if len(self.calls) == 1: + raise RuntimeError("apoc unavailable") + if "$max_depth" in query: + raise RuntimeError("FalkorDB rejects parameterized variable-length ranges") + return SimpleNamespace(result_set=[]) + + graph = Graph() + mock_state.memory_graph = graph + + response = client.get( + "/memories/aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa/related?max_depth=2", + headers=auth_headers, + ) + + assert response.status_code == 200 + assert len(graph.calls) == 2 + fallback_query, fallback_params = graph.calls[1] + assert "*1..2" in fallback_query + assert "$max_depth" not in fallback_query + assert "max_depth" not in fallback_params + assert fallback_params["id"] == "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa" + assert fallback_params["limit"] == 5 + + # ==================== Test Rate Limiting (if implemented) ====================