diff --git a/README.md b/README.md index 471322f..5d289c9 100644 --- a/README.md +++ b/README.md @@ -199,6 +199,66 @@ await runAgentControlLoop({ }) ``` +## Researcher profile + +`@tangle-network/agent-knowledge/profiles` ships a sandbox-SDK +`AgentProfile` preset for source-grounded research agents. Pairs with +`runLoop` from `@tangle-network/agent-runtime/loops` — the profile owns +the prompt + output adapter + validator; the kernel owns iteration, +concurrency, cost, and trace emission. + +```ts +import { runLoop } from '@tangle-network/agent-runtime/loops' +import { multiHarnessResearcherFanout } from '@tangle-network/agent-knowledge/profiles' + +const research = multiHarnessResearcherFanout({ + harnesses: ['opencode/zai-coding-plan/glm-5.1', 'claude-code', 'codex'], +}) + +const result = await runLoop({ + driver: research.driver, + agentRuns: research.agentRuns, + output: research.output, + validator: research.validator, + task: { + question: 'What content does cpg-founder ICP engage with on Twitter?', + knowledgeNamespace: 'cust_42', + sources: ['twitter', 'web'], + maxItems: 20, + minConfidence: 0.6, + }, + ctx: { sandboxClient }, +}) + +if (result.winner?.verdict?.valid) { + // result.winner.output.proposedWrites: KnowledgeUpdate[] + // The profile does NOT materialize. Decide whether to apply. + for (const write of result.winner.output.proposedWrites) { + // route through applyKnowledgeWriteBlocks / a KbStore put when ready + } +} +``` + +Three invariants are enforced by the validator: + +- **Namespace isolation** — every `KnowledgeItem` + `KnowledgeUpdate` + must carry `task.knowledgeNamespace`. Cross-tenant writes hard-fail. +- **Provenance** — every item carries at least one evidence entry. +- **Citation density** — quotes-with-source / items >= 0.7 by default. 
+ +Validator scoring (default; overridable): + +``` +score = 0.4 · citation_density + + 0.2 · source_diversity + + 0.2 · recency_match + + 0.2 · gap_coverage +``` + +The output preserves agent intelligence — `items`, `citations`, +`proposedWrites` are typed; `gaps`, `notes`, and any extras the agent +emitted land in `raw` rather than getting dropped. + ## Pluggable Knowledge Sources Static knowledge rots. Authorities like Cornell LII, the IRS, and state diff --git a/package.json b/package.json index de30686..81efae8 100644 --- a/package.json +++ b/package.json @@ -33,6 +33,11 @@ "types": "./dist/sources/index.d.ts", "import": "./dist/sources/index.js", "default": "./dist/sources/index.js" + }, + "./profiles": { + "types": "./dist/profiles/index.d.ts", + "import": "./dist/profiles/index.js", + "default": "./dist/profiles/index.js" } }, "bin": { @@ -59,6 +64,8 @@ }, "dependencies": { "@tangle-network/agent-eval": "^0.29.1", + "@tangle-network/agent-runtime": "^0.19.0", + "@tangle-network/sandbox": "^0.2.1", "zod": "^4.3.6" }, "devDependencies": { diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c78454b..68ea7a1 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -11,6 +11,12 @@ importers: '@tangle-network/agent-eval': specifier: ^0.29.1 version: 0.29.1(typescript@5.9.3) + '@tangle-network/agent-runtime': + specifier: ^0.19.0 + version: 0.19.0(@tangle-network/sandbox@0.2.1(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3) + '@tangle-network/sandbox': + specifier: ^0.2.1 + version: 0.2.1(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) zod: specifier: ^4.3.6 version: 4.4.2 @@ -72,24 +78,28 @@ packages: engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [musl] '@biomejs/cli-linux-arm64@2.4.15': resolution: {integrity: sha512-owaAMZD/T4LrD0ELNCk0Km3qrRHuM0X6EAyVE1FSqGY0rbLoiDLrO4Us2tllm6cAeB2Ioa9C2C08NZPdr8+0Ug==} engines: {node: '>=14.21.3'} cpu: [arm64] os: [linux] + libc: [glibc] '@biomejs/cli-linux-x64-musl@2.4.15': resolution: {integrity: 
sha512-CNq/9W38SYSH023lfcQ4KKU8K0YX8T//FZUhcgtMMRABDojx5XsMV7jlweAvGSl389wJQB29Qo6Zb/a+jdvt+w==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [musl] '@biomejs/cli-linux-x64@2.4.15': resolution: {integrity: sha512-0jj7THz12GbUOLmMibktK6DZjqz2zV64KFxyBtcFTKPiiOIY0a7vns1elpO1dERvxpsZ5ik0oFfz0oGwFde1+g==} engines: {node: '>=14.21.3'} cpu: [x64] os: [linux] + libc: [glibc] '@biomejs/cli-win32-arm64@2.4.15': resolution: {integrity: sha512-ouhkYdlhp/1GghEJPdWwD/Vi3gQ1nFxuSpMolWsbq3Lsq3QUR4jl6UdhhscdCugKU5vOEuMiJhvKj66O0OCq+w==} @@ -336,66 +346,79 @@ packages: resolution: {integrity: sha512-2QxQrM+KQ7DAW4o22j+XZ6RKdxjLD7BOWTP0Bv0tmjdyhXSsr2Ul1oJDQqh9Zf5qOwTuTc7Ek83mOFaKnodPjg==} cpu: [arm] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm-musleabihf@4.60.2': resolution: {integrity: sha512-TbziEu2DVsTEOPif2mKWkMeDMLoYjx95oESa9fkQQK7r/Orta0gnkcDpzwufEcAO2BLBsD7mZkXGFqEdMRRwfw==} cpu: [arm] os: [linux] + libc: [musl] '@rollup/rollup-linux-arm64-gnu@4.60.2': resolution: {integrity: sha512-bO/rVDiDUuM2YfuCUwZ1t1cP+/yqjqz+Xf2VtkdppefuOFS2OSeAfgafaHNkFn0t02hEyXngZkxtGqXcXwO8Rg==} cpu: [arm64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-arm64-musl@4.60.2': resolution: {integrity: sha512-hr26p7e93Rl0Za+JwW7EAnwAvKkehh12BU1Llm9Ykiibg4uIr2rbpxG9WCf56GuvidlTG9KiiQT/TXT1yAWxTA==} cpu: [arm64] os: [linux] + libc: [musl] '@rollup/rollup-linux-loong64-gnu@4.60.2': resolution: {integrity: sha512-pOjB/uSIyDt+ow3k/RcLvUAOGpysT2phDn7TTUB3n75SlIgZzM6NKAqlErPhoFU+npgY3/n+2HYIQVbF70P9/A==} cpu: [loong64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-loong64-musl@4.60.2': resolution: {integrity: sha512-2/w+q8jszv9Ww1c+6uJT3OwqhdmGP2/4T17cu8WuwyUuuaCDDJ2ojdyYwZzCxx0GcsZBhzi3HmH+J5pZNXnd+Q==} cpu: [loong64] os: [linux] + libc: [musl] '@rollup/rollup-linux-ppc64-gnu@4.60.2': resolution: {integrity: sha512-11+aL5vKheYgczxtPVVRhdptAM2H7fcDR5Gw4/bTcteuZBlH4oP9f5s9zYO9aGZvoGeBpqXI/9TZZihZ609wKw==} cpu: [ppc64] os: [linux] + libc: [glibc] 
'@rollup/rollup-linux-ppc64-musl@4.60.2': resolution: {integrity: sha512-i16fokAGK46IVZuV8LIIwMdtqhin9hfYkCh8pf8iC3QU3LpwL+1FSFGej+O7l3E/AoknL6Dclh2oTdnRMpTzFQ==} cpu: [ppc64] os: [linux] + libc: [musl] '@rollup/rollup-linux-riscv64-gnu@4.60.2': resolution: {integrity: sha512-49FkKS6RGQoriDSK/6E2GkAsAuU5kETFCh7pG4yD/ylj9rKhTmO3elsnmBvRD4PgJPds5W2PkhC82aVwmUcJ7A==} cpu: [riscv64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-riscv64-musl@4.60.2': resolution: {integrity: sha512-mjYNkHPfGpUR00DuM1ZZIgs64Hpf4bWcz9Z41+4Q+pgDx73UwWdAYyf6EG/lRFldmdHHzgrYyge5akFUW0D3mQ==} cpu: [riscv64] os: [linux] + libc: [musl] '@rollup/rollup-linux-s390x-gnu@4.60.2': resolution: {integrity: sha512-ALyvJz965BQk8E9Al/JDKKDLH2kfKFLTGMlgkAbbYtZuJt9LU8DW3ZoDMCtQpXAltZxwBHevXz5u+gf0yA0YoA==} cpu: [s390x] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-gnu@4.60.2': resolution: {integrity: sha512-UQjrkIdWrKI626Du8lCQ6MJp/6V1LAo2bOK9OTu4mSn8GGXIkPXk/Vsp4bLHCd9Z9Iz2OTEaokUE90VweJgIYQ==} cpu: [x64] os: [linux] + libc: [glibc] '@rollup/rollup-linux-x64-musl@4.60.2': resolution: {integrity: sha512-bTsRGj6VlSdn/XD4CGyzMnzaBs9bsRxy79eTqTCBsA8TMIEky7qg48aPkvJvFe1HyzQ5oMZdg7AnVlWQSKLTnw==} cpu: [x64] os: [linux] + libc: [musl] '@rollup/rollup-openbsd-x64@4.60.2': resolution: {integrity: sha512-6d4Z3534xitaA1FcMWP7mQPq5zGwBmGbhphh2DwaA1aNIXUu3KTOfwrWpbwI4/Gr0uANo7NTtaykFyO2hPuFLg==} @@ -450,6 +473,22 @@ packages: engines: {node: '>=20'} hasBin: true + '@tangle-network/agent-eval@0.33.1': + resolution: {integrity: sha512-VAbg1UkC480Xzfi2jqiFMQLYykWvDMO47UHx4bb2rOeiogN1zzM10kPst3OotM+k1B2lbu51uoVnKDBnqK8zcw==} + engines: {node: '>=20'} + hasBin: true + + '@tangle-network/agent-integrations@0.25.7': + resolution: {integrity: sha512-5Iuymcoq6d1oZlyORfmVXiP2G/tJQe0ADYBUNwDlbk9uulSa3c6rztlr6sKm100NqDavVlJ0Jo75j9CsaemhIA==} + engines: {node: '>=20'} + hasBin: true + + '@tangle-network/agent-runtime@0.19.0': + resolution: {integrity: 
sha512-WbXEnPRPqeg27b+FWxIkoBCAgyPUWyJo7dgPIUcGWYX6O5FR6gcSBKDxvLorpAC5fKSh1mn3INcpXpuflPZKrA==} + engines: {node: '>=20'} + peerDependencies: + '@tangle-network/sandbox': '>=0.1.2 <0.3.0' + '@tangle-network/sandbox@0.1.2': resolution: {integrity: sha512-6TPH9QgCgou9Bhc1kzLNL4/PRiT1mjId6NONY5Le/KT2kh77cXH8KN3TTY/cU+/eW+WM5FYJOy32FWl2HShXbw==} peerDependencies: @@ -458,6 +497,17 @@ packages: viem: optional: true + '@tangle-network/sandbox@0.2.1': + resolution: {integrity: sha512-CQ3MdfnWcdjKa2UzyqDkjJarhkVDl4GqAKRhbQdHmHccl/pOm6qSRiPdu40XEA34A/SVPLpfE1ySxchU1rq6BQ==} + peerDependencies: + openai: ^6.36.0 + viem: ^2.0.0 + peerDependenciesMeta: + openai: + optional: true + viem: + optional: true + '@tangle-network/tcloud-attestation@0.1.1': resolution: {integrity: sha512-+TAF9s5t1jOWGyGHvKhIWe2FYmG7puVaxmmg0Et67ylAjGa7GqUAvISXGjG/6dzld7A170V0kQHK0WVdh2Wh0Q==} engines: {node: '>=18'} @@ -1211,10 +1261,40 @@ snapshots: - typescript - utf-8-validate + '@tangle-network/agent-eval@0.33.1(typescript@5.9.3)': + dependencies: + '@asteasolutions/zod-to-openapi': 8.5.0(zod@4.4.2) + '@ax-llm/ax': 19.0.45(zod@4.4.2) + '@hono/node-server': 2.0.1(hono@4.12.16) + '@tangle-network/tcloud': 0.4.6(typescript@5.9.3)(zod@4.4.2) + hono: 4.12.16 + zod: 4.4.2 + transitivePeerDependencies: + - bufferutil + - typescript + - utf-8-validate + + '@tangle-network/agent-integrations@0.25.7': {} + + '@tangle-network/agent-runtime@0.19.0(@tangle-network/sandbox@0.2.1(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)))(typescript@5.9.3)': + dependencies: + '@tangle-network/agent-eval': 0.33.1(typescript@5.9.3) + '@tangle-network/sandbox': 0.2.1(viem@2.48.8(typescript@5.9.3)(zod@4.4.2)) + transitivePeerDependencies: + - bufferutil + - typescript + - utf-8-validate + '@tangle-network/sandbox@0.1.2(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))': optionalDependencies: viem: 2.48.8(typescript@5.9.3)(zod@4.4.2) + '@tangle-network/sandbox@0.2.1(viem@2.48.8(typescript@5.9.3)(zod@4.4.2))': + dependencies: + 
'@tangle-network/agent-integrations': 0.25.7 + optionalDependencies: + viem: 2.48.8(typescript@5.9.3)(zod@4.4.2) + '@tangle-network/tcloud-attestation@0.1.1': {} '@tangle-network/tcloud@0.4.6(typescript@5.9.3)(zod@4.4.2)': diff --git a/src/profiles/index.ts b/src/profiles/index.ts new file mode 100644 index 0000000..c38285e --- /dev/null +++ b/src/profiles/index.ts @@ -0,0 +1,24 @@ +/** + * @experimental + * + * Pre-built `AgentRunSpec` + output adapter + validator bundles for + * knowledge-focused agent roles. Each preset bundles a sandbox-SDK + * `AgentProfile`, a task-to-prompt formatter, an output adapter, and a + * per-task validator constructor — all of the pieces `runLoop` needs to + * drive a topology around `@tangle-network/agent-runtime/loops`. + */ + +export type { + KnowledgeItem, + KnowledgeUpdate, + MultiHarnessResearcherFanoutOptions, + ResearcherProfileOptions, + ResearchOutput, + ResearchSource, + ResearchTask, +} from './researcher' +export { + createResearcherValidator, + multiHarnessResearcherFanout, + researcherProfile, +} from './researcher' diff --git a/src/profiles/researcher.ts b/src/profiles/researcher.ts new file mode 100644 index 0000000..045a82e --- /dev/null +++ b/src/profiles/researcher.ts @@ -0,0 +1,585 @@ +/** + * @experimental + * + * `researcherProfile` — opinionated preset for source-grounded research + * tasks. The agent is told to: + * - bound its work to a single `knowledgeNamespace` + * - emit `items[]` carrying provenance + confidence + * - emit `citations[]` linking quotes back to source urls + * - emit `proposedWrites[]` — never call materialize itself + * - describe `gaps` it could not answer + * + * The profile is stateless and agent-agnostic. `harness` selects the + * sandbox-SDK backend. For heterogeneous fanout, use + * `multiHarnessResearcherFanout`. + * + * Propose-don't-apply: the profile NEVER writes to the knowledge base. + * It produces `proposedWrites: KnowledgeUpdate[]` in the output. 
/**
 * (File header, continued.) The caller (gtm-agent, journey-eval, user)
 * decides whether to feed the proposed updates through
 * `applyKnowledgeWriteBlocks` / a KbStore put.
 *
 * Namespace isolation: every `KnowledgeItem` + `KnowledgeUpdate` in the
 * output carries `namespace`. When the validator is built with a task, it
 * hard-fails on any item that touches a namespace other than
 * `task.knowledgeNamespace`.
 */

import {
  type AgentProfile,
  type AgentRunSpec,
  createFanoutVoteDriver,
  type DefaultVerdict,
  type Driver,
  type OutputAdapter,
  type SandboxEvent,
  type Validator,
} from '@tangle-network/agent-runtime/loops'

/**
 * Source families a task may list as preferred.
 *
 * @experimental
 */
export type ResearchSource = 'web' | 'corpus' | 'twitter' | 'github' | 'docs'

/**
 * Input describing one research job.
 *
 * @experimental
 */
export interface ResearchTask {
  /** The research question to answer. */
  question: string
  /** Bound: e.g. "audience for cpg-founder ICP". */
  scope?: string
  /**
   * Multi-tenant scope (customer-id, workspace-id). Enforced by the
   * validator whenever its namespace check is enabled.
   */
  knowledgeNamespace: string
  /** Preferred sources; rendered into the prompt. */
  sources?: ResearchSource[]
  /** Either bound may be omitted; an omitted bound is treated as open-ended. */
  recencyWindow?: { since?: Date; until?: Date }
  /** Rendered into the prompt as the item cap. */
  maxItems?: number
  /**
   * Per-item minimum confidence in [0, 1]. Rendered into the prompt.
   * NOTE(review): the validator defined in this file does not read this
   * field — confirm whether enforcement is expected elsewhere.
   */
  minConfidence?: number
}

/**
 * Knowledge item emitted by the researcher.
 *
 * Profile-local type. When agent-knowledge promotes `KnowledgeClaim` →
 * top-level `KnowledgeItem` substrate-wide, these fields collapse 1:1.
 *
 * @experimental
 */
export interface KnowledgeItem {
  id: string
  /** Multi-tenant scope. MUST equal `task.knowledgeNamespace`. */
  namespace: string
  /** The factual claim, in the researcher's words. */
  claim: string
  /** Provenance — at least one entry required. */
  evidence: Array<{ source: string; quote?: string; url?: string; capturedAt: number }>
  /** Researcher's self-reported confidence in [0, 1]. */
  confidence: number
  /** Prior item ids this supersedes (chain). */
  supersedes?: string[]
  /** Set if the agent is retracting an earlier item. Unix ms. */
  retractedAt?: number
  authoredBy: { kind: 'human' | 'agent'; id: string }
}

/**
 * A proposed write to the knowledge base. The profile does NOT apply
 * these — the caller decides.
 *
 * @experimental
 */
export type KnowledgeUpdate =
  | { kind: 'insert'; namespace: string; item: KnowledgeItem }
  | { kind: 'supersede'; namespace: string; previousId: string; item: KnowledgeItem }
  | { kind: 'retract'; namespace: string; itemId: string; reason: string }

/**
 * Researcher output. Required fields are typed; optional fields preserve
 * the agent's free-form intelligence (`notes`, `raw`). The validator
 * enforces the typed minimum.
 *
 * @experimental
 */
export interface ResearchOutput {
  items: KnowledgeItem[]
  citations: Array<{ url: string; quote: string; confidence: number }>
  proposedWrites: KnowledgeUpdate[]
  gaps?: string[]
  notes?: string
  /** Anything the agent emitted beyond the typed fields. */
  raw?: unknown
}

/** @experimental */
export interface ResearcherProfileOptions {
  /** Sandbox-SDK backend.type. Default `'opencode/zai-coding-plan/glm-5.1'`. */
  harness?: string
  /** Default model id passed in `AgentProfile.model.default`. */
  model?: string
  /** Custom system prompt replacement. Default = built-in researcher preset. */
  systemPrompt?: string
  /** Stable name for `AgentRunSpec.name`. Default = `researcher-${harness}`. */
  name?: string
  /**
   * Default 0.7. Minimum (citations with quote) / items ratio for
   * `valid=true`; an output below this floor is marked invalid.
   */
  citationDensityMin?: number
}

const DEFAULT_HARNESS = 'opencode/zai-coding-plan/glm-5.1'
const DEFAULT_CITATION_DENSITY_MIN = 0.7

/**
 * Assemble the researcher preset: the agent profile, the prompt
 * formatter, the output adapter, a validator and a ready-made
 * `AgentRunSpec`.
 *
 * When `options.task` is omitted the validator is built against a
 * placeholder task with `namespaceCheck: false`, i.e. namespace isolation
 * is NOT enforced. Pass the task (or build a validator with
 * `createResearcherValidator`) when that invariant matters.
 *
 * @param options Preset overrides plus an optional task for the validator.
 * @returns The five pieces listed above; `taskToPrompt` and
 *   `agentRunSpec.taskToPrompt` are the same function.
 *
 * @experimental
 */
export function researcherProfile(
  options: ResearcherProfileOptions & { task?: ResearchTask } = {},
): {
  profile: AgentProfile
  taskToPrompt: (task: ResearchTask) => string
  output: OutputAdapter
  validator: Validator
  agentRunSpec: AgentRunSpec
} {
  const { task, model } = options
  const backend = options.harness ?? DEFAULT_HARNESS
  const specName = options.name ?? `researcher-${backend}`
  const densityFloor = options.citationDensityMin ?? DEFAULT_CITATION_DENSITY_MIN

  const profile: AgentProfile = {
    name: specName,
    description: "Source-grounded research agent. Propose-don't-apply.",
    prompt: { systemPrompt: options.systemPrompt ?? DEFAULT_RESEARCHER_SYSTEM_PROMPT },
    model: model ? { default: model } : undefined,
    tools: { web_search: true, fs: true, shell: true },
    metadata: { backendType: backend, role: 'researcher' },
  }

  let validator: Validator
  if (task) {
    validator = createResearcherValidator(task, { citationDensityMin: densityFloor })
  } else {
    // No task to compare against: keep every other check, skip the namespace one.
    validator = createResearcherValidator(
      { question: '', knowledgeNamespace: '' },
      { citationDensityMin: densityFloor, namespaceCheck: false },
    )
  }

  return {
    profile,
    taskToPrompt: formatResearcherPrompt,
    output: { parse: parseResearcherEvents },
    validator,
    agentRunSpec: { name: specName, profile, taskToPrompt: formatResearcherPrompt },
  }
}
/** @experimental */
export interface MultiHarnessResearcherFanoutOptions {
  /** Backend.type identifiers, one per parallel agent. */
  harnesses?: string[]
  /** Optional per-harness model override. Indexed parallel to `harnesses`. */
  models?: (string | undefined)[]
  /** Default citation density floor for the shared validator. */
  citationDensityMin?: number
  /**
   * Optional task. When omitted, the shared validator is built without a
   * namespace check (see `researcherProfile`).
   */
  task?: ResearchTask
}

/**
 * Build a fanout topology over multiple harnesses: one `AgentRunSpec` per
 * harness, one shared output adapter + validator, and a fanout-vote driver
 * sized to the number of harnesses.
 *
 * NOTE(review): the original comment states that the kernel round-robins
 * `agentRuns` across the N iterations and that the driver picks the
 * highest-scoring valid output — that behaviour lives in
 * `@tangle-network/agent-runtime` and is not verifiable from this file.
 *
 * @param options Harness list (defaults to three built-in backends when
 *   absent or empty), per-harness models, density floor and optional task.
 *
 * @experimental
 */
export function multiHarnessResearcherFanout(options: MultiHarnessResearcherFanoutOptions = {}): {
  agentRuns: AgentRunSpec[]
  output: OutputAdapter
  validator: Validator
  driver: Driver
} {
  const harnesses = options.harnesses?.length
    ? options.harnesses
    : ['opencode/zai-coding-plan/glm-5.1', 'claude-code', 'codex']
  const agentRuns = harnesses.map(
    (harness, index) => researcherProfile({ harness, model: options.models?.[index] }).agentRunSpec,
  )
  // The adapter and validator do not depend on the harness, so build them once.
  const { output, validator } = researcherProfile({
    citationDensityMin: options.citationDensityMin,
    task: options.task,
  })
  return { agentRuns, output, validator, driver: createFanoutVoteDriver({ n: harnesses.length }) }
}

/**
 * True when a proposed write reaches outside `expected`: either the write
 * itself is labelled with another namespace, or the item it carries
 * (insert / supersede) is.
 */
function touchesForeignNamespace(write: KnowledgeUpdate, expected: string): boolean {
  if (write.namespace !== expected) return true
  const nested = 'item' in write ? write.item : undefined
  return nested != null && nested.namespace !== expected
}

/**
 * Build a validator that closes over a specific `ResearchTask`'s constraints.
 *
 * Checks in order:
 *   1. Items must be non-empty.
 *   2. Every item carries `evidence.length >= 1`.
 *   3. Every item, every proposedWrite AND every item nested inside an
 *      insert/supersede write is scoped to `task.knowledgeNamespace`
 *      (skipped when `config.namespaceCheck` is `false`).
 *   4. Citation density (citations with quote / items) >= floor.
 *
 * Aggregate score:
 *   0.4 · citation_density
 *   + 0.2 · source_diversity   (distinct sources / max(items, 1))
 *   + 0.2 · recency_match      (fraction of evidence within `recencyWindow`)
 *   + 0.2 · (1 − gaps/maxGaps), maxGaps = max(items, 1)
 *
 * `task.minConfidence` and `task.maxItems` are not consulted here.
 *
 * @param task Task whose namespace and recency window drive the checks.
 * @param config `citationDensityMin` (default 0.7) and `namespaceCheck`
 *   (default `true`).
 * @returns A validator whose `validate` resolves to a verdict carrying
 *   `valid`, the aggregate `score`, per-check `scores`, and `notes` when
 *   any check failed.
 *
 * @experimental
 */
export function createResearcherValidator(
  task: ResearchTask,
  config: { citationDensityMin?: number; namespaceCheck?: boolean } = {},
): Validator {
  const citationDensityMin = config.citationDensityMin ?? DEFAULT_CITATION_DENSITY_MIN
  const namespaceCheck = config.namespaceCheck ?? true
  return {
    async validate(output: ResearchOutput) {
      const notes: string[] = []
      const scores: Record<string, number> = {}
      let pass = true
      // A malformed (non-array) `items` is treated as "no items" instead of throwing.
      const items: KnowledgeItem[] = Array.isArray(output.items) ? output.items : []

      if (items.length === 0) {
        pass = false
        notes.push('no items')
        scores.items = 0
      } else {
        scores.items = 1
      }

      const missingEvidence = items.filter(
        (item) => !Array.isArray(item.evidence) || item.evidence.length === 0,
      )
      if (missingEvidence.length > 0) {
        pass = false
        notes.push(`${missingEvidence.length} item(s) without evidence`)
        scores.provenance = 0
      } else {
        scores.provenance = 1
      }

      if (namespaceCheck) {
        const expected = task.knowledgeNamespace
        const foreignItems = items.filter((item) => item.namespace !== expected)
        const foreignWrites = (output.proposedWrites ?? []).filter((write) =>
          touchesForeignNamespace(write, expected),
        )
        if (foreignItems.length > 0 || foreignWrites.length > 0) {
          pass = false
          notes.push(
            `namespace violation: ${foreignItems.length} item(s) + ${foreignWrites.length} write(s) ` +
              `outside ${expected}`,
          )
          scores.namespace = 0
        } else {
          scores.namespace = 1
        }
      }

      // Denominator is clamped to 1 so an empty item set never divides by zero.
      const itemCount = Math.max(items.length, 1)
      const citationsWithQuote = (output.citations ?? []).filter(
        (citation) => typeof citation.quote === 'string' && citation.quote.length > 0,
      ).length
      const citationDensity = Math.min(1, citationsWithQuote / itemCount)
      if (citationDensity < citationDensityMin) {
        pass = false
        notes.push(
          `citation density ${citationDensity.toFixed(2)} below floor ${citationDensityMin.toFixed(2)}`,
        )
      }
      scores.citation_density = citationDensity

      const sourceSet = new Set<string>()
      for (const item of items) {
        for (const evidence of item.evidence ?? []) {
          if (evidence.source) sourceSet.add(evidence.source)
        }
      }
      scores.source_diversity = Math.min(1, sourceSet.size / itemCount)

      scores.recency_match = recencyMatchScore(items, task.recencyWindow)

      const gapCount = output.gaps?.length ?? 0
      scores.gap_coverage = Math.max(0, 1 - gapCount / itemCount)

      const score =
        0.4 * scores.citation_density +
        0.2 * scores.source_diversity +
        0.2 * scores.recency_match +
        0.2 * scores.gap_coverage

      const verdict: DefaultVerdict = {
        valid: pass,
        score: Number.isFinite(score) ? score : 0,
        scores,
      }
      if (notes.length > 0) verdict.notes = notes.join('; ')
      return verdict
    },
  }
}

/**
 * Fraction of evidence entries whose `capturedAt` falls inside `window`
 * (bounds inclusive; a missing bound is open-ended).
 *
 * Returns 1 when no window (or a window with neither bound) is given, and
 * 0 when a window is given but there are no items or no evidence entry
 * with a numeric `capturedAt`.
 */
function recencyMatchScore(items: KnowledgeItem[], window: ResearchTask['recencyWindow']): number {
  if (!window || (window.since === undefined && window.until === undefined)) return 1
  if (items.length === 0) return 0
  const sinceMs = window.since?.getTime() ?? Number.NEGATIVE_INFINITY
  const untilMs = window.until?.getTime() ?? Number.POSITIVE_INFINITY
  let hits = 0
  let total = 0
  for (const item of items) {
    for (const evidence of item.evidence ?? []) {
      if (typeof evidence.capturedAt !== 'number') continue
      total += 1
      if (evidence.capturedAt >= sinceMs && evidence.capturedAt <= untilMs) hits += 1
    }
  }
  return total === 0 ? 0 : hits / total
}
/**
 * Built-in system prompt for the researcher preset: the five hard rules
 * (namespace, provenance, propose-don't-apply, honest confidence, gaps)
 * followed by the JSON shape of the final message.
 *
 * NOTE(review): the literals below are reproduced as they appear in the
 * reviewed diff; runs of leading spaces inside them may have been
 * collapsed upstream — confirm against the committed file.
 */
const DEFAULT_RESEARCHER_SYSTEM_PROMPT = [
  'You are a research agent. Your job is to answer a research question with',
  'source-grounded knowledge items that the caller will choose whether to',
  'persist to a multi-tenant knowledge base.',
  '',
  'Hard rules:',
  " 1. Every item you emit MUST carry the task's knowledgeNamespace exactly.",
  ' Never write to a different namespace.',
  ' 2. Every item MUST carry at least one evidence entry with a source.',
  ' A quote + url is strongly preferred; capturedAt is unix ms.',
  ' 3. You propose writes — you do NOT apply them. The caller decides.',
  ' 4. Self-report confidence honestly in [0, 1]. Do not inflate.',
  ' 5. List what you could not answer in `gaps`. Better to admit a gap',
  ' than fabricate.',
  '',
  'When you finish, emit a single final structured message of the shape:',
  ' ```json',
  ' { "items": [{ "id": "...", "namespace": "...", "claim": "...",',
  ' "evidence": [{ "source": "...", "quote": "...",',
  ' "url": "...", "capturedAt": 0 }],',
  ' "confidence": 0.0,',
  ' "authoredBy": { "kind": "agent", "id": "..." } }],',
  ' "citations": [{ "url": "...", "quote": "...", "confidence": 0.0 }],',
  ' "proposedWrites": [{ "kind": "insert", "namespace": "...",',
  ' "item": { /* same shape as items[] */ } }],',
  ' "gaps": ["..."],',
  ' "notes": "free-form commentary" }',
  ' ```',
].join('\n')

/**
 * Render a `ResearchTask` as the user prompt: one labelled line per task
 * field (with a placeholder for each omitted optional field), a blank
 * line, then the closing instructions.
 *
 * @param task The task to describe to the agent.
 * @returns The newline-joined prompt text.
 */
function formatResearcherPrompt(task: ResearchTask): string {
  const preferredSources = task.sources?.length ? task.sources.join(', ') : '(no preference)'
  const lines = [
    `Question: ${task.question}`,
    `Knowledge namespace (DO NOT cross): ${task.knowledgeNamespace}`,
    `Scope: ${task.scope ?? '(unspecified)'}`,
    `Preferred sources: ${preferredSources}`,
    `Recency window: ${formatRecencyWindow(task.recencyWindow)}`,
    `Max items: ${task.maxItems ?? '(no cap)'}`,
    `Per-item minimum confidence: ${task.minConfidence ?? '(no floor)'}`,
    '',
    'Produce knowledge items with provenance + citations + proposed writes.',
    'List gaps for anything you could not answer. Emit the final JSON',
    'result block exactly as instructed.',
  ]
  return lines.join('\n')
}

/**
 * Render the recency window as `<since> .. <until>` using ISO-8601
 * timestamps, with `-∞` / `now` standing in for a missing bound.
 *
 * Returns `(none)` when there is no window OR when the window carries
 * neither bound, so a bound-less `{}` is described to the agent as no
 * constraint rather than as an explicit unbounded range.
 *
 * @param window The task's optional recency window.
 */
function formatRecencyWindow(window: ResearchTask['recencyWindow']): string {
  const since = window?.since
  const until = window?.until
  if (since == null && until == null) return '(none)'
  const from = since ? since.toISOString() : '-∞'
  const to = until ? until.toISOString() : 'now'
  return `${from} .. ${to}`
}
/** Event types whose payload is treated as a structured final result. */
const TERMINAL_EVENT_TYPES: ReadonlySet<string> = new Set(['result', 'final', 'research.result'])

/** A fenced block, optionally tagged `json`; group 1 is the body. */
const FENCED_BLOCK_PATTERN = /```(?:json)?\s*([\s\S]*?)```/gi

/**
 * Turn a sandbox event stream into a `ResearchOutput`.
 *
 * Three passes, first hit wins:
 *   1. Newest-first over `result` / `final` / `research.result` events,
 *      coercing `data.result ?? data.output ?? data`.
 *   2. Newest-first over each event's `data.text` / `data.delta`, looking
 *      for a fenced JSON block inside that single chunk.
 *   3. All text chunks concatenated in stream order, so a fenced block
 *      that was streamed across several delta events is still found.
 *
 * Falls back to an empty output when nothing coerces.
 *
 * @param events Events emitted by the sandbox run, oldest first.
 */
function parseResearcherEvents(events: SandboxEvent[]): ResearchOutput {
  for (let i = events.length - 1; i >= 0; i -= 1) {
    const event = events[i]
    if (!event) continue
    if (!TERMINAL_EVENT_TYPES.has(String(event.type ?? ''))) continue
    const data = isRecord(event.data) ? event.data : {}
    const direct = coerceResearchOutput(data.result ?? data.output ?? data)
    if (direct) return direct
  }

  const chunks: string[] = []
  for (const event of events) {
    const text = eventText(event)
    if (text !== undefined) chunks.push(text)
  }
  for (const chunk of [...chunks].reverse()) {
    const found = researchOutputFromText(chunk)
    if (found) return found
  }
  // With a single chunk the joined text equals that chunk, already tried above.
  if (chunks.length > 1) {
    const stitched = researchOutputFromText(chunks.join(''))
    if (stitched) return stitched
  }
  return { items: [], citations: [], proposedWrites: [] }
}

/** Text carried by an event: `data.text`, else `data.delta`, else nothing. */
function eventText(event: SandboxEvent | undefined): string | undefined {
  if (!event) return undefined
  const data = isRecord(event.data) ? event.data : {}
  return pickString(data.text) ?? pickString(data.delta)
}

/** Fenced JSON in `text`, coerced to a `ResearchOutput` when it fits. */
function researchOutputFromText(text: string): ResearchOutput | undefined {
  const fenced = extractFencedJson(text)
  return fenced ? coerceResearchOutput(fenced) : undefined
}

/** Type guard: a non-null, non-array object. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return value !== null && typeof value === 'object' && !Array.isArray(value)
}

/** Returns `value` when it is a non-empty string, otherwise `undefined`. */
function pickString(value: unknown): string | undefined {
  return typeof value === 'string' && value.length > 0 ? value : undefined
}

/**
 * Parse the LAST fenced block in `text` whose body is valid JSON.
 *
 * Blocks that are empty or fail to parse (e.g. a code sample earlier in
 * the message) are skipped rather than ending the search.
 *
 * @returns The parsed value, or `undefined` when no fenced block parses.
 */
function extractFencedJson(text: string): unknown | undefined {
  let latest: unknown
  for (const match of text.matchAll(FENCED_BLOCK_PATTERN)) {
    const body = (match[1] ?? '').trim()
    if (!body) continue
    try {
      latest = JSON.parse(body)
    } catch {
      // Not JSON — keep whatever earlier block parsed and move on.
    }
  }
  return latest
}

/**
 * Coerce an arbitrary value into a `ResearchOutput`.
 *
 * Returns `undefined` for non-objects and for objects in which none of
 * `items`, `citations`, `proposedWrites` is an array — those signal "no
 * result block", not "a valid empty result". Keys outside the typed
 * surface are collected into `raw` (only set when at least one exists).
 */
function coerceResearchOutput(value: unknown): ResearchOutput | undefined {
  if (!isRecord(value)) return undefined
  const items = coerceItems(value.items)
  const citations = coerceCitations(value.citations)
  const proposedWrites = coerceProposedWrites(value.proposedWrites)
  if (items === undefined && citations === undefined && proposedWrites === undefined) {
    return undefined
  }

  const output: ResearchOutput = {
    items: items ?? [],
    citations: citations ?? [],
    proposedWrites: proposedWrites ?? [],
  }
  if (Array.isArray(value.gaps)) {
    output.gaps = value.gaps.filter((entry): entry is string => typeof entry === 'string')
  }
  const notes = pickString(value.notes)
  if (notes) output.notes = notes

  const known = new Set(['items', 'citations', 'proposedWrites', 'gaps', 'notes'])
  const extraEntries = Object.entries(value).filter(([key]) => !known.has(key))
  if (extraEntries.length > 0) {
    const extras: Record<string, unknown> = {}
    for (const [key, val] of extraEntries) extras[key] = val
    output.raw = extras
  }
  return output
}

/**
 * Coerce one entry into a `KnowledgeItem`, or `undefined` when `id`,
 * `namespace`, `claim` or `authoredBy` is missing. Evidence may come back
 * empty (the validator reports that); confidence is clamped to [0, 1].
 */
function coerceItem(entry: unknown): KnowledgeItem | undefined {
  if (!isRecord(entry)) return undefined
  const id = pickString(entry.id)
  const namespace = pickString(entry.namespace)
  const claim = pickString(entry.claim)
  const authoredBy = coerceAuthoredBy(entry.authoredBy)
  if (!id || !namespace || !claim || !authoredBy) return undefined

  const item: KnowledgeItem = {
    id,
    namespace,
    claim,
    evidence: coerceEvidence(entry.evidence),
    confidence: clamp01(toFiniteNumber(entry.confidence)),
    authoredBy,
  }
  if (Array.isArray(entry.supersedes)) {
    item.supersedes = entry.supersedes.filter((s): s is string => typeof s === 'string')
  }
  const retractedAt = toFiniteNumber(entry.retractedAt)
  if (retractedAt > 0) item.retractedAt = retractedAt
  return item
}

/** Coerce an array of items, dropping malformed entries; `undefined` for non-arrays. */
function coerceItems(value: unknown): KnowledgeItem[] | undefined {
  if (!Array.isArray(value)) return undefined
  const out: KnowledgeItem[] = []
  for (const entry of value) {
    const item = coerceItem(entry)
    if (item) out.push(item)
  }
  return out
}

/**
 * Coerce evidence entries. Entries without a `source` are dropped; a
 * missing or non-finite `capturedAt` becomes 0. Non-arrays yield `[]`.
 */
function coerceEvidence(value: unknown): KnowledgeItem['evidence'] {
  if (!Array.isArray(value)) return []
  const out: KnowledgeItem['evidence'] = []
  for (const entry of value) {
    if (!isRecord(entry)) continue
    const source = pickString(entry.source)
    if (!source) continue
    const evidence: KnowledgeItem['evidence'][number] = {
      source,
      capturedAt: toFiniteNumber(entry.capturedAt),
    }
    const quote = pickString(entry.quote)
    if (quote) evidence.quote = quote
    const url = pickString(entry.url)
    if (url) evidence.url = url
    out.push(evidence)
  }
  return out
}

/** Coerce `authoredBy`; requires `kind` of `'human' | 'agent'` and a non-empty `id`. */
function coerceAuthoredBy(value: unknown): KnowledgeItem['authoredBy'] | undefined {
  if (!isRecord(value)) return undefined
  const id = pickString(value.id)
  if (!id) return undefined
  if (value.kind !== 'human' && value.kind !== 'agent') return undefined
  return { kind: value.kind, id }
}

/** Coerce citations, keeping only entries with both `url` and `quote`. */
function coerceCitations(value: unknown): ResearchOutput['citations'] | undefined {
  if (!Array.isArray(value)) return undefined
  const out: ResearchOutput['citations'] = []
  for (const entry of value) {
    if (!isRecord(entry)) continue
    const url = pickString(entry.url)
    const quote = pickString(entry.quote)
    if (!url || !quote) continue
    out.push({ url, quote, confidence: clamp01(toFiniteNumber(entry.confidence)) })
  }
  return out
}

/**
 * Coerce proposed writes. Entries without a `namespace`, with an unknown
 * `kind`, or missing the fields their kind requires are dropped.
 */
function coerceProposedWrites(value: unknown): KnowledgeUpdate[] | undefined {
  if (!Array.isArray(value)) return undefined
  const out: KnowledgeUpdate[] = []
  for (const entry of value) {
    if (!isRecord(entry)) continue
    const namespace = pickString(entry.namespace)
    if (!namespace) continue
    switch (entry.kind) {
      case 'insert': {
        const item = coerceItem(entry.item)
        if (item) out.push({ kind: 'insert', namespace, item })
        break
      }
      case 'supersede': {
        const previousId = pickString(entry.previousId)
        const item = coerceItem(entry.item)
        if (previousId && item) out.push({ kind: 'supersede', namespace, previousId, item })
        break
      }
      case 'retract': {
        const itemId = pickString(entry.itemId)
        const reason = pickString(entry.reason)
        if (itemId && reason) out.push({ kind: 'retract', namespace, itemId, reason })
        break
      }
      default:
        // Unknown kinds are ignored.
        break
    }
  }
  return out
}

/** Returns `value` when it is a finite number, otherwise 0. */
function toFiniteNumber(value: unknown): number {
  return typeof value === 'number' && Number.isFinite(value) ? value : 0
}

/** Clamp to [0, 1]; non-finite input becomes 0. */
function clamp01(value: number): number {
  if (!Number.isFinite(value)) return 0
  return Math.min(1, Math.max(0, value))
}
[ + { + url: 'https://x.com/example/status/1', + quote: 'Engagement rate 4.2x', + confidence: 0.82, + }, + ] + : [], + proposedWrites: [{ kind: 'insert', namespace, item }], + } +} + +function stubClient(perCall: Array<() => ResearchOutput | { __error: string }>): { + client: { create(opts?: CreateSandboxOptions): Promise } + callCount: () => number +} { + let i = 0 + return { + callCount: () => i, + client: { + async create() { + const idx = i + i += 1 + const factory = perCall[idx] ?? perCall[perCall.length - 1] + return { + async *streamPrompt() { + if (!factory) { + yield { type: 'noise', data: {} } satisfies SandboxEvent + return + } + const value = factory() + if ('__error' in value) { + throw new Error(value.__error) + } + yield { + type: 'result', + data: { result: value }, + } satisfies SandboxEvent + }, + } as unknown as SandboxInstance + }, + }, + } +} + +const task: ResearchTask = { + question: 'What does cpg-founder ICP engage with on Twitter?', + knowledgeNamespace: 'cust_42', + sources: ['twitter', 'web'], + maxItems: 10, + minConfidence: 0.6, +} + +describe('researcherProfile end-to-end through runLoop', () => { + it('drives a single AgentRunSpec via fanout-vote and selects a clean winner', async () => { + const { agentRuns, output, validator, driver } = multiHarnessResearcherFanout({ + harnesses: ['researcher-a', 'researcher-b', 'researcher-c'], + task, + }) + const { client } = stubClient([ + () => researchPayload({ quality: 1 }), + () => researchPayload({ quality: 1 }), + () => researchPayload({ quality: 1 }), + ]) + + const result = await runLoop({ + driver, + agentRuns, + output, + validator, + task, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('pick-winner') + expect(result.iterations).toHaveLength(3) + expect(result.winner).toBeDefined() + expect(result.winner?.output.items).toHaveLength(1) + expect(result.winner?.output.proposedWrites).toHaveLength(1) + expect(result.winner?.verdict?.valid).toBe(true) + }) + + 
it('fails-out when every harness emits a cross-namespace leak', async () => { + const { agentRuns, output, validator, driver } = multiHarnessResearcherFanout({ + harnesses: ['researcher-a', 'researcher-b'], + task, + }) + const { client } = stubClient([ + () => researchPayload({ namespace: 'cust_99' }), + () => researchPayload({ namespace: 'cust_77' }), + ]) + + const result = await runLoop({ + driver, + agentRuns, + output, + validator, + task, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('fail') + expect(result.iterations).toHaveLength(2) + for (const iter of result.iterations) { + expect(iter.verdict?.valid).toBe(false) + expect(iter.verdict?.notes).toMatch(/namespace violation/) + } + // The kernel surfaces a structural top-of-attempts even on `fail`. + // The contract is `decision === 'fail'` + `winner.verdict.valid === false`; + // never a winner with `valid === true` when every output leaked. + if (result.winner) { + expect(result.winner.verdict?.valid).toBe(false) + } + }) + + it('selects the higher-quality output across heterogeneous harnesses', async () => { + const { agentRuns, output, validator, driver } = multiHarnessResearcherFanout({ + harnesses: ['low-quality', 'high-quality'], + task, + }) + const { client } = stubClient([ + () => researchPayload({ quality: 0.1 }), // no quoted citation → density floor fails + () => researchPayload({ quality: 1 }), + ]) + + const result = await runLoop({ + driver, + agentRuns, + output, + validator, + task, + ctx: { sandboxClient: client }, + }) + + expect(result.decision).toBe('pick-winner') + expect(result.winner?.iterationIndex).toBe(1) + expect(result.winner?.agentRunName).toBe('researcher-high-quality') + }) + + it('passes proposedWrites through unchanged — caller is responsible for materialize', async () => { + const single = researcherProfile({ harness: 'researcher-stub' }) + const driver = multiHarnessResearcherFanout({ harnesses: ['researcher-stub'], task }).driver + const { 
client } = stubClient([() => researchPayload({ quality: 1 })]) + + const result = await runLoop({ + driver, + agentRuns: [single.agentRunSpec], + output: single.output, + validator: multiHarnessResearcherFanout({ harnesses: ['researcher-stub'], task }).validator, + task, + ctx: { sandboxClient: client }, + }) + + // The winner carries proposedWrites that the caller must explicitly + // route to applyKnowledgeWriteBlocks (or a KbStore put). The profile + // itself never wrote anything to disk — that's the invariant. + const writes = result.winner?.output.proposedWrites ?? [] + expect(writes).toHaveLength(1) + expect(writes[0]?.kind).toBe('insert') + expect(writes[0]?.namespace).toBe('cust_42') + }) +}) diff --git a/tests/profiles/researcher.test.ts b/tests/profiles/researcher.test.ts new file mode 100644 index 0000000..cc5e4e2 --- /dev/null +++ b/tests/profiles/researcher.test.ts @@ -0,0 +1,424 @@ +import type { SandboxEvent } from '@tangle-network/sandbox' +import { describe, expect, it } from 'vitest' +import { + createResearcherValidator, + type KnowledgeItem, + multiHarnessResearcherFanout, + type ResearchOutput, + type ResearchTask, + researcherProfile, +} from '../../src/profiles/researcher' + +function ageMs(days: number): number { + return Date.now() - days * 24 * 60 * 60 * 1000 +} + +function item(overrides: Partial = {}): KnowledgeItem { + return { + id: overrides.id ?? 'item-1', + namespace: overrides.namespace ?? 'cust_42', + claim: overrides.claim ?? 'cpg-founder ICP engages with founder-pov threads', + evidence: overrides.evidence ?? [ + { + source: 'twitter', + quote: 'Engagement rate 4.2x', + url: 'https://x.com/example/status/1', + capturedAt: ageMs(7), + }, + ], + confidence: overrides.confidence ?? 0.82, + authoredBy: overrides.authoredBy ?? { kind: 'agent', id: 'researcher-glm' }, + ...overrides, + } +} + +function output(overrides: Partial = {}): ResearchOutput { + const items = overrides.items ?? 
[item()] + return { + items, + citations: overrides.citations ?? [ + { url: 'https://x.com/example/status/1', quote: 'Engagement rate 4.2x', confidence: 0.82 }, + ], + proposedWrites: + overrides.proposedWrites ?? + items.map((i) => ({ + kind: 'insert' as const, + namespace: i.namespace, + item: i, + })), + gaps: overrides.gaps, + notes: overrides.notes, + raw: overrides.raw, + } +} + +const task: ResearchTask = { + question: 'What does cpg-founder ICP engage with on Twitter?', + knowledgeNamespace: 'cust_42', + sources: ['twitter', 'web'], + maxItems: 10, + minConfidence: 0.6, +} + +describe('researcherProfile()', () => { + it('builds an AgentProfile + AgentRunSpec with role=researcher metadata', () => { + const { profile, agentRunSpec } = researcherProfile({ + harness: 'opencode/zai-coding-plan/glm-5.1', + }) + expect(profile.name).toBe('researcher-opencode/zai-coding-plan/glm-5.1') + expect(profile.metadata?.role).toBe('researcher') + expect(profile.metadata?.backendType).toBe('opencode/zai-coding-plan/glm-5.1') + expect(profile.tools).toMatchObject({ web_search: true, fs: true }) + expect(agentRunSpec.name).toBe(profile.name) + expect(typeof agentRunSpec.taskToPrompt).toBe('function') + }) + + it('formats the prompt with namespace + scope + recency window', () => { + const { taskToPrompt } = researcherProfile() + const prompt = taskToPrompt({ + question: 'q?', + knowledgeNamespace: 'ws_1', + scope: 'b2b SaaS', + sources: ['web', 'twitter'], + recencyWindow: { since: new Date('2026-01-01'), until: new Date('2026-05-01') }, + maxItems: 5, + minConfidence: 0.7, + }) + expect(prompt).toContain('Knowledge namespace (DO NOT cross): ws_1') + expect(prompt).toContain('Scope: b2b SaaS') + expect(prompt).toContain('Preferred sources: web, twitter') + expect(prompt).toContain('Recency window: 2026-01-01T00:00:00.000Z .. 
2026-05-01T00:00:00.000Z') + expect(prompt).toContain('Max items: 5') + }) +}) + +describe('validator scoring', () => { + it('passes a fully-grounded, in-namespace output', async () => { + const validator = createResearcherValidator(task) + const verdict = await validator.validate(output(), { + iteration: 0, + signal: new AbortController().signal, + }) + expect(verdict.valid).toBe(true) + expect(verdict.score).toBeGreaterThan(0.5) + expect(verdict.scores?.citation_density).toBe(1) + expect(verdict.scores?.provenance).toBe(1) + expect(verdict.scores?.namespace).toBe(1) + }) + + it('hard-fails when items.length === 0', async () => { + const validator = createResearcherValidator(task) + const verdict = await validator.validate( + output({ items: [], proposedWrites: [], citations: [] }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(verdict.valid).toBe(false) + expect(verdict.notes).toContain('no items') + }) + + it('hard-fails when any item lacks evidence (provenance missing)', async () => { + const validator = createResearcherValidator(task) + const verdict = await validator.validate(output({ items: [item({ evidence: [] })] }), { + iteration: 0, + signal: new AbortController().signal, + }) + expect(verdict.valid).toBe(false) + expect(verdict.notes).toContain('without evidence') + expect(verdict.scores?.provenance).toBe(0) + }) + + it('hard-fails on cross-namespace item leak', async () => { + const validator = createResearcherValidator(task) + const verdict = await validator.validate( + output({ items: [item({ namespace: 'cust_99' })], proposedWrites: [] }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(verdict.valid).toBe(false) + expect(verdict.notes).toMatch(/namespace violation/) + expect(verdict.scores?.namespace).toBe(0) + }) + + it('hard-fails on cross-namespace proposedWrite leak (even when items are clean)', async () => { + const validator = createResearcherValidator(task) + const cleanItem = item({ namespace: 
'cust_42' }) + const verdict = await validator.validate( + output({ + items: [cleanItem], + proposedWrites: [{ kind: 'insert', namespace: 'cust_99', item: cleanItem }], + }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(verdict.valid).toBe(false) + expect(verdict.notes).toMatch(/namespace violation/) + }) + + it('hard-fails when citation density falls below the floor', async () => { + const validator = createResearcherValidator(task, { citationDensityMin: 0.8 }) + const itemsArr = [ + item({ id: 'a' }), + item({ id: 'b' }), + item({ id: 'c' }), + item({ id: 'd' }), + item({ id: 'e' }), + ] + const verdict = await validator.validate( + output({ + items: itemsArr, + citations: [{ url: 'https://x.com/1', quote: 'q1', confidence: 0.8 }], + proposedWrites: itemsArr.map((i) => ({ + kind: 'insert' as const, + namespace: i.namespace, + item: i, + })), + }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(verdict.valid).toBe(false) + expect(verdict.notes).toMatch(/citation density 0\.20 below floor 0\.80/) + }) + + it('rewards source diversity in the aggregate score', async () => { + const validator = createResearcherValidator(task) + const single = await validator.validate( + output({ + items: [ + item({ id: 'a', evidence: [{ source: 'twitter', capturedAt: ageMs(1) }] }), + item({ id: 'b', evidence: [{ source: 'twitter', capturedAt: ageMs(2) }] }), + ], + citations: [ + { url: 'https://x.com/1', quote: 'q', confidence: 0.8 }, + { url: 'https://x.com/2', quote: 'q', confidence: 0.8 }, + ], + proposedWrites: [], + }), + { iteration: 0, signal: new AbortController().signal }, + ) + const diverse = await validator.validate( + output({ + items: [ + item({ id: 'a', evidence: [{ source: 'twitter', capturedAt: ageMs(1) }] }), + item({ id: 'b', evidence: [{ source: 'github', capturedAt: ageMs(2) }] }), + ], + citations: [ + { url: 'https://x.com/1', quote: 'q', confidence: 0.8 }, + { url: 'https://github.com/1', quote: 'q', 
confidence: 0.8 }, + ], + proposedWrites: [], + }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(diverse.scores?.source_diversity ?? 0).toBeGreaterThan( + single.scores?.source_diversity ?? 0, + ) + expect(diverse.score).toBeGreaterThan(single.score) + }) + + it('penalises recency mismatches when a recencyWindow is supplied', async () => { + const windowed: ResearchTask = { + ...task, + recencyWindow: { since: new Date(ageMs(5)) }, + } + const validator = createResearcherValidator(windowed) + const stale = await validator.validate( + output({ + items: [item({ evidence: [{ source: 'twitter', capturedAt: ageMs(100) }] })], + }), + { iteration: 0, signal: new AbortController().signal }, + ) + const fresh = await validator.validate( + output({ + items: [item({ evidence: [{ source: 'twitter', capturedAt: ageMs(1) }] })], + }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(stale.scores?.recency_match).toBe(0) + expect(fresh.scores?.recency_match).toBe(1) + }) + + it('penalises gaps in the score, never auto-fails on gaps alone', async () => { + const validator = createResearcherValidator(task) + const noGaps = await validator.validate(output(), { + iteration: 0, + signal: new AbortController().signal, + }) + const withGaps = await validator.validate( + output({ gaps: ['no data on Q4 2025 engagement window'] }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(withGaps.valid).toBe(true) + expect(withGaps.scores?.gap_coverage ?? 0).toBeLessThan(noGaps.scores?.gap_coverage ?? 
1) + }) + + it('namespaceCheck=false disables the namespace gate (for shared validators)', async () => { + const validator = createResearcherValidator(task, { namespaceCheck: false }) + const verdict = await validator.validate( + output({ items: [item({ namespace: 'cust_99' })], proposedWrites: [] }), + { iteration: 0, signal: new AbortController().signal }, + ) + expect(verdict.valid).toBe(true) + expect(verdict.scores?.namespace).toBeUndefined() + }) +}) + +describe('propose-without-materialize invariant', () => { + it('emits proposedWrites but never invokes any FS / KB primitive', async () => { + // The profile has no FS dependency by construction. We verify by + // checking that `researcherProfile()` returns the same shape on + // every call without touching disk — and by inspecting the static + // imports of the module. + const { profile, output: adapter, validator, taskToPrompt, agentRunSpec } = researcherProfile() + expect(profile).toBeDefined() + expect(adapter.parse([])).toEqual({ items: [], citations: [], proposedWrites: [] }) + expect(typeof validator.validate).toBe('function') + expect(typeof taskToPrompt).toBe('function') + expect(agentRunSpec.profile).toBe(profile) + // The caller — not the profile — applies updates. Sanity-check that + // the profile module does not import any FS / KB primitive that would + // let it materialize writes itself. + const modText = await import('node:fs/promises').then((fs) => + fs.readFile(new URL('../../src/profiles/researcher.ts', import.meta.url), 'utf8'), + ) + // Forbid imports from the materialize / FS surface. 
+ expect(modText).not.toMatch(/from ['"]\.\.\/proposals['"]/) + expect(modText).not.toMatch(/from ['"]\.\.\/store['"]/) + expect(modText).not.toMatch(/from ['"]\.\.\/kb-store['"]/) + expect(modText).not.toMatch(/from ['"]\.\.\/sources['"]/) + expect(modText).not.toMatch(/from ['"]node:fs(\/promises)?['"]/) + // Forbid actually calling the materialize / write APIs anywhere + // outside an `@example` or fenced-prose docblock. We accept the + // identifier inside source-level prose (audit context) but never as + // a callable identifier — a callable is `name(` with parens. + expect(modText).not.toMatch(/applyKnowledgeWriteBlocks\s*\(/) + expect(modText).not.toMatch(/writeFile\s*\(/) + expect(modText).not.toMatch(/addSourcePath\s*\(/) + expect(modText).not.toMatch(/addSourceText\s*\(/) + }) +}) + +describe('loose-output passthrough', () => { + it('parses a result event with the typed shape', () => { + const { output: adapter } = researcherProfile() + const items = [item()] + const events: SandboxEvent[] = [ + { + type: 'result', + data: { + result: { + items, + citations: [{ url: 'https://x.com/1', quote: 'q', confidence: 0.8 }], + proposedWrites: [{ kind: 'insert', namespace: 'cust_42', item: items[0] }], + gaps: ['no Q4 data'], + notes: 'inspected 4 sources', + }, + }, + }, + ] + const parsed = adapter.parse(events) + expect(parsed.items).toHaveLength(1) + expect(parsed.items[0]?.namespace).toBe('cust_42') + expect(parsed.citations).toHaveLength(1) + expect(parsed.proposedWrites).toHaveLength(1) + expect(parsed.gaps).toEqual(['no Q4 data']) + expect(parsed.notes).toBe('inspected 4 sources') + }) + + it('preserves agent extras under raw — never drops free-form intelligence', () => { + const { output: adapter } = researcherProfile() + const events: SandboxEvent[] = [ + { + type: 'final', + data: { + result: { + items: [item()], + citations: [{ url: 'https://x.com/1', quote: 'q', confidence: 0.8 }], + proposedWrites: [], + customField: { agentMood: 'curious', 
extraInsight: 42 }, + anotherUnknown: ['a', 'b'], + }, + }, + }, + ] + const parsed = adapter.parse(events) + expect(parsed.items).toHaveLength(1) + expect(parsed.raw).toMatchObject({ + customField: { agentMood: 'curious', extraInsight: 42 }, + anotherUnknown: ['a', 'b'], + }) + }) + + it('parses a fenced JSON block from a text delta when no structured result exists', () => { + const { output: adapter } = researcherProfile() + const payload = { + items: [item()], + citations: [{ url: 'https://x.com/1', quote: 'q', confidence: 0.8 }], + proposedWrites: [], + } + const events: SandboxEvent[] = [ + { + type: 'text', + data: { delta: `here is my answer:\n\`\`\`json\n${JSON.stringify(payload)}\n\`\`\`\n` }, + }, + ] + const parsed = adapter.parse(events) + expect(parsed.items).toHaveLength(1) + expect(parsed.citations).toHaveLength(1) + }) + + it('drops items that lack required fields rather than crashing', () => { + const { output: adapter } = researcherProfile() + const events: SandboxEvent[] = [ + { + type: 'result', + data: { + result: { + items: [ + { + id: 'good', + namespace: 'cust_42', + claim: 'ok', + evidence: [{ source: 'twitter', capturedAt: 0 }], + confidence: 0.5, + authoredBy: { kind: 'agent', id: 'r' }, + }, + { id: 'no-claim', namespace: 'cust_42' }, + null, + ], + citations: [], + proposedWrites: [], + }, + }, + }, + ] + const parsed = adapter.parse(events) + expect(parsed.items).toHaveLength(1) + expect(parsed.items[0]?.id).toBe('good') + }) + + it('returns an empty result when no events match the expected shape', () => { + const { output: adapter } = researcherProfile() + const parsed = adapter.parse([{ type: 'noise', data: { random: 1 } }]) + expect(parsed).toEqual({ items: [], citations: [], proposedWrites: [] }) + }) +}) + +describe('multiHarnessResearcherFanout', () => { + it('builds N AgentRunSpecs with a FanoutVote driver', () => { + const fan = multiHarnessResearcherFanout({ + harnesses: ['claude-code', 'codex', 
'opencode/zai-coding-plan/glm-5.1'], + }) + expect(fan.agentRuns).toHaveLength(3) + expect(fan.agentRuns.map((spec) => spec.name)).toEqual([ + 'researcher-claude-code', + 'researcher-codex', + 'researcher-opencode/zai-coding-plan/glm-5.1', + ]) + expect(typeof fan.driver.plan).toBe('function') + expect(typeof fan.driver.decide).toBe('function') + expect(fan.driver.name).toBe('fanout-vote') + }) + + it('falls back to three default harnesses when none supplied', () => { + const fan = multiHarnessResearcherFanout() + expect(fan.agentRuns).toHaveLength(3) + }) +}) diff --git a/tsup.config.ts b/tsup.config.ts index a9ae6c7..a78bcac 100644 --- a/tsup.config.ts +++ b/tsup.config.ts @@ -6,6 +6,7 @@ export default defineConfig({ 'viz/index': 'src/viz/index.ts', cli: 'src/cli.ts', 'sources/index': 'src/sources/index.ts', + 'profiles/index': 'src/profiles/index.ts', }, format: ['esm'], dts: true,