Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
d19a9e8
feat(goal): ✨ replace self-attestation goal_scored with independent J…
jorben Jun 7, 2026
3b77dd1
refactor(goal): ♻️ remove mark_complete and complete verdict
jorben Jun 7, 2026
b204d9b
docs: 📝 update and reorder README feature list
jorben Jun 7, 2026
e284fbe
refactor(goal): ♻️ extract resolveGoalStatusKey for testability
jorben Jun 7, 2026
e8a58f2
refactor(subagent): 🔧 increase builtin default max delegation depth to 5
jorben Jun 7, 2026
c15e885
docs: 📝 remove obsolete design document
jorben Jun 7, 2026
d60daec
docs(judge): 📝 add size-first verification strategy and delegation gu…
jorben Jun 7, 2026
dc8fca0
refactor(goal): ♻️ remove goal-level time_used_seconds in favor of ru…
jorben Jun 7, 2026
4481759
feat(judge): ✨ redesign Judge evaluation for independence and complet…
jorben Jun 10, 2026
0e8b153
fix(subagent): 🐛 make task field optional and fix UTF-8 safe truncation
jorben Jun 10, 2026
f65683a
merge: resolve origin/master conflicts on judge redesign
jorben Jun 11, 2026
539005c
chore(deps): 🔧 align tiycore to 0.2.10-rc.2 and adopt Usage::context_…
jorben Jun 11, 2026
afd221e
refactor(goal): ♻️ centralize status transitions to explicit commands…
jorben Jun 11, 2026
f80d652
fix(agent): 🐛 fix timestamp slicing panic and add has_process_require…
jorben Jun 11, 2026
0cca885
feat(compression): ✨ reserve 20% context window for auto-compression …
jorben Jun 11, 2026
73c7cb5
fix(run): 🐛 record elapsed running time when interrupting active runs
jorben Jun 11, 2026
eb4b722
test: cover Judge summary builders and mapRunSummaryToContextUsage fa…
jorben Jun 12, 2026
de0b542
merge: resolve origin/master conflicts after review fixes
jorben Jun 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 206 additions & 31 deletions src-tauri/src/core/agent_session_execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1612,21 +1612,23 @@ impl AgentSession {
tool_call_storage_id: &str,

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Automated review completed for this PR diff. No concrete inline issue was selected after aggregation.

tool_input: &serde_json::Value,
) -> AgentToolResult {
// Parse the main agent's task / rationale.
let request = match crate::core::subagent::JudgeRequest::from_tool_input(tool_input) {
Ok(request) => request,
Err(error) => {
tool_call_repo::update_result(
&self.pool,
tool_call_storage_id,
&serde_json::json!({ "error": &error }).to_string(),
"failed",
)
.await
.ok();
return agent_error_result(error);
}
};
// Validate the tool input shape. The main agent's optional note is
// not injected into the Judge prompt — the Judge evaluates the
// project state independently against the goal. Parsing is retained
// to reject malformed input (e.g. non-string `task` values) that
// would violate the tool JSON schema. A missing `task` is acceptable
// and falls back to a neutral default.
if let Err(error) = crate::core::subagent::JudgeRequest::from_tool_input(tool_input) {
tool_call_repo::update_result(
&self.pool,
tool_call_storage_id,
&serde_json::json!({ "error": &error }).to_string(),
"failed",
)
.await
.ok();
return agent_error_result(error);
}

// Backstop: re-query goal state. agent_judge is injected only when an
// un-verified goal exists, but a stale tool set or a direct call must be
Expand Down Expand Up @@ -1679,41 +1681,68 @@ impl AgentSession {
return agent_error_result(err_msg);
}

// Build the Judge task: inject the goal objective + status + last verdict
// so the Judge does not rely on the main agent's self-report.
let mut prior_verdict = String::new();
if goal.judge_evaluated_run_id.is_some() {
// Build the Judge task: inject the goal objective, the task board
// state, (when applicable) process compliance evidence, and (when
// this is a re-verification) the previous Judge verdict. The Judge
// receives no input from the main agent — it evaluates the project
// state independently against the goal. The previous verdict is
// included only as objective context so the Judge can confirm prior
// findings have actually been addressed, not as a starting point
// for a self-assessment.

// Query task board state for cross-reference.
let task_board_summary = build_task_board_summary(&self.pool, &goal.thread_id).await;

// Conditionally include process compliance layer for goals that
// require reviews or phase-by-phase verification.
let process_compliance = if has_process_requirements(&goal.objective) {
build_process_compliance_summary(&self.pool, &goal.thread_id).await

This comment was marked as outdated.

} else {
String::new()
};

// On re-verification, surface the prior Judge verdict as objective
// context so the Judge can confirm each prior finding has been
// genuinely resolved. This is read from the goal record (not from
// the main agent) and is empty on the first verification.
let prior_verdict = if goal.judge_evaluated_run_id.is_some() {
let mut section = String::new();
if let Some(summary) = goal.judge_summary.as_deref() {
if !summary.trim().is_empty() {
prior_verdict.push_str(&format!("\nPrevious Judge summary: {summary}"));
section.push_str(&format!("\nPrevious Judge summary: {summary}"));
}
}
if let Some(findings_json) = goal.judge_findings.as_deref() {
if let Ok(findings) = serde_json::from_str::<Vec<String>>(findings_json) {
if !findings.is_empty() {
prior_verdict.push_str("\nPrevious Judge findings:");
section.push_str("\nPrevious Judge findings:");
for finding in findings {
prior_verdict.push_str(&format!("\n- {finding}"));
section.push_str(&format!("\n- {finding}"));
}
}
}
}
}
section
} else {
String::new()
};

let judge_task = format!(

This comment was marked as outdated.

"You are verifying acceptance of the following goal for the current project.\n\n\
Goal id: {goal_id}\n\
Goal status: {status:?}\n\
Goal objective:\n{objective}\n\
{prior_verdict}\n\n\
The main agent's note for this verification request:\n{task}\n\n\
Goal objective:\n{objective}\n\n\
{task_board_summary}\
{process_compliance}\
{prior_verdict}\n\
Independently inspect the project's current state and decide whether it satisfies the goal. \
You must verify ALL requirements in the goal, not just those that seem to have been worked on. \
Cross-reference the task board state above with your file-system findings. \
If this is a re-verification, confirm that every prior finding has been genuinely \
resolved (do NOT accept claims of fix without verifying the actual change). \
Return your structured JudgeReport verdict.",
goal_id = goal.id,
status = goal.status,
objective = goal.objective,
task_board_summary = task_board_summary,

This comment was marked as outdated.

process_compliance = process_compliance,
prior_verdict = prior_verdict,
task = request.task,
);

// Build a Judge delegate (depth 2, primary model) and run it.
Expand Down Expand Up @@ -1837,6 +1866,152 @@ Return your structured JudgeReport verdict.",
}
}

/// Build a human-readable summary of the task board state for the Judge.
/// Returns a string describing each step and its stage, or a note that no
/// task board exists.
async fn build_task_board_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String {

This comment was marked as outdated.

use crate::persistence::repo::{task_board_repo, task_item_repo};

let boards = match task_board_repo::list_by_thread(pool, thread_id).await {
Ok(boards) => boards,
Err(_) => return "(No task board data available.)\n".to_string(),
};

if boards.is_empty() {
return "(No task board exists for this goal. Verify entirely from file system and goal text.)\n"
.to_string();
}

let mut summary = String::from("## Associated task board state\n\n");
for board in &boards {
// Skip abandoned boards — they are not relevant to the current goal.
if board.status.as_str() == "abandoned" {
continue;
}
summary.push_str(&format!(
"**{}** (status: {}):\n",
board.title,
board.status.as_str()
));

let items = match task_item_repo::list_by_task_board(pool, &board.id).await {
Ok(items) => items,
Err(_) => {
summary.push_str(" (Could not load task items.)\n");
continue;
}
};

if items.is_empty() {
summary.push_str(" (No task items.)\n");
continue;
}

for item in &items {
summary.push_str(&format!(
" - [{}] {}\n",
item.stage.as_str(),
item.description
));
}
}

summary.push_str(
"\n**Important**: Any step above that is not `completed` and maps to a goal \
requirement is evidence of incomplete work. Report these as findings.\n",
);
summary
}

/// Report whether a goal objective mentions process requirements.
///
/// The objective is lowercased once and scanned for a fixed set of English
/// and Chinese keyword fragments (for example "review", "verify each",
/// "per phase"). The match is a plain substring test, so a keyword embedded in
/// a longer word (such as "review" inside "preview") also counts.
///
/// Returns `true` when at least one keyword is present. The caller uses this
/// to decide whether the Judge prompt should include the process compliance
/// section built from the thread's review call history.
fn has_process_requirements(objective: &str) -> bool {
    // Every entry is already lowercase, so it can be compared against the
    // lowercased objective directly without re-normalising it on each call.
    const KEYWORDS: [&str; 10] = [
        "review",
        "验收",
        "检查",
        "verify each",
        "verify every",
        "per phase",
        "每个阶段",
        "每一阶段",
        "每轮",
        "阶段完成",
    ];

    let lowered = objective.to_lowercase();
    KEYWORDS.iter().any(|keyword| lowered.contains(*keyword))
}

/// Build the "Process compliance" section of the Judge prompt from the
/// thread's run-helper history.
///
/// Helper records whose `helper_kind` contains `"review"` are listed in the
/// order the repository returns them, each with its kind, start timestamp,
/// status, and a preview of its input summary.
///
/// Returns:
/// - an empty string when the helper history cannot be loaded;
/// - a short section stating that no review calls were found when the
///   filtered list is empty;
/// - otherwise the numbered list followed by a guidance paragraph.
async fn build_process_compliance_summary(pool: &sqlx::SqlitePool, thread_id: &str) -> String {
    use crate::persistence::repo::run_helper_repo;

    // Maximum number of characters of the input summary shown per review.
    const PREVIEW_CHARS: usize = 200;
    // Number of leading bytes of `started_at` to display.
    // NOTE(review): presumably an ISO-8601 timestamp cut to second precision
    // ("YYYY-MM-DDTHH:MM:SS") — confirm against the stored format.
    const TIMESTAMP_BYTES: usize = 19;

    let helpers = match run_helper_repo::list_by_thread_id(pool, thread_id).await {
        Ok(h) => h,
        Err(_) => return String::new(),
    };

    // Keep only review-related helper calls.
    let reviews: Vec<_> = helpers
        .iter()
        .filter(|h| h.helper_kind.contains("review"))
        .collect();

    if reviews.is_empty() {
        return "## Process compliance\n\n\
                No review calls found in thread history. \
                If the goal requires reviews, this is evidence of non-compliance.\n\n"
            .to_string();
    }

    let mut summary = String::from("## Process compliance\n\n");
    summary.push_str("The following review calls were recorded during this goal:\n\n");

    for (i, review) in reviews.iter().enumerate() {
        // Known statuses get a marker; anything else is shown verbatim.
        let status_label = match review.status.as_str() {
            "completed" => "✓ completed",
            "failed" => "✗ failed",
            "interrupted" => "⚠ interrupted",
            other => other,
        };

        // Character-based truncation in a single pass: take the first
        // PREVIEW_CHARS characters and append an ellipsis only if more remain.
        let input_preview = match review.input_summary.as_deref() {
            None => "(no task description)".to_string(),
            Some(text) => {
                let mut chars = text.chars();
                let head: String = chars.by_ref().take(PREVIEW_CHARS).collect();
                if chars.next().is_some() {
                    format!("{head}...")
                } else {
                    head
                }
            }
        };

        // `str::get` yields `None` when the range is out of bounds or does not
        // end on a character boundary, so a short or unexpected timestamp is
        // shown whole instead of panicking on a byte-index slice.
        let started_at: &str = &review.started_at;
        let timestamp = started_at.get(..TIMESTAMP_BYTES).unwrap_or(started_at);

        summary.push_str(&format!(
            "{}. `{}` called at {} (status: {})\n Scope: {}\n",
            i + 1,
            review.helper_kind,
            timestamp,
            status_label,
            input_preview,
        ));
    }

    summary.push_str(
        "\n**Guidance**: If the goal requires reviews at specific milestones \
         (e.g., \"after each phase\"), verify that the review calls above \
         cover all required milestones. Missing or failed reviews are findings.\n\n",
    );
    summary
}

#[cfg(test)]
mod tests {
use super::{
Expand Down
4 changes: 2 additions & 2 deletions src-tauri/src/core/prompt/templates/active_goal.tpl.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ Turns used: {{turns_used}}/{{max_turns}}
**Completion is decided by independent verification — you cannot self-declare it.**
1. Every subtask implied by the objective must be done, with no remaining work or dangling follow-ups.
2. Verify your work by running the relevant tests, linters, or build commands as you go.
3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge(task="...")`.
3. When you believe the goal is achieved, you MUST request acceptance by calling `agent_judge()`.

Rules:
- Call `agent_judge(task="explain why you believe the goal is achieved / what to verify")` when you think the goal is complete. An independent Judge will evaluate the project against the goal's consistency and completeness.
- Call `agent_judge()` to request independent goal acceptance verification. An independent Judge will evaluate the project against the goal's completeness. You do not need to provide a self-assessment — the Judge evaluates the project state directly.
- The goal is only marked verified when the Judge returns passed=true. You cannot mark the goal complete yourself.
- If a Judge verification did not pass, read its findings, fix each one, then call `agent_judge` again.
- Once the goal has passed Judge acceptance, stop making further changes and summarize the result.
Expand Down
Loading
Loading