From 9f4436600b2c9943f2a0f1e6f32fb46c672b876f Mon Sep 17 00:00:00 2001
From: Mukunda Katta <mukunda.vjcs6@gmail.com>
Date: Thu, 7 May 2026 22:57:55 -0700
Subject: [PATCH] Add agent-tool-routing eval (12 samples, Match template)

Tests an LLM's ability to select the single correct tool from a small
list when given a natural-language user request. Each prompt presents
4-6 thematically related tools and asks for the canonical tool name
as the only output.

Failure modes targeted:
  - emitting explanation text instead of a bare tool name
  - selecting a similar-but-incorrect tool that is also in the list
  - inventing tools that are not in the list

Real-world use case: LLM tool routing in chat assistants and IDE copilots,
which is the call site that downstream libraries like AgentVet
(@mukundakatta/agentvet on npm) wrap with arg-shape validation. This
eval scopes to the correct-tool-name choice as a foundational capability
that precedes argument validation.

Domain mix across the 12 samples: contact lookup, file system,
analytics SQL, GitHub, web fetching, billing, ops/Kubernetes, travel
booking, git workflow, weather, user management, audio transcription.

Uses evals.elsuite.basic.match:Match (no custom code) as required by
the docs/build-eval.md current contribution guidelines.
---
 evals/registry/data/agent_tool_routing/samples.jsonl | 12 ++++++++++++
 evals/registry/evals/agent-tool-routing.yaml         |  9 +++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 evals/registry/data/agent_tool_routing/samples.jsonl
 create mode 100644 evals/registry/evals/agent-tool-routing.yaml

diff --git a/evals/registry/data/agent_tool_routing/samples.jsonl b/evals/registry/data/agent_tool_routing/samples.jsonl
new file mode 100644
index 0000000000..d06403b73b
--- /dev/null
+++ b/evals/registry/data/agent_tool_routing/samples.jsonl
@@ -0,0 +1,12 @@
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- send_email: Send an email to a recipient\n- create_calendar_event: Create an event on the user's calendar\n- search_contacts: Search the address book by name or email\n- list_recent_files: List recently edited files in cloud storage\n\nUser request: I need to find Sarah Patel's phone number to call her back about the proposal.\n\nWhich tool?"}], "ideal": "search_contacts"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- read_file: Read the contents of a file at a given path\n- write_file: Write content to a file at a given path\n- list_directory: List files and directories at a given path\n- delete_file: Delete a file at a given path\n- copy_file: Copy a file from a source path to a destination path\n\nUser request: Show me what's inside ~/projects so I can see which folders exist there.\n\nWhich tool?"}], "ideal": "list_directory"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- query_database: Run a SQL SELECT query against the analytics database\n- run_aggregation: Compute a precomputed aggregation by name\n- export_to_csv: Export the result of the last query to a CSV file\n- list_tables: List all tables in the analytics database\n- describe_schema: Return the schema of a specific table\n\nUser request: I want to find every customer who made a purchase over $500 last month.\n\nWhich tool?"}], "ideal": "query_database"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_pull_request: Open a new pull request on GitHub\n- list_open_issues: List all open issues on a GitHub repository\n- comment_on_issue: Add a comment to an existing GitHub issue\n- create_issue: Open a new issue on a GitHub repository\n- assign_issue: Assign a GitHub issue to a user\n\nUser request: Hey, can you tell the team on issue 142 that the bug is reproducible on macOS 14?\n\nWhich tool?"}], "ideal": "comment_on_issue"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- web_search: Search the public web for a query string\n- fetch_url: Fetch the contents of a specific URL\n- summarize_text: Summarize a block of text\n- translate_text: Translate text from one language to another\n\nUser request: I have the link https://example.com/announcement.html and I want to know what it says.\n\nWhich tool?"}], "ideal": "fetch_url"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_invoice: Create a new invoice for a customer\n- send_invoice: Send an existing invoice to a customer by email\n- void_invoice: Void an existing invoice\n- list_unpaid_invoices: List invoices that have not yet been paid\n- record_payment: Record a payment received against an invoice\n\nUser request: Acme Corp just wired us $4,500 against invoice INV-1042. Can you log that?\n\nWhich tool?"}], "ideal": "record_payment"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- restart_service: Restart a named service on the production cluster\n- scale_deployment: Scale a Kubernetes deployment to N replicas\n- view_logs: Read recent log lines for a named service\n- rollback_deployment: Roll a Kubernetes deployment back to its previous revision\n- list_deployments: List all deployments in a namespace\n\nUser request: Last night's deploy of the checkout service is throwing 500s. Can you put us back on the previous version?\n\nWhich tool?"}], "ideal": "rollback_deployment"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- search_flights: Search available flights between two airports on a given date\n- book_flight: Book a specific flight by flight number and date\n- cancel_booking: Cancel an existing booking by booking reference\n- get_booking_details: Look up the details of a booking by reference\n- check_in: Check the user in for an upcoming flight\n\nUser request: I'm looking at trips for next Tuesday from SFO to JFK, what's available?\n\nWhich tool?"}], "ideal": "search_flights"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_branch: Create a new git branch off of main\n- commit_changes: Commit the staged changes with a message\n- push_branch: Push the current branch to the remote\n- merge_branch: Merge a named branch into main\n- list_branches: List all local git branches\n\nUser request: I just finished editing the docs. Save them to git with the message 'docs: clarify install steps'.\n\nWhich tool?"}], "ideal": "commit_changes"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- get_weather: Get the current weather for a city\n- get_forecast: Get the multi-day weather forecast for a city\n- list_saved_cities: List the user's saved cities\n- save_city: Add a new city to the user's saved cities\n\nUser request: Will it rain in Seattle this weekend?\n\nWhich tool?"}], "ideal": "get_forecast"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_user: Create a new user account\n- delete_user: Delete a user account\n- reset_password: Send a password reset link to a user\n- disable_user: Disable a user account without deleting it\n- list_users: List all users\n\nUser request: Jordan from finance forgot their login again. Can you help them get back in without giving them a fresh account?\n\nWhich tool?"}], "ideal": "reset_password"}
+{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- transcribe_audio: Convert an audio file to text\n- summarize_text: Summarize a block of text\n- translate_text: Translate text from one language to another\n- detect_language: Identify the language of a piece of text\n\nUser request: I have a 30-minute meeting recording in meeting.mp3 and I need it as written notes.\n\nWhich tool?"}], "ideal": "transcribe_audio"}
diff --git a/evals/registry/evals/agent-tool-routing.yaml b/evals/registry/evals/agent-tool-routing.yaml
new file mode 100644
index 0000000000..48c8481a67
--- /dev/null
+++ b/evals/registry/evals/agent-tool-routing.yaml
@@ -0,0 +1,9 @@
+agent-tool-routing:
+  id: agent-tool-routing.dev.v0
+  description: Tests an LLM's ability to select the single correct tool from a small list when given a natural-language user request. Each prompt presents 4 to 6 thematically related tools and asks for the canonical tool name as the only output. Failure modes targeted are (a) emitting explanation text instead of a bare tool name, (b) selecting a similar-but-incorrect tool that is also in the list, and (c) inventing tools that are not in the list. This is a foundational capability for tool-using agents and a real-world product use case (LLM tool routing in chat assistants and IDE copilots).
+  disclaimer: This eval is hand-curated with 12 samples covering a thematic mix of common tool-routing scenarios (file ops, communication, dev-ops, data, scheduling, audio, billing). It is intended as a foundational starter set; the same Match template can be extended by adding more samples.jsonl rows without code changes.
+  metrics: [accuracy]
+agent-tool-routing.dev.v0:
+  class: evals.elsuite.basic.match:Match
+  args:
+    samples_jsonl: agent_tool_routing/samples.jsonl