From 9f4436600b2c9943f2a0f1e6f32fb46c672b876f Mon Sep 17 00:00:00 2001 From: Mukunda Katta Date: Thu, 7 May 2026 22:57:55 -0700 Subject: [PATCH] Add agent-tool-routing eval (12 samples, Match template) Tests an LLM's ability to select the single correct tool from a small list when given a natural-language user request. Each prompt presents 4-6 thematically related tools and asks for the canonical tool name as the only output. Failure modes targeted: - emitting explanation text instead of a bare tool name - selecting a similar-but-incorrect tool that is also in the list - inventing tools that are not in the list Real-world use case: LLM tool routing in chat assistants and IDE copilots, which is the call site that downstream libraries like AgentVet (@mukundakatta/agentvet on npm) wrap with arg-shape validation. This eval scopes to the correct-tool-name choice as a foundational capability that precedes argument validation. Domain mix across the 12 samples: contact lookup, file system, analytics SQL, GitHub, web fetching, billing, ops/Kubernetes, travel booking, git workflow, weather, user management, audio transcription. Uses evals.elsuite.basic.match:Match (no custom code) as required by the docs/build-eval.md current contribution guidelines. --- evals/registry/data/agent_tool_routing/samples.jsonl | 12 ++++++++++++ evals/registry/evals/agent-tool-routing.yaml | 9 +++++++++ 2 files changed, 21 insertions(+) create mode 100644 evals/registry/data/agent_tool_routing/samples.jsonl create mode 100644 evals/registry/evals/agent-tool-routing.yaml diff --git a/evals/registry/data/agent_tool_routing/samples.jsonl b/evals/registry/data/agent_tool_routing/samples.jsonl new file mode 100644 index 0000000000..d06403b73b --- /dev/null +++ b/evals/registry/data/agent_tool_routing/samples.jsonl @@ -0,0 +1,12 @@ +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- send_email: Send an email to a recipient\n- create_calendar_event: Create an event on the user's calendar\n- search_contacts: Search the address book by name or email\n- list_recent_files: List recently edited files in cloud storage\n\nUser request: I need to find Sarah Patel's phone number to call her back about the proposal.\n\nWhich tool?"}], "ideal": "search_contacts"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- read_file: Read the contents of a file at a given path\n- write_file: Write content to a file at a given path\n- list_directory: List files and directories at a given path\n- delete_file: Delete a file at a given path\n- copy_file: Copy a file from a source path to a destination path\n\nUser request: Show me what's inside ~/projects so I can see which folders exist there.\n\nWhich tool?"}], "ideal": "list_directory"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- query_database: Run a SQL SELECT query against the analytics database\n- run_aggregation: Compute a precomputed aggregation by name\n- export_to_csv: Export the result of the last query to a CSV file\n- list_tables: List all tables in the analytics database\n- describe_schema: Return the schema of a specific table\n\nUser request: I want to find every customer who made a purchase over $500 last month.\n\nWhich tool?"}], "ideal": "query_database"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_pull_request: Open a new pull request on GitHub\n- list_open_issues: List all open issues on a GitHub repository\n- comment_on_issue: Add a comment to an existing GitHub issue\n- create_issue: Open a new issue on a GitHub repository\n- assign_issue: Assign a GitHub issue to a user\n\nUser request: Hey, can you tell the team on issue 142 that the bug is reproducible on macOS 14?\n\nWhich tool?"}], "ideal": "comment_on_issue"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- web_search: Search the public web for a query string\n- fetch_url: Fetch the contents of a specific URL\n- summarize_text: Summarize a block of text\n- translate_text: Translate text from one language to another\n\nUser request: I have the link https://example.com/announcement.html and I want to know what it says.\n\nWhich tool?"}], "ideal": "fetch_url"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_invoice: Create a new invoice for a customer\n- send_invoice: Send an existing invoice to a customer by email\n- void_invoice: Void an existing invoice\n- list_unpaid_invoices: List invoices that have not yet been paid\n- record_payment: Record a payment received against an invoice\n\nUser request: Acme Corp just wired us $4,500 against invoice INV-1042. Can you log that?\n\nWhich tool?"}], "ideal": "record_payment"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- restart_service: Restart a named service on the production cluster\n- scale_deployment: Scale a Kubernetes deployment to N replicas\n- view_logs: Read recent log lines for a named service\n- rollback_deployment: Roll a Kubernetes deployment back to its previous revision\n- list_deployments: List all deployments in a namespace\n\nUser request: Last night's deploy of the checkout service is throwing 500s. Can you put us back on the previous version?\n\nWhich tool?"}], "ideal": "rollback_deployment"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- search_flights: Search available flights between two airports on a given date\n- book_flight: Book a specific flight by flight number and date\n- cancel_booking: Cancel an existing booking by booking reference\n- get_booking_details: Look up the details of a booking by reference\n- check_in: Check the user in for an upcoming flight\n\nUser request: I'm looking at trips for next Tuesday from SFO to JFK, what's available?\n\nWhich tool?"}], "ideal": "search_flights"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_branch: Create a new git branch off of main\n- commit_changes: Commit the staged changes with a message\n- push_branch: Push the current branch to the remote\n- merge_branch: Merge a named branch into main\n- list_branches: List all local git branches\n\nUser request: I just finished editing the docs. Save them to git with the message 'docs: clarify install steps'.\n\nWhich tool?"}], "ideal": "commit_changes"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- get_weather: Get the current weather for a city\n- get_forecast: Get the multi-day weather forecast for a city\n- list_saved_cities: List the user's saved cities\n- save_city: Add a new city to the user's saved cities\n\nUser request: Will it rain in Seattle this weekend?\n\nWhich tool?"}], "ideal": "get_forecast"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- create_user: Create a new user account\n- delete_user: Delete a user account\n- reset_password: Send a password reset link to a user\n- disable_user: Disable a user account without deleting it\n- list_users: List all users\n\nUser request: Jordan from finance forgot their login again. Can you help them get back in without giving them a fresh account?\n\nWhich tool?"}], "ideal": "reset_password"} +{"input": [{"role": "system", "content": "You are an AI agent that selects the most appropriate tool to handle a user request. Given a list of available tools and a user request, output ONLY the name of the single best tool to invoke. Output the tool name on a single line with no quotes, no JSON, no explanation, and no surrounding whitespace."}, {"role": "user", "content": "Available tools:\n- transcribe_audio: Convert an audio file to text\n- summarize_text: Summarize a block of text\n- translate_text: Translate text from one language to another\n- detect_language: Identify the language of a piece of text\n\nUser request: I have a 30-minute meeting recording in meeting.mp3 and I need it as written notes.\n\nWhich tool?"}], "ideal": "transcribe_audio"} diff --git a/evals/registry/evals/agent-tool-routing.yaml b/evals/registry/evals/agent-tool-routing.yaml new file mode 100644 index 0000000000..48c8481a67 --- /dev/null +++ b/evals/registry/evals/agent-tool-routing.yaml @@ -0,0 +1,9 @@ +agent-tool-routing: + id: agent-tool-routing.dev.v0 + description: Tests an LLM's ability to select the single correct tool from a small list when given a natural-language user request. Each prompt presents 4 to 6 thematically related tools and asks for the canonical tool name as the only output. Failure modes targeted are (a) emitting explanation text instead of a bare tool name, (b) selecting a similar-but-incorrect tool that is also in the list, and (c) inventing tools that are not in the list. This is a foundational capability for tool-using agents and a real-world product use case (LLM tool routing in chat assistants and IDE copilots). + disclaimer: This eval is hand-curated with 12 samples covering a thematic mix of common tool-routing scenarios (file ops, communication, dev-ops, data, scheduling, audio, billing). It is intended as a foundational starter set; the same Match template can be extended by adding more samples.jsonl rows without code changes. + metrics: [accuracy] +agent-tool-routing.dev.v0: + class: evals.elsuite.basic.match:Match + args: + samples_jsonl: agent_tool_routing/samples.jsonl