From 005c53791e647f338ab526172254c332dcc99ebb Mon Sep 17 00:00:00 2001 From: scrum Date: Sat, 14 Feb 2026 21:25:59 +0800 Subject: [PATCH 1/2] feat: add interactive wait tool for human-in-the-loop tasks --- agent.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/agent.py b/agent.py index f650381..2a7e285 100644 --- a/agent.py +++ b/agent.py @@ -58,7 +58,23 @@ def multiply_numbers(x: float, y: float) -> dict: """Multiplies two numbers.""" return {"result": x * y} - +def wait_for_user_input(prompt_text: str) -> dict: + """Pauses the agent and waits for the user to press Enter in the terminal. + + Useful when the agent needs the user to perform an action outside the browser + (like scanning a QR code, entering a 2FA code) before continuing. + + Args: + prompt_text: The message to display to the user explaining what they need to do. + """ + print(f"\n" + "="*40) + print(f"🛑 AGENT REQUEST: {prompt_text}") + print(f"="*40 + "\n") + + # 这行代码会让程序“卡住”,直到你按回车 + input("Press Enter to continue execution...") + + return {"status": "success", "message": "User confirmed action complete."} class BrowserAgent: def __init__( @@ -96,7 +112,10 @@ def __init__( # For example: types.FunctionDeclaration.from_callable( client=self._client, callable=multiply_numbers - ) + ), + types.FunctionDeclaration.from_callable( + client=self._client, callable=wait_for_user_input + ), ] self._generate_content_config = GenerateContentConfig( @@ -122,6 +141,7 @@ def handle_action(self, action: types.FunctionCall) -> FunctionResponseT: """Handles the action and returns the environment state.""" if action.name == "open_web_browser": return self._browser_computer.open_web_browser() + elif action.name == "click_at": x = self.denormalize_x(action.args["x"]) y = self.denormalize_y(action.args["y"]) @@ -193,6 +213,9 @@ def handle_action(self, action: types.FunctionCall) -> FunctionResponseT: # Handle the custom function declarations here. elif action.name == multiply_numbers.__name__: return multiply_numbers(x=action.args["x"], y=action.args["y"]) + elif action.name == wait_for_user_input.__name__: + return wait_for_user_input(prompt_text=action.args["prompt_text"]) + # -------------------- else: raise ValueError(f"Unsupported function: {action}") @@ -347,6 +370,8 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: ) ) elif isinstance(fc_result, dict): + response_data = fc_result.copy() + response_data.update(extra_fr_fields) function_responses.append( FunctionResponse(name=function_call.name, response=fc_result) ) From 36689b4802dff4358d33bb77d0b06d9030cb4fa5 Mon Sep 17 00:00:00 2001 From: scrum Date: Sat, 14 Feb 2026 21:45:54 +0800 Subject: [PATCH 2/2] fix: address code review feedback (security sanitization + logic bug) --- agent.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/agent.py b/agent.py index 2a7e285..73ae768 100644 --- a/agent.py +++ b/agent.py @@ -59,19 +59,14 @@ def multiply_numbers(x: float, y: float) -> dict: """Multiplies two numbers.""" return {"result": x * y} def wait_for_user_input(prompt_text: str) -> dict: - """Pauses the agent and waits for the user to press Enter in the terminal. + """Pauses the agent and waits for the user to press Enter in the terminal.""" + + safe_prompt = "".join(ch for ch in prompt_text if ch.isprintable()) - Useful when the agent needs the user to perform an action outside the browser - (like scanning a QR code, entering a 2FA code) before continuing. - - Args: - prompt_text: The message to display to the user explaining what they need to do. - """ print(f"\n" + "="*40) - print(f"🛑 AGENT REQUEST: {prompt_text}") + print(f"🛑 AGENT REQUEST: {safe_prompt}") print(f"="*40 + "\n") - # 这行代码会让程序“卡住”,直到你按回车 input("Press Enter to continue execution...") return {"status": "success", "message": "User confirmed action complete."} @@ -214,8 +209,7 @@ def handle_action(self, action: types.FunctionCall) -> FunctionResponseT: elif action.name == multiply_numbers.__name__: return multiply_numbers(x=action.args["x"], y=action.args["y"]) elif action.name == wait_for_user_input.__name__: - return wait_for_user_input(prompt_text=action.args["prompt_text"]) - # -------------------- + return wait_for_user_input(prompt_text=action.args["prompt_text"]) else: raise ValueError(f"Unsupported function: {action}") @@ -373,7 +367,7 @@ def run_one_iteration(self) -> Literal["COMPLETE", "CONTINUE"]: response_data = fc_result.copy() response_data.update(extra_fr_fields) function_responses.append( - FunctionResponse(name=function_call.name, response=fc_result) + FunctionResponse(name=function_call.name, response=response_data) ) self._contents.append(