init

2025-10-17 21:40:45 +08:00
commit 7d0451131f
155 changed files with 14873 additions and 0 deletions
--- a/browser_use/agent_browser/browser_agent.py
+++ b/browser_use/agent_browser/browser_agent.py
@@ -0,0 +1,395 @@
+# -*- coding: utf-8 -*-
+"""Browser Agent"""
+# pylint: disable=W0212
+
+import re
+import uuid
+from typing import Any, Optional
+
+from agentscope.agent import ReActAgent
+from agentscope.formatter import FormatterBase
+from agentscope.memory import MemoryBase
+from agentscope.message import Msg, TextBlock, ToolUseBlock
+from agentscope.model import ChatModelBase
+from agentscope.token import OpenAITokenCounter, TokenCounterBase
+from agentscope.tool import Toolkit
+
+_BROWSER_AGENT_DEFAULT_SYS_PROMPT = (
+    "You are a helpful browser automation assistant. "
+    "You can navigate websites, take screenshots, and interact with web pages."
+    "Always describe what you see and meta_planner_agent your next steps clearly. "
+    "When taking actions, explain what you're doing and why."
+)
+_BROWSER_AGENT_REASONING_PROMPT = (
+    "You are browsing the current website. "
+    "The snapshot (and screenshot) of the current webpage is (are) given "
+    "below. Since you can only view the latest webpage, "
+    "you must promptly summarize current status, record required data, "
+    "and meta_planner_agent your next steps."
+)
+
+
+async def browser_agent_default_url_pre_reply(
+    self: "BrowserAgent",  # pylint: disable=W0613
+    *args: Any,  # pylint: disable=W0613
+    **kwargs: Any,  # pylint: disable=W0613
+) -> None:
+    """Navigate to start URL if this is the first interaction"""
+    if self.start_url and not self._has_initial_navigated:
+        await self._navigate_to_start_url()
+        self._has_initial_navigated = True
+
+
+async def browser_agent_summarize_mem_pre_reasoning(
+    self: "BrowserAgent",  # pylint: disable=W0613
+    *args: Any,
+    **kwargs: Any,
+) -> None:
+    """Summarize memory if too long"""
+    mem_len = await self.memory.size()
+    if mem_len > self.max_memory_length:
+        await self._memory_summarizing()
+
+
+async def browser_agent_observe_pre_reasoning(
+    self: "BrowserAgent",  # pylint: disable=W0613
+    *args: Any,
+    **kwargs: Any,
+) -> None:
+    """Get a snapshot in text before reasoning"""
+    snapshot_msg = await self._get_snapshot_in_text()
+    await self.memory.add(snapshot_msg)
+
+
+async def browser_agent_remove_observation_post_reasoning(
+    self: "BrowserAgent",  # pylint: disable=W0613
+    *args: Any,
+    **kwargs: Any,
+) -> None:
+    """Remove the snapshot msg after reasoning"""
+    mem_len = await self.memory.size()
+    if mem_len >= 2:
+        await self.memory.delete(mem_len - 2)
+
+
+async def browser_agent_post_acting_clean_content(
+    self: "BrowserAgent",  # pylint: disable=W0613
+    *args: Any,
+    **kwargs: Any,
+) -> None:
+    """
+    Hook func for cleaning the messy return after action.
+    Observation will be done before reasoning steps.
+    """
+    mem_msgs = await self.memory.get_memory()
+    mem_length = await self.memory.size()
+    if len(mem_msgs) == 0:
+        return
+    last_output_msg = mem_msgs[-1]
+    for i, b in enumerate(last_output_msg.content):
+        if b["type"] == "tool_result":
+            for j, return_json in enumerate(b.get("output", [])):
+                if isinstance(return_json, dict) and "text" in return_json:
+                    last_output_msg.content[i]["output"][j][
+                        "output"
+                    ] = self._filter_execution_text(return_json["text"])
+    await self.memory.delete(mem_length - 1)
+    await self.memory.add(last_output_msg)
+
+
+class BrowserAgent(ReActAgent):
+    """
+    Browser Agent that extends ReActAgent with browser-specific capabilities.
+
+    The agent leverages MCP (Model Context Protocol) servers to access browser
+    tools with Playwright, enabling sophisticated web automation tasks.
+
+    Example:
+        .. code-block:: python
+
+            agent = BrowserAgent(
+                name="web_navigator",
+                model=my_chat_model,
+                formatter=my_formatter,
+                memory=my_memory,
+                toolkit=browser_toolkit,
+                start_url="https://example.com"
+            )
+
+            response = await agent.reply("Search for Python tutorials")
+    """
+
+    def __init__(
+        self,
+        name: str,
+        model: ChatModelBase,
+        formatter: FormatterBase,
+        memory: MemoryBase,
+        toolkit: Toolkit,
+        sys_prompt: str = _BROWSER_AGENT_DEFAULT_SYS_PROMPT,
+        max_iters: int = 50,
+        start_url: Optional[str] = "https://www.google.com",
+        reasoning_prompt: str = _BROWSER_AGENT_REASONING_PROMPT,
+        token_counter: TokenCounterBase = OpenAITokenCounter("gpt-4o"),
+        max_mem_length: int = 20,
+    ) -> None:
+        """Initialize the Browser Agent.
+
+        Args:
+            name (str):
+                The unique identifier name for the agent instance.
+            model (ChatModelBase):
+                The chat model used for generating responses and reasoning.
+            formatter (FormatterBase):
+                The formatter used to convert messages into the required format
+                 for the model API.
+            memory (MemoryBase):
+                The memory component used to store and retrieve dialogue
+                history.
+            toolkit (Toolkit):
+                A toolkit object containing the browser tool functions and
+                utilities.
+            sys_prompt (str, optional):
+                The system prompt that defines the agent's behavior and
+                personality.
+                Defaults to _BROWSER_AGENT_DEFAULT_SYS_PROMPT.
+            max_iters (int, optional):
+                The maximum number of reasoning-acting loop iterations.
+                Defaults to 50.
+            start_url (Optional[str], optional):
+                The initial URL to navigate to when the agent starts.
+                Defaults to "https://www.google.com".
+            reasoning_prompt (str, optional):
+                The prompt used during the reasoning phase to guide
+                decision-making.
+                Defaults to _BROWSER_AGENT_REASONING_PROMPT.
+
+        Returns:
+            None
+        """
+        super().__init__(
+            name=name,
+            sys_prompt=sys_prompt,
+            model=model,
+            formatter=formatter,
+            memory=memory,
+            toolkit=toolkit,
+            max_iters=max_iters,
+        )
+
+        self.start_url = start_url
+        self._has_initial_navigated = False
+        self.reasoning_prompt = reasoning_prompt
+        self.max_memory_length = max_mem_length
+        self.token_estimator = token_counter
+
+        self.register_instance_hook(
+            "pre_reply",
+            "browser_agent_default_url_pre_reply",
+            browser_agent_default_url_pre_reply,
+        )
+
+        self.register_instance_hook(
+            "pre_reasoning",
+            "browser_agent_summarize_mem_pre_reasoning",
+            browser_agent_summarize_mem_pre_reasoning,
+        )
+
+        self.register_instance_hook(
+            "pre_reasoning",
+            "browser_agent_observe_pre_reasoning",
+            browser_agent_observe_pre_reasoning,
+        )
+
+        self.register_instance_hook(
+            "post_reasoning",
+            "browser_agent_remove_observation_post_reasoning",
+            browser_agent_remove_observation_post_reasoning,
+        )
+
+        self.register_instance_hook(
+            "post_acting",
+            "browser_agent_post_acting_clean_content",
+            browser_agent_post_acting_clean_content,
+        )
+
+    async def _navigate_to_start_url(self) -> None:
+        """
+        Navigate to the specified start URL using the browser_navigate tool.
+
+        This method is automatically called during the first interaction to
+        navigate to the configured start URL. It executes the browser
+        navigation tool and processes the response to ensure the
+        initial page is loaded.
+
+        Returns:
+            None
+        """
+        tool_call = ToolUseBlock(
+            id=str(uuid.uuid4()),
+            type="tool_use",
+            name="browser_navigate",
+            input={"url": self.start_url},
+        )
+
+        # Execute the navigation tool
+        await self.toolkit.call_tool_function(tool_call)
+
+    async def _get_snapshot_in_text(self) -> Msg:
+        """Capture a text-based snapshot of the current webpage content.
+
+        This method uses the browser_snapshot tool to retrieve the current
+        webpage content in text format, which is used during the reasoning
+        phase to provide context about the current browser state.
+
+        Returns:
+            str: A text representation of the current webpage content,
+                including elements, structure, and visible text.
+
+        Note:
+            This method is called automatically during the reasoning phase and
+            provides essential context for decision-making about next actions.
+        """
+        snapshot_tool_call = ToolUseBlock(
+            type="tool_use",
+            id=str(uuid.uuid4()),  # Generate a unique ID for the tool call
+            name="browser_snapshot",
+            input={},  # No parameters required for this tool
+        )
+        snapshot_response = await self.toolkit.call_tool_function(
+            snapshot_tool_call,
+        )
+        snapshot_str = ""
+        async for chunk in snapshot_response:
+            snapshot_str = chunk.content[0]["text"]
+
+        msg_observe = Msg(
+            "user",
+            content=[
+                TextBlock(
+                    type="text",
+                    text=self.reasoning_prompt + "\n" + snapshot_str,
+                ),
+            ],
+            role="user",
+        )
+
+        return msg_observe
+
+    async def _memory_summarizing(self) -> None:
+        """Summarize the current memory content to prevent context overflow.
+
+        This method is called periodically to condense the conversation history
+        by generating a summary of progress and maintaining only essential
+        information. It preserves the initial user question and creates a
+        concise summary of what has been accomplished and what remains to be
+        done.
+
+        Returns:
+            None
+
+        Note:
+            This method is automatically called every 10 iterations to manage
+            memory usage and maintain context relevance. The summarization
+            helps prevent token limit issues while preserving important task
+            context.
+        """
+        # Extract the initial user question
+        initial_question = None
+        memory_msgs = await self.memory.get_memory()
+        for msg in memory_msgs:
+            if msg.role == "user":
+                initial_question = msg.content
+                break
+
+        # Generate a summary of the current progress
+        hint_msg = Msg(
+            "user",
+            (
+                "Summarize the current progress and outline the next steps "
+                "for this task. Your summary should include:\n"
+                "1. What has been completed so far.\n"
+                "2. What key information has been found.\n"
+                "3. What remains to be done.\n"
+                "Ensure that your summary is clear, concise, and t"
+                "hat no tasks are repeated or skipped."
+            ),
+            role="user",
+        )
+
+        # Format the prompt for the model
+        prompt = self.formatter.format(
+            msgs=[
+                Msg("system", self.sys_prompt, "system"),
+                *memory_msgs,
+                hint_msg,
+            ],
+        )
+
+        # Call the model to generate the summary
+        res = await self.model(prompt)
+
+        # Handle response
+        summary_text = ""
+        if self.model.stream:
+            async for content_chunk in res:
+                summary_text = content_chunk.content[0]["text"]
+        else:
+            summary_text = res.content[0]["text"]
+
+        # Update the memory with the summarized content
+        summarized_memory = []
+        if initial_question:
+            summarized_memory.append(
+                Msg("user", initial_question, role="user"),
+            )
+        summarized_memory.append(
+            Msg(self.name, summary_text, role="assistant"),
+        )
+
+        # Clear and reload memory
+        await self.memory.clear()
+        for msg in summarized_memory:
+            await self.memory.add(msg)
+
+    @staticmethod
+    def _filter_execution_text(
+        text: str,
+        keep_page_state: bool = False,
+    ) -> str:
+        """
+        Filter and clean browser tool execution output to remove verbose
+        content.
+
+        This utility method removes unnecessary verbose content from browser
+        tool responses, including JavaScript code blocks, console messages,
+        and YAML content that can overwhelm the context window without
+        providing useful information.
+
+        Args:
+            text (str):
+                The raw execution text from browser tools that
+                needs to be filtered.
+            keep_page_state (bool, optional):
+                Whether to preserve page state information
+                including URL and YAML content. Defaults to False.
+
+        Returns:
+            str: The filtered execution text.
+        """
+        if not keep_page_state:
+            # Remove Page Snapshot and YAML content
+            text = re.sub(r"- Page URL.*", "", text, flags=re.DOTALL)
+            text = re.sub(r"```yaml.*?```", "", text, flags=re.DOTALL)
+        # Remove JavaScript code blocks
+        text = re.sub(r"```js.*?```", "", text, flags=re.DOTALL)
+        # Remove console messages section that can be very verbose
+        # (between "### New console messages" and "### Page state")
+        text = re.sub(
+            r"### New console messages.*?(?=### Page state)",
+            "",
+            text,
+            flags=re.DOTALL,
+        )
+        # Trim leading/trailing whitespace
+        return text.strip()