From ac8c5255f8fae6855a022e2a358ee10c1fd18421 Mon Sep 17 00:00:00 2001
From: cuiyuebing <cuipaofu@gmail.com>
Date: Fri, 31 Oct 2025 18:03:37 +0800
Subject: [PATCH 1/2] use base64 data for image context

---
 .../src/alias/agent/agents/_browser_agent.py  | 80 +++++--------------
 1 file changed, 21 insertions(+), 59 deletions(-)

diff --git a/alias/src/alias/agent/agents/_browser_agent.py b/alias/src/alias/agent/agents/_browser_agent.py
index 6424c93..375acfa 100644
--- a/alias/src/alias/agent/agents/_browser_agent.py
+++ b/alias/src/alias/agent/agents/_browser_agent.py
@@ -11,8 +11,6 @@ import json
 from typing import Type, Optional, Any
 import asyncio
 import copy
-import base64
-import shutil
 from loguru import logger
 from pydantic import BaseModel
 
@@ -194,22 +192,6 @@ class BrowserAgent(AliasAgentBase):
         self.toolkit.register_tool_function(self.browser_subtask_manager)
         self.toolkit.register_tool_function(self.image_understanding)
 
-        if (
-            self.model.model_name.startswith("qvq")
-            or "-vl" in self.model.model_name
-            or "4o" in self.model.model_name
-            or "gpt-5" in self.model.model_name
-        ):
-            # If the model supports multimodal input,
-            # prepare a directory for screenshots
-            screenshot_dir = os.path.join(
-                "./logs/screenshots/",
-                "tmp" + "_browser_agent",
-            )
-            if os.path.exists(screenshot_dir):
-                shutil.rmtree(screenshot_dir)
-            os.makedirs(screenshot_dir, exist_ok=True)
-            self.screenshot_dir = screenshot_dir
         self.no_screenshot_tool_list = [
             tool
             for tool in self.toolkit.get_json_schemas()
@@ -243,7 +225,7 @@ class BrowserAgent(AliasAgentBase):
             if isinstance(msg, list)
             else ""
         )
-        
+
         if self.start_url and not self._has_initial_navigated:
             await self._navigate_to_start_url()
             self._has_initial_navigated = True
@@ -264,7 +246,6 @@ class BrowserAgent(AliasAgentBase):
             await self._summarize_mem()
 
             msg_reasoning = await self._pure_reasoning()
-
             tool_calls = msg_reasoning.get_content_blocks("tool_use")
             if tool_calls and tool_calls[0]["name"] == "browser_snapshot":
                 msg_reasoning = await self._reasoning_with_observation()
@@ -389,7 +370,6 @@ class BrowserAgent(AliasAgentBase):
 
         for _ in self.snapshot_in_chunk:
             observe_msg = await self._build_observation()
-
             prompt = await self.formatter.format(
                 msgs=[
                     Msg("system", self.sys_prompt, "system"),
@@ -467,8 +447,7 @@ class BrowserAgent(AliasAgentBase):
         self,
     ) -> Msg:
         """Get a snapshot in text before reasoning"""
-
-        image_path: Optional[str] = None
+        image_data: Optional[str] = None
         if (
             self.model.model_name.startswith("qvq")
             or "-vl" in self.model.model_name
@@ -476,17 +455,10 @@ class BrowserAgent(AliasAgentBase):
             or "gpt-5" in self.model.model_name
         ):
             # If the model supports multimodal input, take a screenshot
-            # and pass it to the observation message
-            img_path = os.path.join(
-                self.screenshot_dir,
-                f"screenshot_{self.iter_n}.png",
-            )
-            # if the img_path already exists,
-            # do not need to take a screenshot again
-            if not os.path.exists(img_path):
-                image_path = await self._get_screenshot(img_path)
+            # and pass it to the observation message as base64
+            image_data = await self._get_screenshot()
 
-        observe_msg = self.observe_by_chunk(image_path)
+        observe_msg = self.observe_by_chunk(image_data)
         return observe_msg
 
     async def _update_chunk_observation_status(
@@ -902,11 +874,10 @@ class BrowserAgent(AliasAgentBase):
         for msg in summarized_memory:
             await self.memory.add(msg)
 
-    async def _get_screenshot(self, img_path: str = "") -> Optional[str]:
+    async def _get_screenshot(self) -> Optional[str]:
         """
-        Optionally take a screenshot of the current web page
-        for use in multimodal prompts.
-        Returns the path to the image if available, else None.
+        Optionally take a screenshot of the current web page for multimodal prompts.
+        Returns base64-encoded PNG data if available, else None.
         """
         try:
             # Prepare tool call for screenshot
@@ -920,7 +891,7 @@ class BrowserAgent(AliasAgentBase):
             screenshot_response = await self.toolkit.call_tool_function(
                 tool_call,
             )
-            # Extract image path from response
+            # Extract image base64 from response
             async for chunk in screenshot_response:
                 if (
                     chunk.content
@@ -928,17 +899,12 @@ class BrowserAgent(AliasAgentBase):
                     and "data" in chunk.content[1]
                 ):
                     image_data = chunk.content[1]["data"]
-                    image_data = base64.b64decode(image_data)
-                    with open(img_path, "wb") as fi:
-                        fi.write(image_data)
-                    returned_img_path = img_path
-                    # Exit loop on success
                 else:
-                    returned_img_path = None
+                    image_data = None
 
         except Exception:
-            returned_img_path = None
-        return returned_img_path
+            image_data = None
+        return image_data
 
     @staticmethod
     def _filter_execution_text(
@@ -993,7 +959,7 @@ class BrowserAgent(AliasAgentBase):
             for i in range(0, len(snapshot_str), max_length)
         ]
 
-    def observe_by_chunk(self, image_path: str | None = "") -> Msg:
+    def observe_by_chunk(self, image_data: str | None = "") -> Msg:
         """Create an observation message for chunk-based reasoning.
 
         This method formats the current chunk of the webpage snapshot with
@@ -1024,12 +990,13 @@ class BrowserAgent(AliasAgentBase):
             or "4o" in self.model.model_name
             or "gpt-5" in self.model.model_name
         ):
-            if image_path:
+            if image_data:
                 image_block = ImageBlock(
                     type="image",
                     source={
-                        "type": "url",
-                        "url": image_path,
+                        "type": "base64",
+                        "media_type": "image/png",
+                        "data": image_data,
                     },
                 )
                 content.append(image_block)
@@ -1383,19 +1350,14 @@ class BrowserAgent(AliasAgentBase):
             ),
         ]
         # Attach screenshot if available
+
         if image_data:
-            image_data = base64.b64decode(image_data)
-            img_path = os.path.join(
-                self.screenshot_dir,
-                f"screenshot_image_understanding_{self.iter_n}.png",
-            )
-            with open(img_path, "wb") as fi:
-                fi.write(image_data)
             image_block = ImageBlock(
                 type="image",
                 source={
-                    "type": "url",
-                    "url": img_path,
+                    "type": "base64",
+                    "media_type": "image/png",
+                    "data": image_data,
                 },
             )
             content_blocks.append(image_block)

From fd2b408fb2f5a15990554462dd1075ee2410f220 Mon Sep 17 00:00:00 2001
From: cuiyuebing <cuipaofu@gmail.com>
Date: Tue, 4 Nov 2025 12:02:16 +0800
Subject: [PATCH 2/2] fix block type, optimize print messages, update gitignore

---
 .gitignore                                    |  3 ++
 alias/.gitignore                              |  3 --
 .../src/alias/agent/agents/_browser_agent.py  | 45 +++++++++----------
 3 files changed, 23 insertions(+), 28 deletions(-)

diff --git a/.gitignore b/.gitignore
index 70d227b..b6fa477 100644
--- a/.gitignore
+++ b/.gitignore
@@ -61,3 +61,6 @@ uv.lock
 # Logs
 logs/
 *.log
+
+# Agent-generated files
+**sessions_mount_dir/
diff --git a/alias/.gitignore b/alias/.gitignore
index 5c49376..44f9db5 100644
--- a/alias/.gitignore
+++ b/alias/.gitignore
@@ -7,12 +7,9 @@ __pycache__/
 # Logs
 logs/
 src/alias/agent/agents/log/
-sessions_mount_dir/
-
 # Python
 *.py[cod]
 *$py.class
 
 # Package
 alias.egg-info/
-
diff --git a/alias/src/alias/agent/agents/_browser_agent.py b/alias/src/alias/agent/agents/_browser_agent.py
index 375acfa..964abbb 100644
--- a/alias/src/alias/agent/agents/_browser_agent.py
+++ b/alias/src/alias/agent/agents/_browser_agent.py
@@ -22,6 +22,7 @@ from agentscope.message import (
     TextBlock,
     ToolResultBlock,
     ImageBlock,
+    Base64Source,
 )
 from agentscope.model import ChatModelBase
 from agentscope.tool import (
@@ -280,7 +281,7 @@ class BrowserAgent(AliasAgentBase):
 
     async def _pure_reasoning(
         self,
-    ):
+    ) -> Msg:
         msg = Msg(
             "user",
             content=self.pure_reasoning_prompt.format(
@@ -310,7 +311,7 @@ class BrowserAgent(AliasAgentBase):
                 msg = Msg(self.name, [], "assistant")
                 async for content_chunk in res:
                     msg.content = content_chunk.content
-                await self.print(msg, False)
+                await self.print(msg)
             else:
                 msg = Msg(self.name, list(res.content), "assistant")
                 await self.print(msg)
@@ -330,12 +331,6 @@ class BrowserAgent(AliasAgentBase):
 
             # Post-process for user interruption
             if interrupted_by_user and msg:
-                # Fake tool results
-                tool_use_blocks: list = (
-                    msg.get_content_blocks(  # pylint: disable=E1133
-                        "tool_use",
-                    )
-                )
                 for tool_call in tool_use_blocks:  # pylint: disable=E1133
                     msg_res = Msg(
                         "system",
@@ -352,7 +347,7 @@ class BrowserAgent(AliasAgentBase):
                     )
 
                     await self.memory.add(msg_res)
-                    await self.print(msg_res, True)
+                    await self.print(msg_res)
 
     async def _reasoning_with_observation(
         self,
@@ -428,7 +423,7 @@ class BrowserAgent(AliasAgentBase):
                     )
 
                     await self.memory.add(msg_res)
-                    await self.print(msg_res, True)
+                    await self.print(msg_res)
             if not self.chunk_continue_status:
                 break
 
@@ -522,7 +517,6 @@ class BrowserAgent(AliasAgentBase):
                 Return a message to the user if the `_finish_function` is
                 called, otherwise return `None`.
         """
-
         tool_res_msg = Msg(
             "system",
             [
@@ -547,6 +541,7 @@ class BrowserAgent(AliasAgentBase):
                     "output"
                 ] = chunk.content
                 # Return message if generate_response is called successfully
+
                 if tool_call[
                     "name"
                 ] == self.finish_function_name and chunk.metadata.get(
@@ -573,7 +568,8 @@ class BrowserAgent(AliasAgentBase):
                     await self.memory.delete(mem_len - 1)
             else:
                 await self.memory.add(tool_res_msg)
-            await self.print(tool_res_msg, False)
+            if tool_call["name"] != self.finish_function_name:
+                await self.print(tool_res_msg)
 
     def _clean_tool_excution_content(
         self,
@@ -623,11 +619,11 @@ class BrowserAgent(AliasAgentBase):
             async for content_chunk in res:
                 decompose_text = content_chunk.content[0]["text"]
                 print_msg.content = content_chunk.content
-                await self.print(print_msg, last=False)
+                await self.print(print_msg, False)
         else:
             decompose_text = res.content[0]["text"]
         print_msg.content = [TextBlock(type="text", text=decompose_text)]
-        await self.print(print_msg, last=True)
+        await self.print(print_msg, True)
 
         # Use path relative to this file for robustness
         reflection_prompt_path = os.path.join(
@@ -790,7 +786,6 @@ class BrowserAgent(AliasAgentBase):
         snapshot_in_chunk = self._split_snapshot_by_chunk(
             snapshot_str,
         )
-
         return snapshot_in_chunk
 
     async def _memory_summarizing(self) -> None:
@@ -993,11 +988,11 @@ class BrowserAgent(AliasAgentBase):
             if image_data:
                 image_block = ImageBlock(
                     type="image",
-                    source={
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": image_data,
-                    },
+                    source=Base64Source(
+                        type="base64",
+                        media_type="image/png",
+                        data=image_data,
+                    ),
                 )
                 content.append(image_block)
 
@@ -1354,11 +1349,11 @@ class BrowserAgent(AliasAgentBase):
         if image_data:
             image_block = ImageBlock(
                 type="image",
-                source={
-                    "type": "base64",
-                    "media_type": "image/png",
-                    "data": image_data,
-                },
+                source=Base64Source(
+                    type="base64",
+                    media_type="image/png",
+                    data=image_data,
+                ),
             )
             content_blocks.append(image_block)