From ac8c5255f8fae6855a022e2a358ee10c1fd18421 Mon Sep 17 00:00:00 2001 From: cuiyuebing Date: Fri, 31 Oct 2025 18:03:37 +0800 Subject: [PATCH 1/2] use base64 data for image context --- .../src/alias/agent/agents/_browser_agent.py | 80 +++++-------------- 1 file changed, 21 insertions(+), 59 deletions(-) diff --git a/alias/src/alias/agent/agents/_browser_agent.py b/alias/src/alias/agent/agents/_browser_agent.py index 6424c93..375acfa 100644 --- a/alias/src/alias/agent/agents/_browser_agent.py +++ b/alias/src/alias/agent/agents/_browser_agent.py @@ -11,8 +11,6 @@ import json from typing import Type, Optional, Any import asyncio import copy -import base64 -import shutil from loguru import logger from pydantic import BaseModel @@ -194,22 +192,6 @@ class BrowserAgent(AliasAgentBase): self.toolkit.register_tool_function(self.browser_subtask_manager) self.toolkit.register_tool_function(self.image_understanding) - if ( - self.model.model_name.startswith("qvq") - or "-vl" in self.model.model_name - or "4o" in self.model.model_name - or "gpt-5" in self.model.model_name - ): - # If the model supports multimodal input, - # prepare a directory for screenshots - screenshot_dir = os.path.join( - "./logs/screenshots/", - "tmp" + "_browser_agent", - ) - if os.path.exists(screenshot_dir): - shutil.rmtree(screenshot_dir) - os.makedirs(screenshot_dir, exist_ok=True) - self.screenshot_dir = screenshot_dir self.no_screenshot_tool_list = [ tool for tool in self.toolkit.get_json_schemas() @@ -243,7 +225,7 @@ class BrowserAgent(AliasAgentBase): if isinstance(msg, list) else "" ) - + if self.start_url and not self._has_initial_navigated: await self._navigate_to_start_url() self._has_initial_navigated = True @@ -264,7 +246,6 @@ class BrowserAgent(AliasAgentBase): await self._summarize_mem() msg_reasoning = await self._pure_reasoning() - tool_calls = msg_reasoning.get_content_blocks("tool_use") if tool_calls and tool_calls[0]["name"] == "browser_snapshot": msg_reasoning = await self._reasoning_with_observation() @@ -389,7 +370,6 @@ class BrowserAgent(AliasAgentBase): for _ in self.snapshot_in_chunk: observe_msg = await self._build_observation() - prompt = await self.formatter.format( msgs=[ Msg("system", self.sys_prompt, "system"), @@ -467,8 +447,7 @@ class BrowserAgent(AliasAgentBase): self, ) -> Msg: """Get a snapshot in text before reasoning""" - - image_path: Optional[str] = None + image_data: Optional[str] = None if ( self.model.model_name.startswith("qvq") or "-vl" in self.model.model_name @@ -476,17 +455,10 @@ class BrowserAgent(AliasAgentBase): or "gpt-5" in self.model.model_name ): # If the model supports multimodal input, take a screenshot - # and pass it to the observation message - img_path = os.path.join( - self.screenshot_dir, - f"screenshot_{self.iter_n}.png", - ) - # if the img_path already exists, - # do not need to take a screenshot again - if not os.path.exists(img_path): - image_path = await self._get_screenshot(img_path) + # and pass it to the observation message as base64 + image_data = await self._get_screenshot() - observe_msg = self.observe_by_chunk(image_path) + observe_msg = self.observe_by_chunk(image_data) return observe_msg async def _update_chunk_observation_status( @@ -902,11 +874,10 @@ class BrowserAgent(AliasAgentBase): for msg in summarized_memory: await self.memory.add(msg) - async def _get_screenshot(self, img_path: str = "") -> Optional[str]: + async def _get_screenshot(self) -> Optional[str]: """ - Optionally take a screenshot of the current web page - for use in multimodal prompts. - Returns the path to the image if available, else None. + Optionally take a screenshot of the current web page for multimodal prompts. + Returns base64-encoded PNG data if available, else None. """ try: # Prepare tool call for screenshot @@ -920,7 +891,7 @@ class BrowserAgent(AliasAgentBase): screenshot_response = await self.toolkit.call_tool_function( tool_call, ) - # Extract image path from response + # Extract image base64 from response async for chunk in screenshot_response: if ( chunk.content @@ -928,17 +899,12 @@ class BrowserAgent(AliasAgentBase): and "data" in chunk.content[1] ): image_data = chunk.content[1]["data"] - image_data = base64.b64decode(image_data) - with open(img_path, "wb") as fi: - fi.write(image_data) - returned_img_path = img_path - # Exit loop on success else: - returned_img_path = None + image_data = None except Exception: - returned_img_path = None - return returned_img_path + image_data = None + return image_data @staticmethod def _filter_execution_text( @@ -993,7 +959,7 @@ class BrowserAgent(AliasAgentBase): for i in range(0, len(snapshot_str), max_length) ] - def observe_by_chunk(self, image_path: str | None = "") -> Msg: + def observe_by_chunk(self, image_data: str | None = "") -> Msg: """Create an observation message for chunk-based reasoning. This method formats the current chunk of the webpage snapshot with @@ -1024,12 +990,13 @@ class BrowserAgent(AliasAgentBase): or "4o" in self.model.model_name or "gpt-5" in self.model.model_name ): - if image_path: + if image_data: image_block = ImageBlock( type="image", source={ - "type": "url", - "url": image_path, + "type": "base64", + "media_type": "image/png", + "data": image_data, }, ) content.append(image_block) @@ -1383,19 +1350,14 @@ class BrowserAgent(AliasAgentBase): ), ] # Attach screenshot if available + if image_data: - image_data = base64.b64decode(image_data) - img_path = os.path.join( - self.screenshot_dir, - f"screenshot_image_understanding_{self.iter_n}.png", - ) - with open(img_path, "wb") as fi: - fi.write(image_data) image_block = ImageBlock( type="image", source={ - "type": "url", - "url": img_path, + "type": "base64", + "media_type": "image/png", + "data": image_data, }, ) content_blocks.append(image_block) From fd2b408fb2f5a15990554462dd1075ee2410f220 Mon Sep 17 00:00:00 2001 From: cuiyuebing Date: Tue, 4 Nov 2025 12:02:16 +0800 Subject: [PATCH 2/2] fix block type, optimize print messages, update gitignore --- .gitignore | 3 ++ alias/.gitignore | 3 -- .../src/alias/agent/agents/_browser_agent.py | 45 +++++++++---------- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/.gitignore b/.gitignore index 70d227b..b6fa477 100644 --- a/.gitignore +++ b/.gitignore @@ -61,3 +61,6 @@ uv.lock # Logs logs/ *.log + +# Agent-generated files +**sessions_mount_dir/ diff --git a/alias/.gitignore b/alias/.gitignore index 5c49376..44f9db5 100644 --- a/alias/.gitignore +++ b/alias/.gitignore @@ -7,12 +7,9 @@ __pycache__/ # Logs logs/ src/alias/agent/agents/log/ -sessions_mount_dir/ - # Python *.py[cod] *$py.class # Package alias.egg-info/ - diff --git a/alias/src/alias/agent/agents/_browser_agent.py b/alias/src/alias/agent/agents/_browser_agent.py index 375acfa..964abbb 100644 --- a/alias/src/alias/agent/agents/_browser_agent.py +++ b/alias/src/alias/agent/agents/_browser_agent.py @@ -22,6 +22,7 @@ from agentscope.message import ( TextBlock, ToolResultBlock, ImageBlock, + Base64Source, ) from agentscope.model import ChatModelBase from agentscope.tool import ( @@ -280,7 +281,7 @@ class BrowserAgent(AliasAgentBase): async def _pure_reasoning( self, - ): + ) -> Msg: msg = Msg( "user", content=self.pure_reasoning_prompt.format( @@ -310,7 +311,7 @@ class BrowserAgent(AliasAgentBase): msg = Msg(self.name, [], "assistant") async for content_chunk in res: msg.content = content_chunk.content - await self.print(msg, False) + await self.print(msg) else: msg = Msg(self.name, list(res.content), "assistant") await self.print(msg) @@ -330,12 +331,6 @@ class BrowserAgent(AliasAgentBase): # Post-process for user interruption if interrupted_by_user and msg: - # Fake tool results - tool_use_blocks: list = ( - msg.get_content_blocks( # pylint: disable=E1133 - "tool_use", - ) - ) for tool_call in tool_use_blocks: # pylint: disable=E1133 msg_res = Msg( "system", @@ -352,7 +347,7 @@ class BrowserAgent(AliasAgentBase): ) await self.memory.add(msg_res) - await self.print(msg_res, True) + await self.print(msg_res) async def _reasoning_with_observation( self, @@ -428,7 +423,7 @@ class BrowserAgent(AliasAgentBase): ) await self.memory.add(msg_res) - await self.print(msg_res, True) + await self.print(msg_res) if not self.chunk_continue_status: break @@ -522,7 +517,6 @@ class BrowserAgent(AliasAgentBase): Return a message to the user if the `_finish_function` is called, otherwise return `None`. """ - tool_res_msg = Msg( "system", [ @@ -547,6 +541,7 @@ class BrowserAgent(AliasAgentBase): "output" ] = chunk.content # Return message if generate_response is called successfully + if tool_call[ "name" ] == self.finish_function_name and chunk.metadata.get( @@ -573,7 +568,8 @@ class BrowserAgent(AliasAgentBase): await self.memory.delete(mem_len - 1) else: await self.memory.add(tool_res_msg) - await self.print(tool_res_msg, False) + if tool_call["name"] != self.finish_function_name: + await self.print(tool_res_msg) def _clean_tool_excution_content( self, @@ -623,11 +619,11 @@ class BrowserAgent(AliasAgentBase): async for content_chunk in res: decompose_text = content_chunk.content[0]["text"] print_msg.content = content_chunk.content - await self.print(print_msg, last=False) + await self.print(print_msg, False) else: decompose_text = res.content[0]["text"] print_msg.content = [TextBlock(type="text", text=decompose_text)] - await self.print(print_msg, last=True) + await self.print(print_msg, True) # Use path relative to this file for robustness reflection_prompt_path = os.path.join( @@ -790,7 +786,6 @@ class BrowserAgent(AliasAgentBase): snapshot_in_chunk = self._split_snapshot_by_chunk( snapshot_str, ) - return snapshot_in_chunk async def _memory_summarizing(self) -> None: @@ -993,11 +988,11 @@ class BrowserAgent(AliasAgentBase): if image_data: image_block = ImageBlock( type="image", - source={ - "type": "base64", - "media_type": "image/png", - "data": image_data, - }, + source=Base64Source( + type="base64", + media_type="image/png", + data=image_data, + ), ) content.append(image_block) @@ -1354,11 +1349,11 @@ class BrowserAgent(AliasAgentBase): if image_data: image_block = ImageBlock( type="image", - source={ - "type": "base64", - "media_type": "image/png", - "data": image_data, - }, + source=Base64Source( + type="base64", + media_type="image/png", + data=image_data, + ), ) content_blocks.append(image_block)