update browser-use agent in alias use base64 data for image context
update browser-use agent in alias use base64 data for image context
This commit is contained in:
3
.gitignore
vendored
3
.gitignore
vendored
@@ -61,3 +61,6 @@ uv.lock
|
|||||||
# Logs
|
# Logs
|
||||||
logs/
|
logs/
|
||||||
*.log
|
*.log
|
||||||
|
|
||||||
|
# Agent-generated files
|
||||||
|
**sessions_mount_dir/
|
||||||
|
|||||||
3
alias/.gitignore
vendored
3
alias/.gitignore
vendored
@@ -7,12 +7,9 @@ __pycache__/
|
|||||||
# Logs
|
# Logs
|
||||||
logs/
|
logs/
|
||||||
src/alias/agent/agents/log/
|
src/alias/agent/agents/log/
|
||||||
sessions_mount_dir/
|
|
||||||
|
|
||||||
# Python
|
# Python
|
||||||
*.py[cod]
|
*.py[cod]
|
||||||
*$py.class
|
*$py.class
|
||||||
|
|
||||||
# Package
|
# Package
|
||||||
alias.egg-info/
|
alias.egg-info/
|
||||||
|
|
||||||
|
|||||||
@@ -11,8 +11,6 @@ import json
|
|||||||
from typing import Type, Optional, Any
|
from typing import Type, Optional, Any
|
||||||
import asyncio
|
import asyncio
|
||||||
import copy
|
import copy
|
||||||
import base64
|
|
||||||
import shutil
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
|
|
||||||
@@ -24,6 +22,7 @@ from agentscope.message import (
|
|||||||
TextBlock,
|
TextBlock,
|
||||||
ToolResultBlock,
|
ToolResultBlock,
|
||||||
ImageBlock,
|
ImageBlock,
|
||||||
|
Base64Source,
|
||||||
)
|
)
|
||||||
from agentscope.model import ChatModelBase
|
from agentscope.model import ChatModelBase
|
||||||
from agentscope.tool import (
|
from agentscope.tool import (
|
||||||
@@ -194,22 +193,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
self.toolkit.register_tool_function(self.browser_subtask_manager)
|
self.toolkit.register_tool_function(self.browser_subtask_manager)
|
||||||
self.toolkit.register_tool_function(self.image_understanding)
|
self.toolkit.register_tool_function(self.image_understanding)
|
||||||
|
|
||||||
if (
|
|
||||||
self.model.model_name.startswith("qvq")
|
|
||||||
or "-vl" in self.model.model_name
|
|
||||||
or "4o" in self.model.model_name
|
|
||||||
or "gpt-5" in self.model.model_name
|
|
||||||
):
|
|
||||||
# If the model supports multimodal input,
|
|
||||||
# prepare a directory for screenshots
|
|
||||||
screenshot_dir = os.path.join(
|
|
||||||
"./logs/screenshots/",
|
|
||||||
"tmp" + "_browser_agent",
|
|
||||||
)
|
|
||||||
if os.path.exists(screenshot_dir):
|
|
||||||
shutil.rmtree(screenshot_dir)
|
|
||||||
os.makedirs(screenshot_dir, exist_ok=True)
|
|
||||||
self.screenshot_dir = screenshot_dir
|
|
||||||
self.no_screenshot_tool_list = [
|
self.no_screenshot_tool_list = [
|
||||||
tool
|
tool
|
||||||
for tool in self.toolkit.get_json_schemas()
|
for tool in self.toolkit.get_json_schemas()
|
||||||
@@ -243,7 +226,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
if isinstance(msg, list)
|
if isinstance(msg, list)
|
||||||
else ""
|
else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.start_url and not self._has_initial_navigated:
|
if self.start_url and not self._has_initial_navigated:
|
||||||
await self._navigate_to_start_url()
|
await self._navigate_to_start_url()
|
||||||
self._has_initial_navigated = True
|
self._has_initial_navigated = True
|
||||||
@@ -264,7 +247,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
await self._summarize_mem()
|
await self._summarize_mem()
|
||||||
|
|
||||||
msg_reasoning = await self._pure_reasoning()
|
msg_reasoning = await self._pure_reasoning()
|
||||||
|
|
||||||
tool_calls = msg_reasoning.get_content_blocks("tool_use")
|
tool_calls = msg_reasoning.get_content_blocks("tool_use")
|
||||||
if tool_calls and tool_calls[0]["name"] == "browser_snapshot":
|
if tool_calls and tool_calls[0]["name"] == "browser_snapshot":
|
||||||
msg_reasoning = await self._reasoning_with_observation()
|
msg_reasoning = await self._reasoning_with_observation()
|
||||||
@@ -299,7 +281,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
|
|
||||||
async def _pure_reasoning(
|
async def _pure_reasoning(
|
||||||
self,
|
self,
|
||||||
):
|
) -> Msg:
|
||||||
msg = Msg(
|
msg = Msg(
|
||||||
"user",
|
"user",
|
||||||
content=self.pure_reasoning_prompt.format(
|
content=self.pure_reasoning_prompt.format(
|
||||||
@@ -329,7 +311,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
msg = Msg(self.name, [], "assistant")
|
msg = Msg(self.name, [], "assistant")
|
||||||
async for content_chunk in res:
|
async for content_chunk in res:
|
||||||
msg.content = content_chunk.content
|
msg.content = content_chunk.content
|
||||||
await self.print(msg, False)
|
await self.print(msg)
|
||||||
else:
|
else:
|
||||||
msg = Msg(self.name, list(res.content), "assistant")
|
msg = Msg(self.name, list(res.content), "assistant")
|
||||||
await self.print(msg)
|
await self.print(msg)
|
||||||
@@ -349,12 +331,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
|
|
||||||
# Post-process for user interruption
|
# Post-process for user interruption
|
||||||
if interrupted_by_user and msg:
|
if interrupted_by_user and msg:
|
||||||
# Fake tool results
|
|
||||||
tool_use_blocks: list = (
|
|
||||||
msg.get_content_blocks( # pylint: disable=E1133
|
|
||||||
"tool_use",
|
|
||||||
)
|
|
||||||
)
|
|
||||||
for tool_call in tool_use_blocks: # pylint: disable=E1133
|
for tool_call in tool_use_blocks: # pylint: disable=E1133
|
||||||
msg_res = Msg(
|
msg_res = Msg(
|
||||||
"system",
|
"system",
|
||||||
@@ -371,7 +347,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
await self.memory.add(msg_res)
|
await self.memory.add(msg_res)
|
||||||
await self.print(msg_res, True)
|
await self.print(msg_res)
|
||||||
|
|
||||||
async def _reasoning_with_observation(
|
async def _reasoning_with_observation(
|
||||||
self,
|
self,
|
||||||
@@ -389,7 +365,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
|
|
||||||
for _ in self.snapshot_in_chunk:
|
for _ in self.snapshot_in_chunk:
|
||||||
observe_msg = await self._build_observation()
|
observe_msg = await self._build_observation()
|
||||||
|
|
||||||
prompt = await self.formatter.format(
|
prompt = await self.formatter.format(
|
||||||
msgs=[
|
msgs=[
|
||||||
Msg("system", self.sys_prompt, "system"),
|
Msg("system", self.sys_prompt, "system"),
|
||||||
@@ -448,7 +423,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
)
|
)
|
||||||
|
|
||||||
await self.memory.add(msg_res)
|
await self.memory.add(msg_res)
|
||||||
await self.print(msg_res, True)
|
await self.print(msg_res)
|
||||||
if not self.chunk_continue_status:
|
if not self.chunk_continue_status:
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -467,8 +442,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
self,
|
self,
|
||||||
) -> Msg:
|
) -> Msg:
|
||||||
"""Get a snapshot in text before reasoning"""
|
"""Get a snapshot in text before reasoning"""
|
||||||
|
image_data: Optional[str] = None
|
||||||
image_path: Optional[str] = None
|
|
||||||
if (
|
if (
|
||||||
self.model.model_name.startswith("qvq")
|
self.model.model_name.startswith("qvq")
|
||||||
or "-vl" in self.model.model_name
|
or "-vl" in self.model.model_name
|
||||||
@@ -476,17 +450,10 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
or "gpt-5" in self.model.model_name
|
or "gpt-5" in self.model.model_name
|
||||||
):
|
):
|
||||||
# If the model supports multimodal input, take a screenshot
|
# If the model supports multimodal input, take a screenshot
|
||||||
# and pass it to the observation message
|
# and pass it to the observation message as base64
|
||||||
img_path = os.path.join(
|
image_data = await self._get_screenshot()
|
||||||
self.screenshot_dir,
|
|
||||||
f"screenshot_{self.iter_n}.png",
|
|
||||||
)
|
|
||||||
# if the img_path already exists,
|
|
||||||
# do not need to take a screenshot again
|
|
||||||
if not os.path.exists(img_path):
|
|
||||||
image_path = await self._get_screenshot(img_path)
|
|
||||||
|
|
||||||
observe_msg = self.observe_by_chunk(image_path)
|
observe_msg = self.observe_by_chunk(image_data)
|
||||||
return observe_msg
|
return observe_msg
|
||||||
|
|
||||||
async def _update_chunk_observation_status(
|
async def _update_chunk_observation_status(
|
||||||
@@ -550,7 +517,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
Return a message to the user if the `_finish_function` is
|
Return a message to the user if the `_finish_function` is
|
||||||
called, otherwise return `None`.
|
called, otherwise return `None`.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
tool_res_msg = Msg(
|
tool_res_msg = Msg(
|
||||||
"system",
|
"system",
|
||||||
[
|
[
|
||||||
@@ -575,6 +541,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
"output"
|
"output"
|
||||||
] = chunk.content
|
] = chunk.content
|
||||||
# Return message if generate_response is called successfully
|
# Return message if generate_response is called successfully
|
||||||
|
|
||||||
if tool_call[
|
if tool_call[
|
||||||
"name"
|
"name"
|
||||||
] == self.finish_function_name and chunk.metadata.get(
|
] == self.finish_function_name and chunk.metadata.get(
|
||||||
@@ -601,7 +568,8 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
await self.memory.delete(mem_len - 1)
|
await self.memory.delete(mem_len - 1)
|
||||||
else:
|
else:
|
||||||
await self.memory.add(tool_res_msg)
|
await self.memory.add(tool_res_msg)
|
||||||
await self.print(tool_res_msg, False)
|
if tool_call["name"] != self.finish_function_name:
|
||||||
|
await self.print(tool_res_msg)
|
||||||
|
|
||||||
def _clean_tool_excution_content(
|
def _clean_tool_excution_content(
|
||||||
self,
|
self,
|
||||||
@@ -651,11 +619,11 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
async for content_chunk in res:
|
async for content_chunk in res:
|
||||||
decompose_text = content_chunk.content[0]["text"]
|
decompose_text = content_chunk.content[0]["text"]
|
||||||
print_msg.content = content_chunk.content
|
print_msg.content = content_chunk.content
|
||||||
await self.print(print_msg, last=False)
|
await self.print(print_msg, False)
|
||||||
else:
|
else:
|
||||||
decompose_text = res.content[0]["text"]
|
decompose_text = res.content[0]["text"]
|
||||||
print_msg.content = [TextBlock(type="text", text=decompose_text)]
|
print_msg.content = [TextBlock(type="text", text=decompose_text)]
|
||||||
await self.print(print_msg, last=True)
|
await self.print(print_msg, True)
|
||||||
|
|
||||||
# Use path relative to this file for robustness
|
# Use path relative to this file for robustness
|
||||||
reflection_prompt_path = os.path.join(
|
reflection_prompt_path = os.path.join(
|
||||||
@@ -818,7 +786,6 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
snapshot_in_chunk = self._split_snapshot_by_chunk(
|
snapshot_in_chunk = self._split_snapshot_by_chunk(
|
||||||
snapshot_str,
|
snapshot_str,
|
||||||
)
|
)
|
||||||
|
|
||||||
return snapshot_in_chunk
|
return snapshot_in_chunk
|
||||||
|
|
||||||
async def _memory_summarizing(self) -> None:
|
async def _memory_summarizing(self) -> None:
|
||||||
@@ -902,11 +869,10 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
for msg in summarized_memory:
|
for msg in summarized_memory:
|
||||||
await self.memory.add(msg)
|
await self.memory.add(msg)
|
||||||
|
|
||||||
async def _get_screenshot(self, img_path: str = "") -> Optional[str]:
|
async def _get_screenshot(self) -> Optional[str]:
|
||||||
"""
|
"""
|
||||||
Optionally take a screenshot of the current web page
|
Optionally take a screenshot of the current web page for multimodal prompts.
|
||||||
for use in multimodal prompts.
|
Returns base64-encoded PNG data if available, else None.
|
||||||
Returns the path to the image if available, else None.
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
# Prepare tool call for screenshot
|
# Prepare tool call for screenshot
|
||||||
@@ -920,7 +886,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
screenshot_response = await self.toolkit.call_tool_function(
|
screenshot_response = await self.toolkit.call_tool_function(
|
||||||
tool_call,
|
tool_call,
|
||||||
)
|
)
|
||||||
# Extract image path from response
|
# Extract image base64 from response
|
||||||
async for chunk in screenshot_response:
|
async for chunk in screenshot_response:
|
||||||
if (
|
if (
|
||||||
chunk.content
|
chunk.content
|
||||||
@@ -928,17 +894,12 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
and "data" in chunk.content[1]
|
and "data" in chunk.content[1]
|
||||||
):
|
):
|
||||||
image_data = chunk.content[1]["data"]
|
image_data = chunk.content[1]["data"]
|
||||||
image_data = base64.b64decode(image_data)
|
|
||||||
with open(img_path, "wb") as fi:
|
|
||||||
fi.write(image_data)
|
|
||||||
returned_img_path = img_path
|
|
||||||
# Exit loop on success
|
|
||||||
else:
|
else:
|
||||||
returned_img_path = None
|
image_data = None
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
returned_img_path = None
|
image_data = None
|
||||||
return returned_img_path
|
return image_data
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _filter_execution_text(
|
def _filter_execution_text(
|
||||||
@@ -993,7 +954,7 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
for i in range(0, len(snapshot_str), max_length)
|
for i in range(0, len(snapshot_str), max_length)
|
||||||
]
|
]
|
||||||
|
|
||||||
def observe_by_chunk(self, image_path: str | None = "") -> Msg:
|
def observe_by_chunk(self, image_data: str | None = "") -> Msg:
|
||||||
"""Create an observation message for chunk-based reasoning.
|
"""Create an observation message for chunk-based reasoning.
|
||||||
|
|
||||||
This method formats the current chunk of the webpage snapshot with
|
This method formats the current chunk of the webpage snapshot with
|
||||||
@@ -1024,13 +985,14 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
or "4o" in self.model.model_name
|
or "4o" in self.model.model_name
|
||||||
or "gpt-5" in self.model.model_name
|
or "gpt-5" in self.model.model_name
|
||||||
):
|
):
|
||||||
if image_path:
|
if image_data:
|
||||||
image_block = ImageBlock(
|
image_block = ImageBlock(
|
||||||
type="image",
|
type="image",
|
||||||
source={
|
source=Base64Source(
|
||||||
"type": "url",
|
type="base64",
|
||||||
"url": image_path,
|
media_type="image/png",
|
||||||
},
|
data=image_data,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
content.append(image_block)
|
content.append(image_block)
|
||||||
|
|
||||||
@@ -1383,20 +1345,15 @@ class BrowserAgent(AliasAgentBase):
|
|||||||
),
|
),
|
||||||
]
|
]
|
||||||
# Attach screenshot if available
|
# Attach screenshot if available
|
||||||
|
|
||||||
if image_data:
|
if image_data:
|
||||||
image_data = base64.b64decode(image_data)
|
|
||||||
img_path = os.path.join(
|
|
||||||
self.screenshot_dir,
|
|
||||||
f"screenshot_image_understanding_{self.iter_n}.png",
|
|
||||||
)
|
|
||||||
with open(img_path, "wb") as fi:
|
|
||||||
fi.write(image_data)
|
|
||||||
image_block = ImageBlock(
|
image_block = ImageBlock(
|
||||||
type="image",
|
type="image",
|
||||||
source={
|
source=Base64Source(
|
||||||
"type": "url",
|
type="base64",
|
||||||
"url": img_path,
|
media_type="image/png",
|
||||||
},
|
data=image_data,
|
||||||
|
),
|
||||||
)
|
)
|
||||||
content_blocks.append(image_block)
|
content_blocks.append(image_block)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user