Files
evotraders/evaluation/ace_bench/main.py
raykkk 7d0451131f init
2025-10-17 21:40:45 +08:00

133 lines
3.6 KiB
Python

# -*- coding: utf-8 -*-
"""Example of running ACEBench evaluation with AgentScope."""
import asyncio
import os
from argparse import ArgumentParser
from typing import Callable
from agentscope.agent import ReActAgent
from agentscope.evaluate import (
ACEBenchmark,
ACEPhone,
FileEvaluatorStorage,
RayEvaluator,
SolutionOutput,
Task,
)
from agentscope.formatter import DashScopeChatFormatter
from agentscope.message import Msg
from agentscope.model import DashScopeChatModel
from agentscope.tool import Toolkit
async def react_agent_solution(
    ace_task: Task,
    pre_hook: Callable,
) -> SolutionOutput:
    """Run a ReAct agent on the given ACEBench task.

    Args:
        ace_task (`Task`):
            Task to run in ACEBench. Its ``metadata["tools"]`` pairs are
            registered as the agent's tools, and ``metadata["phone"]`` is
            queried for the final state once the agent has finished.
        pre_hook (`Callable`):
            The pre-hook function to save the agent's pre-print messages.

    Returns:
        `SolutionOutput`:
            Carries the final state reported by the task's phone object as
            ``output`` and the ``tool_use`` blocks collected from the
            agent's memory as ``trajectory``.
    """
    # Equip tool functions: each entry pairs a callable with its JSON schema
    toolkit = Toolkit()
    for tool, json_schema in ace_task.metadata["tools"]:
        toolkit.register_tool_function(tool, json_schema=json_schema)

    # Create a ReAct agent
    agent = ReActAgent(
        name="Friday",
        # Adjacent literals are concatenated verbatim, so every fragment
        # except the last must end with a space to keep sentences apart.
        sys_prompt=(
            "You are a helpful assistant named Friday. "
            "Your target is to solve the given task with your tools. "
            "Try to solve the task as best as you can."
        ),
        model=DashScopeChatModel(
            api_key=os.environ.get("DASHSCOPE_API_KEY"),
            model_name="qwen-max",
            stream=False,
        ),
        formatter=DashScopeChatFormatter(),
        toolkit=toolkit,
    )
    agent.register_instance_hook(
        "pre_print",
        "save_logging",
        pre_hook,
    )

    # Execute the agent to solve the task
    msg_input = Msg("user", ace_task.input, role="user")
    # Print the input through the agent so the pre-print hook sees it too
    await agent.print(msg_input)
    await agent(msg_input)

    # Collect the tool_use blocks from the agent's memory as the trajectory
    memory_msgs = await agent.memory.get_memory()
    traj = []
    for msg in memory_msgs:
        traj.extend(msg.get_content_blocks("tool_use"))

    # Obtain the final state from the task's phone object
    phone: ACEPhone = ace_task.metadata["phone"]
    final_state = phone.get_current_state()

    # Wrap into a SolutionOutput; success is reported unconditionally here
    return SolutionOutput(
        success=True,
        output=final_state,
        trajectory=traj,
    )
async def main() -> None:
    """Parse the command-line options and run the ACEBench evaluation."""
    arg_parser = ArgumentParser()

    # The two directory options share the same shape: required strings
    required_dirs = (
        ("--data_dir", "Where to save the dataset."),
        ("--result_dir", "Where to save the evaluation results."),
    )
    for flag, description in required_dirs:
        arg_parser.add_argument(
            flag,
            type=str,
            required=True,
            help=description,
        )
    arg_parser.add_argument(
        "--n_workers",
        type=int,
        default=1,
        help="The number of ray workers to use for evaluation.",
    )
    cli_args = arg_parser.parse_args()

    # Benchmark data comes from --data_dir, results go to --result_dir
    benchmark = ACEBenchmark(data_dir=cli_args.data_dir)
    storage = FileEvaluatorStorage(save_dir=cli_args.result_dir)

    # RayEvaluator is used here; GeneralEvaluator is an alternative that is
    # more suitable for local debugging.
    evaluator = RayEvaluator(
        name="ACEbench evaluation",
        benchmark=benchmark,
        n_repeat=1,  # how many times to repeat
        storage=storage,
        n_workers=cli_args.n_workers,  # how many workers to use
    )

    # Run the evaluation with the ReAct agent solution
    await evaluator.run(react_agent_solution)
# Only launch the evaluation when this file is executed as a script, so that
# importing the module does not start an evaluation run as a side effect.
if __name__ == "__main__":
    asyncio.run(main())