init

evaluation/ace_bench/README.md (Normal file, 18 lines)
@@ -0,0 +1,18 @@
# ACEBench Example

This is an example of agent-oriented evaluation in AgentScope.

We take [ACEBench](https://github.com/ACEBench/ACEBench) as an example benchmark and run a
ReAct agent with a [Ray](https://github.com/ray-project/ray)-based evaluator, which supports
**distributed** and **parallel** evaluation.
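
In brief, `main.py` wires a solution function into the evaluator as sketched below
(a condensed excerpt; the paths and worker count are placeholders, and
`react_agent_solution` is the solution function defined in `main.py`):

```python
from agentscope.evaluate import ACEBenchmark, FileEvaluatorStorage, RayEvaluator

evaluator = RayEvaluator(
    name="ACEbench evaluation",
    benchmark=ACEBenchmark(data_dir="./data"),           # ACEBench dataset location
    n_repeat=1,                                          # repetitions per task
    storage=FileEvaluatorStorage(save_dir="./results"),  # where results are written
    n_workers=4,                                         # parallel Ray workers
)
await evaluator.run(react_agent_solution)
```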

To run the example, install AgentScope first, then start the evaluation with the following command:

```bash
python main.py --data_dir {data_dir} --result_dir {result_dir}
```
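
The model API key is read from the `DASHSCOPE_API_KEY` environment variable (see `main.py`),
and an optional `--n_workers` flag (default 1) controls how many Ray workers evaluate in
parallel. A typical session might look like the following, with placeholder paths and key:

```bash
pip install -r requirements.txt
export DASHSCOPE_API_KEY="your-api-key"
python main.py --data_dir ./data --result_dir ./results --n_workers 4
```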

## Further Reading

- [ACEBench](https://github.com/ACEBench/ACEBench)
- [Ray](https://github.com/ray-project/ray)

evaluation/ace_bench/main.py (Normal file, 132 lines)
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""Example of running ACEBench evaluation with AgentScope."""
import asyncio
import os
from argparse import ArgumentParser
from typing import Callable

from agentscope.agent import ReActAgent
from agentscope.evaluate import (
    ACEBenchmark,
    ACEPhone,
    FileEvaluatorStorage,
    RayEvaluator,
    SolutionOutput,
    Task,
)
from agentscope.formatter import DashScopeChatFormatter
from agentscope.message import Msg
from agentscope.model import DashScopeChatModel
from agentscope.tool import Toolkit


async def react_agent_solution(
    ace_task: Task,
    pre_hook: Callable,
) -> SolutionOutput:
    """Run a ReAct agent on the given ACEBench task.

    Args:
        ace_task (`Task`):
            The ACEBench task to solve.
        pre_hook (`Callable`):
            The pre-hook function that saves the agent's pre-print messages.

    Returns:
        `SolutionOutput`:
            The solution output, carrying the final environment state and the
            tool-call trajectory.
    """
    # Equip the agent with the task-specific tool functions
    toolkit = Toolkit()
    for tool, json_schema in ace_task.metadata["tools"]:
        # Register each tool function with its given JSON schema
        toolkit.register_tool_function(tool, json_schema=json_schema)

    # Create a ReAct agent
    agent = ReActAgent(
        name="Friday",
        sys_prompt="You are a helpful assistant named Friday. "
        "Your target is to solve the given task with your tools. "
        "Try to solve the task as best as you can.",
        model=DashScopeChatModel(
            api_key=os.environ.get("DASHSCOPE_API_KEY"),
            model_name="qwen-max",
            stream=False,
        ),
        formatter=DashScopeChatFormatter(),
        toolkit=toolkit,
    )
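
    # Register the evaluator-provided pre_hook so that the agent's printed
    # messages are saved as evaluation logs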
    agent.register_instance_hook(
        "pre_print",
        "save_logging",
        pre_hook,
    )

    # Execute the agent to solve the task
    msg_input = Msg("user", ace_task.input, role="user")
    # Print the input through the agent to trigger the pre-print hook
    await agent.print(msg_input)
    await agent(msg_input)

    # Obtain the sequence of tool calls from the agent's memory, collecting
    # the tool_use blocks as the solution trajectory
    memory_msgs = await agent.memory.get_memory()
    traj = []
    for msg in memory_msgs:
        traj.extend(msg.get_content_blocks("tool_use"))

    # Obtain the final state of the phone and travel system
    phone: ACEPhone = ace_task.metadata["phone"]
    final_state = phone.get_current_state()

    # Wrap the result into a SolutionOutput
    solution = SolutionOutput(
        success=True,
        output=final_state,
        trajectory=traj,
    )
    return solution


async def main() -> None:
    """Main function for running the ACEBench evaluation."""
    # Parse the data and result directories from the command line
    parser = ArgumentParser()
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Where to save the dataset.",
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        required=True,
        help="Where to save the evaluation results.",
    )
    parser.add_argument(
        "--n_workers",
        type=int,
        default=1,
        help="The number of Ray workers to use for evaluation.",
    )
    args = parser.parse_args()

    # Create the evaluator; for local debugging, GeneralEvaluator is the more
    # suitable choice
    evaluator = RayEvaluator(
        name="ACEbench evaluation",
        benchmark=ACEBenchmark(
            data_dir=args.data_dir,
        ),
        # How many times to repeat each task
        n_repeat=1,
        storage=FileEvaluatorStorage(
            save_dir=args.result_dir,
        ),
        # How many parallel workers to use
        n_workers=args.n_workers,
    )

    # Run the evaluation
    await evaluator.run(react_agent_solution)
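

# Entry point: run the async main coroutine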
asyncio.run(main())
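
The comment in `main.py` mentions `GeneralEvaluator` as the alternative better suited to
local debugging. A minimal sketch of the swap, assuming `GeneralEvaluator` is importable
from `agentscope.evaluate` and mirrors `RayEvaluator`'s constructor arguments (an
assumption based on that comment, not confirmed by this commit):

```python
from agentscope.evaluate import GeneralEvaluator  # assumed import path

# Hypothetical drop-in replacement for RayEvaluator during local debugging
evaluator = GeneralEvaluator(
    name="ACEbench evaluation",
    benchmark=ACEBenchmark(data_dir=args.data_dir),
    n_repeat=1,
    storage=FileEvaluatorStorage(save_dir=args.result_dir),
)
await evaluator.run(react_agent_solution)
```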

evaluation/ace_bench/requirements.txt (Normal file, 1 line)
@@ -0,0 +1 @@
agentscope[full]>=1.0.5