init

evaluation/ace_bench/README.md (Normal file, 18 lines)
@@ -0,0 +1,18 @@
# ACEBench Example

This is an example of agent-oriented evaluation in AgentScope.

We take [ACEBench](https://github.com/ACEBench/ACEBench) as an example benchmark and run a
ReAct agent with a [Ray](https://github.com/ray-project/ray)-based evaluator, which supports
**distributed** and **parallel** evaluation.
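
In brief, `main.py` wires a solution function into the evaluator as sketched below
(a condensed excerpt; the paths and worker count are placeholders, and
`react_agent_solution` is the solution function defined in `main.py`):

```python
from agentscope.evaluate import ACEBenchmark, FileEvaluatorStorage, RayEvaluator

evaluator = RayEvaluator(
    name="ACEbench evaluation",
    benchmark=ACEBenchmark(data_dir="./data"),           # ACEBench dataset location
    n_repeat=1,                                          # repetitions per task
    storage=FileEvaluatorStorage(save_dir="./results"),  # where results are written
    n_workers=4,                                         # parallel Ray workers
)
await evaluator.run(react_agent_solution)
```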

To run the example, install AgentScope first, then start the evaluation with the following command:

```bash
python main.py --data_dir {data_dir} --result_dir {result_dir}
```
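
The model API key is read from the `DASHSCOPE_API_KEY` environment variable (see `main.py`),
and an optional `--n_workers` flag (default 1) controls how many Ray workers evaluate in
parallel. A typical session might look like the following, with placeholder paths and key:

```bash
pip install -r requirements.txt
export DASHSCOPE_API_KEY="your-api-key"
python main.py --data_dir ./data --result_dir ./results --n_workers 4
```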

## Further Reading

- [ACEBench](https://github.com/ACEBench/ACEBench)
- [Ray](https://github.com/ray-project/ray)

evaluation/ace_bench/main.py (Normal file, 132 lines)
@@ -0,0 +1,132 @@
# -*- coding: utf-8 -*-
"""Example of running ACEBench evaluation with AgentScope."""
import asyncio
import os
from argparse import ArgumentParser
from typing import Callable

from agentscope.agent import ReActAgent
from agentscope.evaluate import (
    ACEBenchmark,
    ACEPhone,
    FileEvaluatorStorage,
    RayEvaluator,
    SolutionOutput,
    Task,
)
from agentscope.formatter import DashScopeChatFormatter
from agentscope.message import Msg
from agentscope.model import DashScopeChatModel
from agentscope.tool import Toolkit


async def react_agent_solution(
    ace_task: Task,
    pre_hook: Callable,
) -> SolutionOutput:
    """Run a ReAct agent on the given ACEBench task.

    Args:
        ace_task (`Task`):
            The ACEBench task to solve.
        pre_hook (`Callable`):
            The pre-hook function that saves the agent's pre-print messages.

    Returns:
        `SolutionOutput`:
            The solution output, carrying the final environment state and the
            tool-call trajectory.
    """
    # Equip the agent with the task-specific tool functions
    toolkit = Toolkit()
    for tool, json_schema in ace_task.metadata["tools"]:
        # Register each tool function with its given JSON schema
        toolkit.register_tool_function(tool, json_schema=json_schema)

    # Create a ReAct agent
    agent = ReActAgent(
        name="Friday",
        sys_prompt="You are a helpful assistant named Friday. "
        "Your target is to solve the given task with your tools. "
        "Try to solve the task as best as you can.",
        model=DashScopeChatModel(
            api_key=os.environ.get("DASHSCOPE_API_KEY"),
            model_name="qwen-max",
            stream=False,
        ),
        formatter=DashScopeChatFormatter(),
        toolkit=toolkit,
    )
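
    # Register the evaluator-provided pre_hook so that the agent's printed
    # messages are saved as evaluation logs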
    agent.register_instance_hook(
        "pre_print",
        "save_logging",
        pre_hook,
    )

    # Execute the agent to solve the task
    msg_input = Msg("user", ace_task.input, role="user")
    # Print the input through the agent to trigger the pre-print hook
    await agent.print(msg_input)
    await agent(msg_input)

    # Obtain the sequence of tool calls from the agent's memory, collecting
    # the tool_use blocks as the solution trajectory
    memory_msgs = await agent.memory.get_memory()
    traj = []
    for msg in memory_msgs:
        traj.extend(msg.get_content_blocks("tool_use"))

    # Obtain the final state of the phone and travel system
    phone: ACEPhone = ace_task.metadata["phone"]
    final_state = phone.get_current_state()

    # Wrap the result into a SolutionOutput
    solution = SolutionOutput(
        success=True,
        output=final_state,
        trajectory=traj,
    )
    return solution


async def main() -> None:
    """Main function for running the ACEBench evaluation."""
    # Parse the data and result directories from the command line
    parser = ArgumentParser()
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Where to save the dataset.",
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        required=True,
        help="Where to save the evaluation results.",
    )
    parser.add_argument(
        "--n_workers",
        type=int,
        default=1,
        help="The number of Ray workers to use for evaluation.",
    )
    args = parser.parse_args()

    # Create the evaluator; for local debugging, GeneralEvaluator is the more
    # suitable choice
    evaluator = RayEvaluator(
        name="ACEbench evaluation",
        benchmark=ACEBenchmark(
            data_dir=args.data_dir,
        ),
        # How many times to repeat each task
        n_repeat=1,
        storage=FileEvaluatorStorage(
            save_dir=args.result_dir,
        ),
        # How many parallel workers to use
        n_workers=args.n_workers,
    )

    # Run the evaluation
    await evaluator.run(react_agent_solution)
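

# Entry point: run the async main coroutine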
asyncio.run(main())
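
The comment in `main.py` mentions `GeneralEvaluator` as the alternative better suited to
local debugging. A minimal sketch of the swap, assuming `GeneralEvaluator` is importable
from `agentscope.evaluate` and mirrors `RayEvaluator`'s constructor arguments (an
assumption based on that comment, not confirmed by this commit):

```python
from agentscope.evaluate import GeneralEvaluator  # assumed import path

# Hypothetical drop-in replacement for RayEvaluator during local debugging
evaluator = GeneralEvaluator(
    name="ACEbench evaluation",
    benchmark=ACEBenchmark(data_dir=args.data_dir),
    n_repeat=1,
    storage=FileEvaluatorStorage(save_dir=args.result_dir),
)
await evaluator.run(react_agent_solution)
```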

evaluation/ace_bench/requirements.txt (Normal file, 1 line)
@@ -0,0 +1 @@
agentscope[full]>=1.0.5