evotraders/evaluation/ace_bench/main.py

# -*- coding: utf-8 -*-
"""Example of running ACEBench evaluation with AgentScope."""
import asyncio
import os
from argparse import ArgumentParser
from typing import Callable

from agentscope.agent import ReActAgent
from agentscope.evaluate import (
    ACEBenchmark,
    ACEPhone,
    FileEvaluatorStorage,
    RayEvaluator,
    SolutionOutput,
    Task,
)
from agentscope.formatter import DashScopeChatFormatter
from agentscope.message import Msg
from agentscope.model import DashScopeChatModel
from agentscope.tool import Toolkit


async def react_agent_solution(
    ace_task: Task,
    pre_hook: Callable,
) -> SolutionOutput:
    """Run ReAct agent with the given task in ACEBench.

    Args:
        ace_task (`Task`):
            Task to run in ACEBench.
        pre_hook (Callable):
            The pre-hook function to save the agent's pre-print messages.
    """
    # Equip tool functions
    toolkit = Toolkit()
    for tool, json_schema in ace_task.metadata["tools"]:
        # register the tool function with the given json schema
        toolkit.register_tool_function(tool, json_schema=json_schema)

    # Create a ReAct agent
    agent = ReActAgent(
        name="Friday",
        sys_prompt="You are a helpful assistant named Friday. "
        "Your target is to solve the given task with your tools."
        "Try to solve the task as best as you can.",
        model=DashScopeChatModel(
            api_key=os.environ.get("DASHSCOPE_API_KEY"),
            model_name="qwen-max",
            stream=False,
        ),
        formatter=DashScopeChatFormatter(),
        toolkit=toolkit,
    )

    agent.register_instance_hook(
        "pre_print",
        "save_logging",
        pre_hook,
    )

    # Execute the agent to solve the task
    msg_input = Msg("user", ace_task.input, role="user")
    # Print the input by the running agent to call the pre-print hook
    await agent.print(msg_input)
    await agent(msg_input)

    # Obtain tool calls sequence
    memory_msgs = await agent.memory.get_memory()
    # Obtain tool_use blocks as trajectory
    traj = []
    for msg in memory_msgs:
        traj.extend(msg.get_content_blocks("tool_use"))

    # Obtain the final state of the phone and travel system
    phone: ACEPhone = ace_task.metadata["phone"]
    final_state = phone.get_current_state()

    # Wrap into a SolutionOutput
    solution = SolutionOutput(
        success=True,
        output=final_state,
        trajectory=traj,
    )
    return solution


async def main() -> None:
    """Main function for running ACEBench."""
    # Prepare data and results directories
    parser = ArgumentParser()
    parser.add_argument(
        "--data_dir",
        type=str,
        required=True,
        help="Where to save the dataset.",
    )
    parser.add_argument(
        "--result_dir",
        type=str,
        required=True,
        help="Where to save the evaluation results.",
    )
    parser.add_argument(
        "--n_workers",
        type=int,
        default=1,
        help="The number of ray workers to use for evaluation.",
    )
    args = parser.parse_args()

    # Create the evaluator
    #  or GeneralEvaluator, which more suitable for local debug
    evaluator = RayEvaluator(
        name="ACEbench evaluation",
        benchmark=ACEBenchmark(
            data_dir=args.data_dir,
        ),
        # Repeat how many times
        n_repeat=1,
        storage=FileEvaluatorStorage(
            save_dir=args.result_dir,
        ),
        # How many workers to use
        n_workers=args.n_workers,
    )

    # Run the evaluation
    await evaluator.run(react_agent_solution)


asyncio.run(main())