init
This commit is contained in:
325
deep_research/agent_deep_research/utils.py
Normal file
325
deep_research/agent_deep_research/utils.py
Normal file
@@ -0,0 +1,325 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""The utilities for deep research agent"""
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from typing import Any, Sequence, Type, Union
|
||||
|
||||
from agentscope.tool import Toolkit, ToolResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
TOOL_RESULTS_MAX_WORDS = 5000
|
||||
|
||||
|
||||
def get_prompt_from_file(
    file_path: str,
    return_json: bool,
) -> Union[str, dict]:
    """Load a prompt from a file.

    Args:
        file_path: Path to the prompt file.
        return_json: If ``True``, parse the file content as JSON and return
            a dict; otherwise return the raw file text.

    Returns:
        The parsed JSON object when ``return_json`` is ``True``, otherwise
        the file content as a string.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If ``return_json`` is ``True`` and the file
            does not contain valid JSON.
    """
    # The original wrapped ``file_path`` in a single-argument
    # ``os.path.join`` call, which is a no-op; use the path directly.
    with open(file_path, "r", encoding="utf-8") as f:
        if return_json:
            prompt = json.load(f)
        else:
            prompt = f.read()
    return prompt
|
||||
|
||||
|
||||
def truncate_by_words(
    sentence: str,
    max_words: Union[int, None] = None,
) -> str:
    """Truncate an over-long sentence to a maximum number of words.

    The sentence is tokenized into words (``\\w+``) and single punctuation
    characters; tokens are kept until the word count exceeds the limit,
    then the kept tokens are re-joined — a space before each word (except
    the first token) and punctuation attached directly to what precedes it.

    Args:
        sentence: The text to truncate.
        max_words: Maximum number of words to keep. Defaults to the
            module-level ``TOOL_RESULTS_MAX_WORDS`` when ``None``
            (backward-compatible with the original zero-argument limit).

    Returns:
        The re-joined, possibly truncated, sentence.
    """
    limit = TOOL_RESULTS_MAX_WORDS if max_words is None else max_words
    tokens = re.findall(
        r"\w+|[^\w\s]",
        sentence,
        re.UNICODE,
    )

    kept = []
    word_count = 0
    for token in tokens:
        if re.match(r"\w+", token):
            word_count += 1
            # Stop once the limit is exceeded; trailing punctuation that
            # belongs to the last kept word has already been appended.
            if word_count > limit:
                break
        kept.append(token)

    # Build with join instead of repeated ``+=`` (avoids quadratic copies
    # on very long inputs) while reproducing the original spacing rules.
    pieces = []
    for i, token in enumerate(kept):
        if i > 0 and re.match(r"\w+", token):
            pieces.append(" ")
        pieces.append(token)
    return "".join(pieces)
|
||||
|
||||
|
||||
def truncate_search_result(
    res: list,
    search_func: str = "tavily-search",
    extract_function: str = "tavily-extract",
) -> list:
    """Shorten the ``text`` field of each search-result entry in place.

    Only the Tavily search/extract tool pair is supported; any other
    combination raises ``NotImplementedError``. The input list is mutated
    and also returned for convenience.

    Args:
        res: List of result dicts, each carrying a ``"text"`` key.
        search_func: Name of the search tool that produced the results.
        extract_function: Name of the extraction tool that produced them.

    Returns:
        The same list with every entry's ``"text"`` truncated.
    """
    supported = (
        search_func == "tavily-search"
        and extract_function == "tavily-extract"
    )
    if not supported:
        raise NotImplementedError(
            "Specific implementation of truncation should be provided.",
        )

    for entry in res:
        entry["text"] = truncate_by_words(entry["text"])

    return res
|
||||
|
||||
|
||||
def generate_structure_output(**kwargs: Any) -> ToolResponse:
    """Wrap arbitrary keyword arguments into a structured tool response.

    Registered as a tool function: the model "calls" it with arguments
    matching a structured-output specification, and those arguments are
    handed back verbatim as response metadata.

    Args:
        **kwargs: Arbitrary keyword arguments that should match the format
            of the expected structured output specification, with as much
            detail as the calling context requests.

    Returns:
        ToolResponse: A response with no content blocks whose metadata
        holds the provided keyword arguments.
    """
    structured_fields = kwargs
    return ToolResponse(
        content=[],
        metadata=structured_fields,
    )
|
||||
|
||||
|
||||
def get_dynamic_tool_call_json(data_model_type: Type[BaseModel]) -> list[dict]:
    """Build tool-calling JSON schemas from a Pydantic data model.

    A throwaway ``Toolkit`` is created, the generic structured-output tool
    is registered on it, and that tool is extended with ``data_model_type``
    so the resulting schemas describe the model's fields.

    Args:
        data_model_type: A Pydantic ``BaseModel`` subclass defining the
            expected structure of the tool output.

    Returns:
        A list of dictionaries containing the JSON schemas for the
        configured tool, suitable for API calls that support structured
        outputs.

    Example:
        class MyModel(BaseModel):
            name: str
            value: int

        schema = get_dynamic_tool_call_json(MyModel)
    """
    toolkit = Toolkit()
    toolkit.register_tool_function(generate_structure_output)
    toolkit.set_extended_model("generate_structure_output", data_model_type)
    return toolkit.get_json_schemas()
|
||||
|
||||
|
||||
def get_structure_output(blocks: list | Sequence) -> dict:
|
||||
"""Extract structured output from a sequence of blocks.
|
||||
|
||||
Processes a list or sequence of blocks to extract tool use outputs
|
||||
and combine them into a single dictionary. This is typically used
|
||||
to parse responses from language models that include tool calls.
|
||||
|
||||
Args:
|
||||
blocks: A list or sequence of blocks that may contain tool use
|
||||
information. Each block should be a dictionary with 'type'
|
||||
and 'input' keys for tool use blocks.
|
||||
|
||||
Returns:
|
||||
A dictionary containing the combined input data from all tool
|
||||
use blocks found in the input sequence.
|
||||
|
||||
Example:
|
||||
blocks = [
|
||||
{"type": "tool_use", "input": {"name": "test"}},
|
||||
{"type": "text", "content": "Some text"},
|
||||
{"type": "tool_use", "input": {"value": 42}}
|
||||
]
|
||||
result = PromptBase.get_structure_output(blocks)
|
||||
# result: {"name": "test", "value": 42}
|
||||
"""
|
||||
|
||||
dict_output = {}
|
||||
for block in blocks:
|
||||
if isinstance(block, dict) and block.get("type") == "tool_use":
|
||||
dict_output.update(block.get("input", {}))
|
||||
return dict_output
|
||||
|
||||
|
||||
def load_prompt_dict() -> dict:
    """Assemble the prompt templates used by the deep research agent.

    File-backed system prompts are read from the ``built_in_prompt``
    directory next to this module; the remaining templates are inline
    format strings whose ``{placeholders}`` are filled by callers.

    Returns:
        dict: Mapping from prompt name to prompt template string.
    """
    prompt_dict = {}
    cur_dir = os.path.dirname(os.path.abspath(__file__))

    # --- prompts loaded from files shipped with the package ---
    prompt_dict["add_note"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_worker_additional_sys_prompt.md",
        ),
        return_json=False,
    )

    prompt_dict["tool_use_rule"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_tool_usage_rules.md",
        ),
        return_json=False,
    )

    prompt_dict["decompose_sys_prompt"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_decompose_subtask.md",
        ),
        return_json=False,
    )

    prompt_dict["expansion_sys_prompt"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_deeper_expansion.md",
        ),
        return_json=False,
    )

    prompt_dict["summarize_sys_prompt"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_inprocess_report.md",
        ),
        return_json=False,
    )

    prompt_dict["reporting_sys_prompt"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_deepresearch_summary_report.md",
        ),
        return_json=False,
    )

    prompt_dict["reflect_sys_prompt"] = get_prompt_from_file(
        file_path=os.path.join(
            cur_dir,
            "built_in_prompt/prompt_reflect_failure.md",
        ),
        return_json=False,
    )

    # --- inline format-string templates ---
    prompt_dict["reasoning_prompt"] = (
        "## Current Subtask:\n{objective}\n"
        "## Working Plan:\n{meta_planner_agent}\n"
        "{knowledge_gap}\n"
        "## Research Depth:\n{depth}"
    )

    prompt_dict["previous_plan_inst"] = (
        "## Previous Plan:\n{previous_plan}\n"
        "## Current Subtask:\n{objective}\n"
    )

    prompt_dict["max_depth_hint"] = (
        "The search depth has reached the maximum limit. So the "
        "current subtask can not be further decomposed and "
        "expanded anymore. I need to find another way to get it "
        "done no matter what."
    )

    prompt_dict["expansion_inst"] = (
        "Review the web search results and identify whether "
        "there is any information that can potentially help address "
        "checklist items or fulfill knowledge gaps of the task, "
        "but whose content is limited or only briefly mentioned.\n"
        "**Task Description:**\n{objective}\n"
        "**Checklist:**\n{checklist}\n"
        "**Knowledge Gaps:**\n{knowledge_gaps}\n"
        "**Search Results:**\n{search_results}\n"
        "**Output:**\n"
    )

    prompt_dict["follow_up_judge_sys_prompt"] = (
        "To provide sufficient external information for the user's "
        "query, you have conducted a web search to obtain additional "
        "data. However, you found that some of the information, while "
        "important, was insufficient. Consequently, you extracted the "
        "entire content from one of the URLs to gather more "
        "comprehensive information. Now, you must rigorously and "
        "carefully assess whether, after both the web search and "
        "extraction process, the information content is adequate to "
        "address the given task. Be aware that any arbitrary decisions "
        "may result in unnecessary and unacceptable time costs.\n"
    )

    prompt_dict[
        "retry_hint"
    ] = "Something went wrong when {state}. I need to retry."

    prompt_dict["need_deeper_hint"] = (
        "The information is insufficient and I need to make deeper "
        "research to fill the knowledge gap."
    )

    prompt_dict[
        "sufficient_hint"
    ] = "The information after web search and extraction is sufficient enough!"

    prompt_dict["no_result_hint"] = (
        "I mistakenly called the `summarize_intermediate_results` tool as "
        "there exists no milestone result to summarize now."
    )

    # Fix: the original ended a fragment with "just output the" and began
    # the next with "updated ...", concatenating to "theupdated"; a space
    # is added so the rendered prompt reads correctly.
    prompt_dict["summarize_hint"] = (
        "Based on your work history above, examine which step in the "
        "following working meta_planner_agent has been completed. Mark the completed "
        "step with [DONE] at the end of its line (e.g., k. step k [DONE]) "
        "and leave the uncompleted steps unchanged. You MUST return only "
        "the updated meta_planner_agent, preserving exactly the same format as the "
        "original meta_planner_agent. Do not include any explanations, reasoning, "
        "or section headers such as '## Working Plan:', just output the "
        "updated meta_planner_agent itself."
        "\n\n## Working Plan:\n{meta_planner_agent}"
    )

    # NOTE(review): the labels and placeholders here look mismatched
    # compared with ``expansion_inst`` (Checklist<->knowledge_gaps,
    # Knowledge Gaps<->working_plan). Left unchanged because callers
    # supply these exact placeholder names — confirm before reworking.
    prompt_dict["summarize_inst"] = (
        "**Task Description:**\n{objective}\n"
        "**Checklist:**\n{knowledge_gaps}\n"
        "**Knowledge Gaps:**\n{working_plan}\n"
        "**Search Results:**\n{tool_result}"
    )

    prompt_dict["update_report_hint"] = (
        "Due to the overwhelming quantity of information, I have replaced the "
        "original bulk search results from the research phase with the "
        "following report that consolidates and summarizes the essential "
        "findings:\n {intermediate_report}\n\n"
        "Such report has been saved to the {report_path}. "
        "I will now **proceed to the next item** in the working meta_planner_agent."
    )

    prompt_dict["save_report_hint"] = (
        "The milestone results of the current item in working meta_planner_agent "
        "are summarized into the following report:\n{intermediate_report}"
    )

    prompt_dict["reflect_instruction"] = (
        "## Work History:\n{conversation_history}\n"
        "## Working Plan:\n{meta_planner_agent}\n"
    )

    # NOTE(review): the opening quote around {cur_obj} is a curly quote
    # while the closing one around {next_obj} is a straight apostrophe —
    # likely unintentional, kept as-is to preserve runtime output.
    prompt_dict["subtask_complete_hint"] = (
        "Subtask ‘{cur_obj}’ is completed. Now the current subtask "
        "fallbacks to '{next_obj}'"
    )

    return prompt_dict
|
||||
Reference in New Issue
Block a user