Optimize DataJuicer Agent doc & linter (#30)

This commit is contained in:
Daoyuan Chen
2025-11-10 18:17:27 +08:00
committed by GitHub
parent 1f0c5de27f
commit dba3b86ddf
14 changed files with 891 additions and 359 deletions

View File

@@ -2,8 +2,8 @@
"""
DataJuicer Development Tools
Tools for developing DataJuicer operators, including access to basic documentation
and example code for different operator types.
Tools for developing DataJuicer operators, including access to basic
documentation and example code for different operator types.
"""
import os
@@ -23,7 +23,8 @@ BASIC_LIST_RELATIVE = [
def get_basic_files() -> ToolResponse:
"""Get basic DataJuicer development files content.
Returns the content of essential files needed for DJ operator development:
Returns the content of essential files needed for DJ operator
development:
- base_op.py: Base operator class
- DeveloperGuide.md: English developer guide
- DeveloperGuide_ZH.md: Chinese developer guide
@@ -31,19 +32,23 @@ def get_basic_files() -> ToolResponse:
Returns:
ToolResponse: Combined content of all basic development files
"""
global DATA_JUICER_PATH, BASIC_LIST_RELATIVE
if DATA_JUICER_PATH is None:
return ToolResponse(
content=[
TextBlock(
type="text",
text="DATA_JUICER_PATH is not configured. Please ask the user to provide the DATA_JUICER_PATH",
)
]
text=(
"DATA_JUICER_PATH is not configured. Please ask the "
"user to provide the DATA_JUICER_PATH"
),
),
],
)
try:
combined_content = "# DataJuicer Operator Development Basic Files\n\n"
comb_content = "# DataJuicer Operator Development Basic Files\n\n"
for relative_path in BASIC_LIST_RELATIVE:
file_path = os.path.join(DATA_JUICER_PATH, relative_path)
@@ -52,20 +57,21 @@ def get_basic_files() -> ToolResponse:
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
filename = os.path.basename(file_path)
combined_content += f"## {filename}\n\n"
combined_content += (
f"```{'python' if filename.endswith('.py') else 'markdown'}\n"
)
combined_content += content
combined_content += "\n```\n\n"
file_n = os.path.basename(file_path)
comb_content += f"## {file_n}\n\n```"
flag = "python" if file_n.endswith(".py") else "markdown"
comb_content += f"{flag}\n"
comb_content += content
comb_content += "\n```\n\n"
except Exception as e:
combined_content += (
comb_content += (
f"## {os.path.basename(file_path)} (Read Failed)\n"
)
combined_content += f"Error: {str(e)}\n\n"
comb_content += f"Error: {str(e)}\n\n"
return ToolResponse(content=[TextBlock(type="text", text=combined_content)])
return ToolResponse(
content=[TextBlock(type="text", text=comb_content)],
)
except Exception as e:
return ToolResponse(
@@ -73,32 +79,41 @@ def get_basic_files() -> ToolResponse:
TextBlock(
type="text",
text=f"Error occurred while getting basic files: {str(e)}",
)
]
),
],
)
async def get_operator_example(
requirement_description: str, limit: int = 2
requirement_description: str,
limit: int = 2,
) -> ToolResponse:
"""Get example operators based on requirement description using dynamic search.
"""Get example operators based on requirement description using
dynamic search.
Args:
requirement_description (str): Natural language description of the operator requirement
limit (int): Maximum number of example operators to return (default: 2)
requirement_description (str): Natural language description of
the operator requirement
limit (int): Maximum number of example operators to return
(default: 2)
Returns:
ToolResponse: Example operator code and test files based on the requirement
ToolResponse: Example operator code and test files based on
the requirement
"""
global DATA_JUICER_PATH
if DATA_JUICER_PATH is None:
return ToolResponse(
content=[
TextBlock(
type="text",
text="DATA_JUICER_PATH is not configured. Please ask the user to provide the DATA_JUICER_PATH",
)
]
text=(
"DATA_JUICER_PATH is not configured. Please ask the "
"user to provide the DATA_JUICER_PATH"
),
),
],
)
try:
@@ -108,49 +123,56 @@ async def get_operator_example(
# Query relevant operators using the requirement description
# Use retrieval mode from environment variable if set
retrieval_mode = os.environ.get("RETRIEVAL_MODE", "auto")
tool_names = await retrieve_ops(requirement_description, limit=limit, mode=retrieval_mode)
tool_names = await retrieve_ops(
requirement_description,
limit=limit,
mode=retrieval_mode,
)
if not tool_names:
return ToolResponse(
content=[
TextBlock(
type="text",
text=f"No relevant operators found for requirement: {requirement_description}\n"
f"Please try with more specific keywords or check if DATA_JUICER_PATH is properly configured.",
)
]
text=(
"No relevant operators found for requirement: "
f"{requirement_description}\n"
"Please try with more specific keywords or "
"check if DATA_JUICER_PATH is properly "
"configured."
),
),
],
)
combined_content = (
comb_content = (
f"# Dynamic Operator Examples for: {requirement_description}\n\n"
)
combined_content += (
comb_content += (
f"Found {len(tool_names)} relevant operators (limit: {limit})\n\n"
)
# Process each found operator
for i, tool_name in enumerate(tool_names[:limit]):
combined_content += f"## {i+1}. {tool_name}\n\n"
comb_content += f"## {i+1}. {tool_name}\n\n"
op_type = tool_name.split("_")[-1]
operator_path = f"data_juicer/ops/{op_type}/{tool_name}.py"
# Try to find operator source file
full_path = os.path.join(DATA_JUICER_PATH, operator_path)
if os.path.exists(full_path):
with open(full_path, "r", encoding="utf-8") as f:
operator_code = f.read()
combined_content += f"### Source Code\n"
combined_content += "```python\n"
combined_content += operator_code
combined_content += "\n```\n\n"
comb_content += "### Source Code\n"
comb_content += "```python\n"
comb_content += operator_code
comb_content += "\n```\n\n"
else:
combined_content += (
f"**Note:** Source code file not found for `{tool_name}`.\n\n"
)
comb_content += "**Note:** Source code file not found for"
comb_content += f" `{tool_name}`.\n\n"
test_path = f"tests/ops/{op_type}/test_{tool_name}.py"
@@ -159,36 +181,43 @@ async def get_operator_example(
with open(full_test_path, "r", encoding="utf-8") as f:
test_code = f.read()
combined_content += f"### Test Code\n"
combined_content += f"**File Path:** `{test_path}`\n\n"
combined_content += "```python\n"
combined_content += test_code
combined_content += "\n```\n\n"
comb_content += "### Test Code\n"
comb_content += f"**File Path:** `{test_path}`\n\n"
comb_content += "```python\n"
comb_content += test_code
comb_content += "\n```\n\n"
else:
combined_content += (
comb_content += (
f"**Note:** Test file not found for `{tool_name}`.\n\n"
)
combined_content += "---\n\n"
comb_content += "---\n\n"
return ToolResponse(content=[TextBlock(type="text", text=combined_content)])
return ToolResponse(
content=[TextBlock(type="text", text=comb_content)],
)
except Exception as e:
return ToolResponse(
content=[
TextBlock(
type="text",
text=f"Error occurred while getting operator examples: {str(e)}\n"
f"Please check the requirement description and try again.",
)
]
text=(
"Error occurred while getting operator examples: "
f"{str(e)}\n"
"Please check the requirement description and try "
"again."
),
),
],
)
def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse:
"""Configure DataJuicer path.
If the user provides the data_juicer_path, please use this method to configure it.
If the user provides the data_juicer_path, please use this method to
configure it.
Args:
data_juicer_path (str): Path to DataJuicer installation
@@ -196,8 +225,9 @@ def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse:
Returns:
ToolResponse: Configuration result
"""
global DATA_JUICER_PATH
data_juicer_path = os.path.expanduser(data_juicer_path)
try:
@@ -206,9 +236,12 @@ def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse:
content=[
TextBlock(
type="text",
text=f"Specified DataJuicer path does not exist: {data_juicer_path}",
)
]
text=(
"Specified DataJuicer path does not exist: "
f"{data_juicer_path}"
),
),
],
)
# Update global DATA_JUICER_PATH
@@ -218,9 +251,12 @@ def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse:
content=[
TextBlock(
type="text",
text=f"DataJuicer path has been updated to: {DATA_JUICER_PATH}",
)
]
text=(
"DataJuicer path has been updated to: "
f"{DATA_JUICER_PATH}"
),
),
],
)
except Exception as e:
@@ -228,7 +264,10 @@ def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse:
content=[
TextBlock(
type="text",
text=f"Error occurred while configuring DataJuicer path: {str(e)}",
)
]
text=(
"Error occurred while configuring DataJuicer path: "
f"{str(e)}"
),
),
],
)