refactor(data_juicer_agent): update imports and add tests

This commit is contained in:
cmgzn
2025-10-30 15:36:25 +08:00
parent 55725959ae
commit 4377fe36cb
7 changed files with 308 additions and 196 deletions

View File

@@ -1,126 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
# macOS
.DS_Store
# Used to save loggings and files
*runs/
agentscope.db
tmp*.json
.vscode/
data_agent/
outputs/
tools/op_manager/cache_retrieve/
tools/op_manager/vector_index_cache/

View File

@@ -4,31 +4,32 @@ A multi-agent data processing system built on [AgentScope](https://github.com/mo
## 📋 Table of Contents
- [📋 Table of Contents](#-table-of-contents)
- [What Does This Agent Do?](#what-does-this-agent-do)
- [Architecture](#architecture)
- [Quick Start](#quick-start)
- [System Requirements](#system-requirements)
- [Installation](#installation)
- [Configuration](#configuration)
- [Usage](#usage)
- [Agent Introduction](#agent-introduction)
- [Data Processing Agent](#data-processing-agent)
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
- [Advanced Features](#advanced-features)
- [Operator Retrieval](#operator-retrieval)
- [Retrieval Modes](#retrieval-modes)
- [Usage](#usage-1)
- [MCP Agent](#mcp-agent)
- [MCP Server Types](#mcp-server-types)
- [Configuration](#configuration-1)
- [Usage Methods](#usage-methods)
- [Feature Preview](#feature-preview)
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
- [Troubleshooting](#troubleshooting)
- [Common Issues](#common-issues)
- [Optimization Recommendations](#optimization-recommendations)
- [DataJuicer Agent](#datajuicer-agent)
- [📋 Table of Contents](#-table-of-contents)
- [What Does This Agent Do?](#what-does-this-agent-do)
- [Architecture](#architecture)
- [Quick Start](#quick-start)
- [System Requirements](#system-requirements)
- [Installation](#installation)
- [Configuration](#configuration)
- [Usage](#usage)
- [Agent Introduction](#agent-introduction)
- [Data Processing Agent](#data-processing-agent)
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
- [Advanced Features](#advanced-features)
- [Operator Retrieval](#operator-retrieval)
- [Retrieval Modes](#retrieval-modes)
- [Usage](#usage-1)
- [MCP Agent](#mcp-agent)
- [MCP Server Types](#mcp-server-types)
- [Configuration](#configuration-1)
- [Usage Methods](#usage-methods)
- [Feature Preview](#feature-preview)
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
- [Troubleshooting](#troubleshooting)
- [Common Issues](#common-issues)
- [Optimization Recommendations](#optimization-recommendations)
## What Does This Agent Do?
@@ -68,7 +69,14 @@ Router Agent ──┐
### Installation
```bash
uv pip install -e .
# Recommended to use uv
uv pip install -r requirements.txt
```
or
```bash
pip install -r requirements.txt
```
### Configuration

View File

@@ -4,30 +4,31 @@
## 📋 目录
- [📋 目录](#-目录)
- [这个智能体做了什么?](#这个智能体做了什么)
- [架构](#架构)
- [快速开始](#快速开始)
- [系统要求](#系统要求)
- [安装](#安装)
- [配置](#配置)
- [使用](#使用)
- [智能体介绍](#智能体介绍)
- [数据处理智能体](#数据处理智能体)
- [代码开发智能体](#代码开发智能体)
- [高级功能](#高级功能)
- [算子检索](#算子检索)
- [检索模式](#检索模式)
- [使用](#使用-1)
- [MCP 智能体](#mcp-智能体)
- [MCP 服务器类型](#mcp-服务器类型)
- [配置](#配置-1)
- [使用方法](#使用方法)
- [功能预览](#功能预览)
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
- [常见问题](#常见问题)
- [优化建议](#优化建议)
- [DataJuicer 智能体](#datajuicer-智能体)
- [📋 目录](#-目录)
- [这个智能体做了什么?](#这个智能体做了什么)
- [架构](#架构)
- [快速开始](#快速开始)
- [系统要求](#系统要求)
- [安装](#安装)
- [配置](#配置)
- [使用](#使用)
- [智能体介绍](#智能体介绍)
- [数据处理智能体](#数据处理智能体)
- [代码开发智能体](#代码开发智能体)
- [高级功能](#高级功能)
- [算子检索](#算子检索)
- [检索模式](#检索模式)
- [使用](#使用-1)
- [MCP 智能体](#mcp-智能体)
- [MCP 服务器类型](#mcp-服务器类型)
- [配置](#配置-1)
- [使用方法](#使用方法)
- [功能预览](#功能预览)
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
- [常见问题](#常见问题)
- [优化建议](#优化建议)
## 这个智能体做了什么?
@@ -67,7 +68,14 @@ Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态
### 安装
```bash
uv pip install -e .
# 推荐使用uv
uv pip install -r requirements.txt
```
或
```bash
pip install -r requirements.txt
```
### 配置

View File

@@ -7,11 +7,16 @@ from agentscope.model import DashScopeChatModel
from agentscope.formatter import DashScopeChatFormatter
from agentscope.memory import InMemoryMemory
from agentscope.agent import UserAgent
from agentscope.tool import Toolkit
from agent_factory import create_agent
from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit
from .agent_factory import create_agent
from .prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
from .tools import (
dj_toolkit,
dj_dev_toolkit,
mcp_tools,
get_mcp_toolkit,
agents2toolkit,
)
# Create shared configuration
model = DashScopeChatModel(
@@ -145,10 +150,14 @@ async def main(
if __name__ == "__main__":
# Example tasks
# project_root = os.path.abspath(os.path.dirname(__file__))
# task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl筛选掉样本中文本字段长度小于5的样本以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。"
# task = (
# f"The data is stored in {project_root}/data/demo-dataset-images.jsonl. "
# "Filter out samples whose text field length is less than 5 "
# "and samples whose image size is less than 100Kb, "
# "then save the output results to the ./outputs path."
# )
#
# DJ Development example task:
# task = "我想开发一个新的DataJuicer过滤算子用于过滤掉没有人声的音频文件"
# task = "I want to develop a new DataJuicer filter operator to filter out audio files without vocals"
#
# MCP Agent will be automatically selected for advanced processing tasks
fire.Fire(main)

View File

@@ -1,12 +0,0 @@
[project]
name = "data-juicer-agent"
version = "0.1.0"
description = "A data processing agent with data juicer"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"agentscope>=1.0.5",
"faiss-cpu>=1.12.0",
"langchain-community",
"py-data-juicer>=1.4.2",
]

View File

@@ -0,0 +1,5 @@
agentscope>=1.0.5
py-data-juicer>=1.4.2
faiss-cpu>=1.12.0
fire>=0.7.1
langchain-community

View File

@@ -0,0 +1,220 @@
# -*- coding: utf-8 -*-
import os
import pytest
from unittest.mock import AsyncMock, Mock, patch
from agentscope.agent import ReActAgent
from agentscope.model import DashScopeChatModel
from agentscope.tool import Toolkit
from agentscope.message import Msg
from agentscope.formatter import DashScopeChatFormatter
from agentscope.memory import InMemoryMemory
from agentscope.tool import (
view_text_file,
write_text_file,
)
# Import the main function and related components
from data_juicer_agent.main import main
from data_juicer_agent.agent_factory import create_agent
from data_juicer_agent.tools import (
dj_toolkit,
dj_dev_toolkit,
dj_tools,
dj_dev_tools,
mcp_tools,
execute_safe_command,
query_dj_operators,
get_basic_files,
get_operator_example,
configure_data_juicer_path,
)
class TestDataJuicerAgent:
    """Test suite for the data_juicer_agent functionality.

    Covers the module-level toolkits and tool lists, agent construction via
    ``create_agent``, and the agent-loading phase of ``main``. Models,
    formatters, toolkits, memories, and MCP clients are replaced by mocks so
    the tests do not need network access.
    """

    @pytest.fixture
    def mock_toolkit(self):
        """Create a mocked Toolkit instance"""
        return Mock(spec=Toolkit)

    @pytest.fixture
    def mock_model(self):
        """Create a mocked DashScopeChatModel whose ``call`` is awaitable"""
        model = Mock(spec=DashScopeChatModel)
        model.call = AsyncMock(
            return_value=Msg("assistant", "test response", role="assistant"),
        )
        return model

    @pytest.fixture
    def mock_formatter(self):
        """Create a mocked DashScopeChatFormatter"""
        return Mock(spec=DashScopeChatFormatter)

    @pytest.fixture
    def mock_memory(self):
        """Create a mocked InMemoryMemory"""
        return Mock(spec=InMemoryMemory)

    @pytest.fixture
    def mock_mcp_client(self):
        """Create a mocked MCP client with awaitable lifecycle methods"""
        mock_client = Mock()
        mock_client.name = "DJ_recipe_flow"
        mock_client.connect = AsyncMock()
        mock_client.close = AsyncMock()
        mock_client.get_callable_function = AsyncMock()
        mock_client.list_tools = AsyncMock()
        return mock_client

    def create_named_mock_agent(self, name, mock_agent, *args, **kwargs):
        """Create a named mock agent for testing.

        Args:
            name: Value assigned to the returned mock's ``name`` attribute.
            mock_agent: Template mock whose model, formatter, toolkit, memory
                and ``__call__`` are copied onto the new mock.
            *args: Ignored; accepted so this can stand in for ``create_agent``.
            **kwargs: Ignored; accepted so this can stand in for
                ``create_agent``.

        Returns:
            A ``Mock(spec=ReActAgent)`` carrying the given name.
        """
        agent_instance = Mock(spec=ReActAgent)
        agent_instance.model = mock_agent.model
        agent_instance.formatter = mock_agent.formatter
        agent_instance.toolkit = mock_agent.toolkit
        agent_instance.memory = mock_agent.memory
        agent_instance.__call__ = mock_agent.__call__
        # Python resolves ``agent(...)`` through the type, not the instance,
        # so the attribute assignment above does not affect calls. Routing
        # the call through ``side_effect`` makes ``await agent(...)`` work.
        agent_instance.side_effect = mock_agent.__call__
        agent_instance.name = name
        return agent_instance

    @pytest.fixture
    def mock_agent(
        self,
        mock_model,
        mock_formatter,
        mock_toolkit,
        mock_memory,
    ):
        """Create a mocked ReActAgent instance wired to the other mocks"""
        agent = Mock(spec=ReActAgent)
        agent.model = mock_model
        agent.formatter = mock_formatter
        agent.toolkit = mock_toolkit
        agent.memory = mock_memory
        reply = AsyncMock(
            return_value=Msg("assistant", "test response", role="assistant"),
        )
        agent.__call__ = reply
        # See create_named_mock_agent: only ``side_effect`` influences what
        # calling the mock itself returns.
        agent.side_effect = reply
        return agent

    def test_dj_toolkit_initialization(self):
        """Test DJ toolkit initialization and tool registration"""
        for tool_name in (
            "execute_safe_command",
            "view_text_file",
            "write_text_file",
            "query_dj_operators",
        ):
            assert dj_toolkit.tools.get(tool_name) is not None

        # Verify tool list contains exactly the expected tools
        expected_tools = [
            execute_safe_command,
            view_text_file,
            write_text_file,
            query_dj_operators,
        ]
        assert len(dj_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in dj_tools

    def test_dj_dev_toolkit_initialization(self):
        """Test DJ development toolkit initialization and tool registration"""
        for tool_name in (
            "view_text_file",
            "write_text_file",
            "get_basic_files",
            "get_operator_example",
            "configure_data_juicer_path",
        ):
            assert dj_dev_toolkit.tools.get(tool_name) is not None

        # Verify tool list contains exactly the expected tools
        expected_tools = [
            view_text_file,
            write_text_file,
            get_basic_files,
            get_operator_example,
            configure_data_juicer_path,
        ]
        assert len(dj_dev_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in dj_dev_tools

    def test_mcp_tools_list(self):
        """Test MCP tools list contains exactly the expected file tools"""
        expected_tools = [view_text_file, write_text_file]
        assert len(mcp_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in mcp_tools

    @pytest.mark.asyncio
    async def test_agent_initialization(
        self,
        mock_model,
        mock_formatter,
        mock_toolkit,
        mock_memory,
    ):
        """Test ReActAgent initialization through ``create_agent``"""
        with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}):
            agent = create_agent(
                name="DataJuicer",
                sys_prompt="You are {name}, a agent.",
                toolkit=mock_toolkit,
                model=mock_model,
                formatter=mock_formatter,
                memory=mock_memory,
            )

            assert agent.name == "DataJuicer"
            assert "DataJuicer" in agent.sys_prompt
            assert agent.model == mock_model
            assert agent.formatter == mock_formatter
            assert agent.toolkit == mock_toolkit
            assert agent.memory == mock_memory

    async def mock_user_func(self, msg=None):
        """Stand-in for the user agent: always replies with ``exit``"""
        return Msg("user", "exit", role="user")

    @pytest.mark.asyncio
    async def test_main_with_multiple_agents_loading(self, mock_agent, mock_mcp_client):
        """Test main function loads multiple agents successfully"""

        def fake_create_agent(name, *args, **kwargs):
            # Replacement for ``create_agent`` that returns a named mock.
            return self.create_named_mock_agent(name, mock_agent, *args, **kwargs)

        with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}), patch(
            "data_juicer_agent.tools.mcp_tools._create_clients",
            return_value=[mock_mcp_client],
        ), patch(
            "data_juicer_agent.main.create_agent",
            side_effect=fake_create_agent,
        ) as mock_create_agent, patch(
            "data_juicer_agent.main.user",
            side_effect=self.mock_user_func,
        ):
            await main(
                use_studio=False,
                available_agents=["dj", "dj_dev", "dj_mcp"],
                retrieval_mode="auto",
            )

        # Validate multiple agents are correctly created
        # (dj, dj_dev, dj_mcp, and router)
        assert mock_create_agent.call_count == 4

        # The agent name may be passed positionally or as ``name=``; accept
        # both so the check does not depend on the calling convention.
        created_names = [
            recorded.args[0] if recorded.args else recorded.kwargs.get("name")
            for recorded in mock_create_agent.call_args_list
        ]

        # Validate router agent is created
        assert "Router" in created_names, "Router agent should be created"

        # Validate dj_mcp agent is created
        assert "mcp_datajuicer_agent" in created_names, "MCP agent should be created"
if __name__ == "__main__":
    # pytest.main returns the session's exit code; propagate it so running
    # this file directly reports test failures to the calling shell.
    raise SystemExit(pytest.main(["-v", __file__]))