refactor(data_juicer_agent): update imports and add tests
This commit is contained in:
126
data_juicer_agent/.gitignore
vendored
126
data_juicer_agent/.gitignore
vendored
@@ -1,126 +0,0 @@
|
|||||||
# Byte-compiled / optimized / DLL files
|
|
||||||
__pycache__/
|
|
||||||
*.py[cod]
|
|
||||||
*$py.class
|
|
||||||
|
|
||||||
# C extensions
|
|
||||||
*.so
|
|
||||||
|
|
||||||
# Distribution / packaging
|
|
||||||
.Python
|
|
||||||
build/
|
|
||||||
develop-eggs/
|
|
||||||
dist/
|
|
||||||
downloads/
|
|
||||||
eggs/
|
|
||||||
.eggs/
|
|
||||||
lib/
|
|
||||||
lib64/
|
|
||||||
parts/
|
|
||||||
sdist/
|
|
||||||
var/
|
|
||||||
wheels/
|
|
||||||
pip-wheel-metadata/
|
|
||||||
share/python-wheels/
|
|
||||||
*.egg-info/
|
|
||||||
.installed.cfg
|
|
||||||
*.egg
|
|
||||||
MANIFEST
|
|
||||||
|
|
||||||
# PyInstaller
|
|
||||||
# Usually these files are written by a python script from a template
|
|
||||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
|
||||||
*.manifest
|
|
||||||
*.spec
|
|
||||||
|
|
||||||
# Translations
|
|
||||||
*.mo
|
|
||||||
*.pot
|
|
||||||
|
|
||||||
# Django stuff:
|
|
||||||
*.log
|
|
||||||
local_settings.py
|
|
||||||
db.sqlite3
|
|
||||||
db.sqlite3-journal
|
|
||||||
|
|
||||||
# Flask stuff:
|
|
||||||
instance/
|
|
||||||
.webassets-cache
|
|
||||||
|
|
||||||
# Scrapy stuff:
|
|
||||||
.scrapy
|
|
||||||
|
|
||||||
# Sphinx documentation
|
|
||||||
docs/_build/
|
|
||||||
|
|
||||||
# PyBuilder
|
|
||||||
target/
|
|
||||||
|
|
||||||
# Jupyter Notebook
|
|
||||||
.ipynb_checkpoints
|
|
||||||
|
|
||||||
# IPython
|
|
||||||
profile_default/
|
|
||||||
ipython_config.py
|
|
||||||
|
|
||||||
# pyenv
|
|
||||||
.python-version
|
|
||||||
|
|
||||||
# pipenv
|
|
||||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
|
||||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
|
||||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
|
||||||
# install all needed dependencies.
|
|
||||||
#Pipfile.lock
|
|
||||||
|
|
||||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
|
||||||
__pypackages__/
|
|
||||||
|
|
||||||
# Celery stuff
|
|
||||||
celerybeat-schedule
|
|
||||||
celerybeat.pid
|
|
||||||
|
|
||||||
# SageMath parsed files
|
|
||||||
*.sage.py
|
|
||||||
|
|
||||||
# Environments
|
|
||||||
.env
|
|
||||||
.venv
|
|
||||||
env/
|
|
||||||
venv/
|
|
||||||
ENV/
|
|
||||||
env.bak/
|
|
||||||
venv.bak/
|
|
||||||
|
|
||||||
# Spyder project settings
|
|
||||||
.spyderproject
|
|
||||||
.spyproject
|
|
||||||
|
|
||||||
# Rope project settings
|
|
||||||
.ropeproject
|
|
||||||
|
|
||||||
# mkdocs documentation
|
|
||||||
/site
|
|
||||||
|
|
||||||
# mypy
|
|
||||||
.mypy_cache/
|
|
||||||
.dmypy.json
|
|
||||||
dmypy.json
|
|
||||||
|
|
||||||
# Pyre type checker
|
|
||||||
.pyre/
|
|
||||||
|
|
||||||
.idea/
|
|
||||||
|
|
||||||
# macOS
|
|
||||||
.DS_Store
|
|
||||||
|
|
||||||
# Used to save loggings and files
|
|
||||||
*runs/
|
|
||||||
agentscope.db
|
|
||||||
tmp*.json
|
|
||||||
.vscode/
|
|
||||||
data_agent/
|
|
||||||
outputs/
|
|
||||||
tools/op_manager/cache_retrieve/
|
|
||||||
tools/op_manager/vector_index_cache/
|
|
||||||
@@ -4,6 +4,7 @@ A multi-agent data processing system built on [AgentScope](https://github.com/mo
|
|||||||
|
|
||||||
## 📋 Table of Contents
|
## 📋 Table of Contents
|
||||||
|
|
||||||
|
- [DataJuicer Agent](#datajuicer-agent)
|
||||||
- [📋 Table of Contents](#-table-of-contents)
|
- [📋 Table of Contents](#-table-of-contents)
|
||||||
- [What Does This Agent Do?](#what-does-this-agent-do)
|
- [What Does This Agent Do?](#what-does-this-agent-do)
|
||||||
- [Architecture](#architecture)
|
- [Architecture](#architecture)
|
||||||
@@ -68,7 +69,14 @@ Router Agent ──┐
|
|||||||
### Installation
|
### Installation
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv pip install -e .
|
# Recommended to use uv
|
||||||
|
uv pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
or
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### Configuration
|
### Configuration
|
||||||
|
|||||||
@@ -4,6 +4,7 @@
|
|||||||
|
|
||||||
## 📋 目录
|
## 📋 目录
|
||||||
|
|
||||||
|
- [DataJuicer 智能体](#datajuicer-智能体)
|
||||||
- [📋 目录](#-目录)
|
- [📋 目录](#-目录)
|
||||||
- [这个智能体做了什么?](#这个智能体做了什么)
|
- [这个智能体做了什么?](#这个智能体做了什么)
|
||||||
- [架构](#架构)
|
- [架构](#架构)
|
||||||
@@ -67,7 +68,14 @@ Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态
|
|||||||
### 安装
|
### 安装
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
uv pip install -e .
|
# 推荐使用uv
|
||||||
|
uv pip install -r requirements.txt
|
||||||
|
```
|
||||||
|
|
||||||
|
或
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -r requirements.txt
|
||||||
```
|
```
|
||||||
|
|
||||||
### 配置
|
### 配置
|
||||||
|
|||||||
@@ -7,11 +7,16 @@ from agentscope.model import DashScopeChatModel
|
|||||||
from agentscope.formatter import DashScopeChatFormatter
|
from agentscope.formatter import DashScopeChatFormatter
|
||||||
from agentscope.memory import InMemoryMemory
|
from agentscope.memory import InMemoryMemory
|
||||||
from agentscope.agent import UserAgent
|
from agentscope.agent import UserAgent
|
||||||
from agentscope.tool import Toolkit
|
|
||||||
|
|
||||||
from agent_factory import create_agent
|
from .agent_factory import create_agent
|
||||||
from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
|
from .prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
|
||||||
from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit
|
from .tools import (
|
||||||
|
dj_toolkit,
|
||||||
|
dj_dev_toolkit,
|
||||||
|
mcp_tools,
|
||||||
|
get_mcp_toolkit,
|
||||||
|
agents2toolkit,
|
||||||
|
)
|
||||||
|
|
||||||
# Create shared configuration
|
# Create shared configuration
|
||||||
model = DashScopeChatModel(
|
model = DashScopeChatModel(
|
||||||
@@ -145,10 +150,14 @@ async def main(
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Example tasks
|
# Example tasks
|
||||||
# project_root = os.path.abspath(os.path.dirname(__file__))
|
# project_root = os.path.abspath(os.path.dirname(__file__))
|
||||||
# task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl,筛选掉样本中,文本字段长度小于5的样本,以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。"
|
# task = (
|
||||||
|
# f"The data is stored in {project_root}/data/demo-dataset-images.jsonl. "
|
||||||
|
# "Among the samples, the text field length is less than 5 "
|
||||||
|
# "and the image size is less than 100Kb. "
|
||||||
|
# "And save the output results to the ./outputs path."
|
||||||
|
# )
|
||||||
#
|
#
|
||||||
# DJ Development example task:
|
# DJ Development example task:
|
||||||
# task = "我想开发一个新的DataJuicer过滤算子,用于过滤掉没有人声的音频文件"
|
# task = "I want to develop a new DataJuicer filter operator to filter out audio files without vocals"
|
||||||
#
|
#
|
||||||
# MCP Agent will be automatically selected for advanced processing tasks
|
|
||||||
fire.Fire(main)
|
fire.Fire(main)
|
||||||
|
|||||||
@@ -1,12 +0,0 @@
|
|||||||
[project]
|
|
||||||
name = "data-juicer-agent"
|
|
||||||
version = "0.1.0"
|
|
||||||
description = "A data processing agent with data juicer"
|
|
||||||
readme = "README.md"
|
|
||||||
requires-python = ">=3.11"
|
|
||||||
dependencies = [
|
|
||||||
"agentscope>=1.0.5",
|
|
||||||
"faiss-cpu>=1.12.0",
|
|
||||||
"langchain-community",
|
|
||||||
"py-data-juicer>=1.4.2",
|
|
||||||
]
|
|
||||||
5
data_juicer_agent/requirements.txt
Normal file
5
data_juicer_agent/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
agentscope>=1.0.5
|
||||||
|
py-data-juicer>=1.4.2
|
||||||
|
faiss-cpu>=1.12.0
|
||||||
|
fire>=0.7.1
|
||||||
|
langchain-community
|
||||||
220
tests/data_juicer_agent_test.py
Normal file
220
tests/data_juicer_agent_test.py
Normal file
@@ -0,0 +1,220 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
import os
|
||||||
|
import pytest
|
||||||
|
from unittest.mock import AsyncMock, Mock, patch
|
||||||
|
from agentscope.agent import ReActAgent
|
||||||
|
from agentscope.model import DashScopeChatModel
|
||||||
|
from agentscope.tool import Toolkit
|
||||||
|
from agentscope.message import Msg
|
||||||
|
from agentscope.formatter import DashScopeChatFormatter
|
||||||
|
from agentscope.memory import InMemoryMemory
|
||||||
|
from agentscope.tool import (
|
||||||
|
view_text_file,
|
||||||
|
write_text_file,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Import the main function and related components
|
||||||
|
from data_juicer_agent.main import main
|
||||||
|
from data_juicer_agent.agent_factory import create_agent
|
||||||
|
from data_juicer_agent.tools import (
|
||||||
|
dj_toolkit,
|
||||||
|
dj_dev_toolkit,
|
||||||
|
dj_tools,
|
||||||
|
dj_dev_tools,
|
||||||
|
mcp_tools,
|
||||||
|
execute_safe_command,
|
||||||
|
query_dj_operators,
|
||||||
|
get_basic_files,
|
||||||
|
get_operator_example,
|
||||||
|
configure_data_juicer_path,
|
||||||
|
)
|
||||||
|
|
||||||
|
class TestDataJuicerAgent:
|
||||||
|
"""Test suite for the data_juicer_agent functionality"""
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_toolkit(self):
|
||||||
|
"""Create a mocked Toolkit instance"""
|
||||||
|
return Mock(spec=Toolkit)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_model(self):
|
||||||
|
"""Create a mocked DashScopeChatModel"""
|
||||||
|
model = Mock(spec=DashScopeChatModel)
|
||||||
|
model.call = AsyncMock(
|
||||||
|
return_value=Msg("assistant", "test response", role="assistant"),
|
||||||
|
)
|
||||||
|
return model
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_formatter(self):
|
||||||
|
"""Create a mocked DashScopeChatFormatter"""
|
||||||
|
return Mock(spec=DashScopeChatFormatter)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_memory(self):
|
||||||
|
"""Create a mocked InMemoryMemory"""
|
||||||
|
return Mock(spec=InMemoryMemory)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_mcp_client(self):
|
||||||
|
"""Create a mocked MCP client"""
|
||||||
|
mock_client = Mock()
|
||||||
|
mock_client.name = "DJ_recipe_flow"
|
||||||
|
mock_client.connect = AsyncMock()
|
||||||
|
mock_client.close = AsyncMock()
|
||||||
|
mock_client.get_callable_function = AsyncMock()
|
||||||
|
mock_client.list_tools = AsyncMock()
|
||||||
|
return mock_client
|
||||||
|
|
||||||
|
def create_named_mock_agent(self, name, mock_agent, *args, **kwargs):
|
||||||
|
"""Create a named mock agent for testing"""
|
||||||
|
agent_instance = Mock(spec=ReActAgent)
|
||||||
|
agent_instance.model = mock_agent.model
|
||||||
|
agent_instance.formatter = mock_agent.formatter
|
||||||
|
agent_instance.toolkit = mock_agent.toolkit
|
||||||
|
agent_instance.memory = mock_agent.memory
|
||||||
|
agent_instance.__call__ = mock_agent.__call__
|
||||||
|
agent_instance.name = name
|
||||||
|
return agent_instance
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def mock_agent(
|
||||||
|
self,
|
||||||
|
mock_model,
|
||||||
|
mock_formatter,
|
||||||
|
mock_toolkit,
|
||||||
|
mock_memory,
|
||||||
|
):
|
||||||
|
"""Create a mocked ReActAgent instance"""
|
||||||
|
agent = Mock(spec=ReActAgent)
|
||||||
|
agent.model = mock_model
|
||||||
|
agent.formatter = mock_formatter
|
||||||
|
agent.toolkit = mock_toolkit
|
||||||
|
agent.memory = mock_memory
|
||||||
|
agent.__call__ = AsyncMock(
|
||||||
|
return_value=Msg("assistant", "test response", role="assistant"),
|
||||||
|
)
|
||||||
|
return agent
|
||||||
|
|
||||||
|
def test_dj_toolkit_initialization(self):
|
||||||
|
"""Test DJ toolkit initialization and tool registration"""
|
||||||
|
assert dj_toolkit.tools.get("execute_safe_command") is not None
|
||||||
|
assert dj_toolkit.tools.get("view_text_file") is not None
|
||||||
|
assert dj_toolkit.tools.get("write_text_file") is not None
|
||||||
|
assert dj_toolkit.tools.get("query_dj_operators") is not None
|
||||||
|
|
||||||
|
# Verify tool list contains expected tools
|
||||||
|
expected_tools = [
|
||||||
|
execute_safe_command,
|
||||||
|
view_text_file,
|
||||||
|
write_text_file,
|
||||||
|
query_dj_operators,
|
||||||
|
]
|
||||||
|
assert len(dj_tools) == len(expected_tools)
|
||||||
|
for tool in expected_tools:
|
||||||
|
assert tool in dj_tools
|
||||||
|
|
||||||
|
def test_dj_dev_toolkit_initialization(self):
|
||||||
|
"""Test DJ development toolkit initialization and tool registration"""
|
||||||
|
assert dj_dev_toolkit.tools.get("view_text_file") is not None
|
||||||
|
assert dj_dev_toolkit.tools.get("write_text_file") is not None
|
||||||
|
assert dj_dev_toolkit.tools.get("get_basic_files") is not None
|
||||||
|
assert dj_dev_toolkit.tools.get("get_operator_example") is not None
|
||||||
|
assert dj_dev_toolkit.tools.get("configure_data_juicer_path") is not None
|
||||||
|
|
||||||
|
# Verify tool list contains expected tools
|
||||||
|
expected_tools = [
|
||||||
|
view_text_file,
|
||||||
|
write_text_file,
|
||||||
|
get_basic_files,
|
||||||
|
get_operator_example,
|
||||||
|
configure_data_juicer_path,
|
||||||
|
]
|
||||||
|
assert len(dj_dev_tools) == len(expected_tools)
|
||||||
|
for tool in expected_tools:
|
||||||
|
assert tool in dj_dev_tools
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_mcp_tools_list(self, mock_mcp_client):
|
||||||
|
"""Test MCP tools list contains expected tools and MCP client binding"""
|
||||||
|
expected_tools = [view_text_file, write_text_file]
|
||||||
|
assert len(mcp_tools) == len(expected_tools)
|
||||||
|
for tool in expected_tools:
|
||||||
|
assert tool in mcp_tools
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_agent_initialization(
|
||||||
|
self,
|
||||||
|
mock_model,
|
||||||
|
mock_formatter,
|
||||||
|
mock_toolkit,
|
||||||
|
mock_memory,
|
||||||
|
):
|
||||||
|
"""Test ReActAgent initialization"""
|
||||||
|
with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}):
|
||||||
|
agent = create_agent(
|
||||||
|
name="DataJuicer",
|
||||||
|
sys_prompt="You are {name}, a agent.",
|
||||||
|
toolkit=mock_toolkit,
|
||||||
|
model=mock_model,
|
||||||
|
formatter=mock_formatter,
|
||||||
|
memory=mock_memory,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert agent.name == "DataJuicer"
|
||||||
|
assert "DataJuicer" in agent.sys_prompt
|
||||||
|
assert agent.model == mock_model
|
||||||
|
assert agent.formatter == mock_formatter
|
||||||
|
assert agent.toolkit == mock_toolkit
|
||||||
|
assert agent.memory == mock_memory
|
||||||
|
|
||||||
|
async def mock_user_func(self, msg=None):
|
||||||
|
return Msg("user", "exit", role="user")
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
async def test_main_with_multiple_agents_loading(self, mock_agent, mock_mcp_client):
|
||||||
|
"""Test main function loads multiple agents successfully"""
|
||||||
|
with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}):
|
||||||
|
mock_mcp_clients = [mock_mcp_client]
|
||||||
|
|
||||||
|
with patch(
|
||||||
|
"data_juicer_agent.tools.mcp_tools._create_clients",
|
||||||
|
return_value=mock_mcp_clients,
|
||||||
|
):
|
||||||
|
with patch(
|
||||||
|
"data_juicer_agent.main.create_agent",
|
||||||
|
side_effect=lambda name, *args, **kwargs: self.create_named_mock_agent(
|
||||||
|
name, mock_agent, *args, **kwargs
|
||||||
|
),
|
||||||
|
) as mock_create_agent:
|
||||||
|
with patch(
|
||||||
|
"data_juicer_agent.main.user", side_effect=self.mock_user_func
|
||||||
|
):
|
||||||
|
|
||||||
|
await main(
|
||||||
|
use_studio=False,
|
||||||
|
available_agents=["dj", "dj_dev", "dj_mcp"],
|
||||||
|
retrieval_mode="auto",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Validate multiple agents are correctly created (dj, dj_dev, dj_mcp, and router)
|
||||||
|
assert mock_create_agent.call_count == 4
|
||||||
|
|
||||||
|
# Validate router agent is created
|
||||||
|
create_calls = mock_create_agent.call_args_list
|
||||||
|
router_agent_created = any(
|
||||||
|
call[0][0] == "Router"
|
||||||
|
for call in create_calls # First parameter is name
|
||||||
|
)
|
||||||
|
assert router_agent_created, "Router agent should be created"
|
||||||
|
|
||||||
|
# Validate dj_mcp agent is created
|
||||||
|
mcp_agent_created = any(
|
||||||
|
call[0][0] == "mcp_datajuicer_agent"
|
||||||
|
for call in create_calls # First parameter is name
|
||||||
|
)
|
||||||
|
assert mcp_agent_created, "MCP agent should be created"
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
pytest.main(["-v", __file__])
|
||||||
Reference in New Issue
Block a user