refactor(data_juicer_agent): update imports and add tests
This commit is contained in:
126
data_juicer_agent/.gitignore
vendored
126
data_juicer_agent/.gitignore
vendored
@@ -1,126 +0,0 @@
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
pip-wheel-metadata/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
.python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
.idea/
|
||||
|
||||
# macOS
|
||||
.DS_Store
|
||||
|
||||
# Used to save loggings and files
|
||||
*runs/
|
||||
agentscope.db
|
||||
tmp*.json
|
||||
.vscode/
|
||||
data_agent/
|
||||
outputs/
|
||||
tools/op_manager/cache_retrieve/
|
||||
tools/op_manager/vector_index_cache/
|
||||
@@ -4,31 +4,32 @@ A multi-agent data processing system built on [AgentScope](https://github.com/mo
|
||||
|
||||
## 📋 Table of Contents
|
||||
|
||||
- [📋 Table of Contents](#-table-of-contents)
|
||||
- [What Does This Agent Do?](#what-does-this-agent-do)
|
||||
- [Architecture](#architecture)
|
||||
- [Quick Start](#quick-start)
|
||||
- [System Requirements](#system-requirements)
|
||||
- [Installation](#installation)
|
||||
- [Configuration](#configuration)
|
||||
- [Usage](#usage)
|
||||
- [Agent Introduction](#agent-introduction)
|
||||
- [Data Processing Agent](#data-processing-agent)
|
||||
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
|
||||
- [Advanced Features](#advanced-features)
|
||||
- [Operator Retrieval](#operator-retrieval)
|
||||
- [Retrieval Modes](#retrieval-modes)
|
||||
- [Usage](#usage-1)
|
||||
- [MCP Agent](#mcp-agent)
|
||||
- [MCP Server Types](#mcp-server-types)
|
||||
- [Configuration](#configuration-1)
|
||||
- [Usage Methods](#usage-methods)
|
||||
- [Feature Preview](#feature-preview)
|
||||
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
|
||||
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [Optimization Recommendations](#optimization-recommendations)
|
||||
- [DataJuicer Agent](#datajuicer-agent)
|
||||
- [📋 Table of Contents](#-table-of-contents)
|
||||
- [What Does This Agent Do?](#what-does-this-agent-do)
|
||||
- [Architecture](#architecture)
|
||||
- [Quick Start](#quick-start)
|
||||
- [System Requirements](#system-requirements)
|
||||
- [Installation](#installation)
|
||||
- [Configuration](#configuration)
|
||||
- [Usage](#usage)
|
||||
- [Agent Introduction](#agent-introduction)
|
||||
- [Data Processing Agent](#data-processing-agent)
|
||||
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
|
||||
- [Advanced Features](#advanced-features)
|
||||
- [Operator Retrieval](#operator-retrieval)
|
||||
- [Retrieval Modes](#retrieval-modes)
|
||||
- [Usage](#usage-1)
|
||||
- [MCP Agent](#mcp-agent)
|
||||
- [MCP Server Types](#mcp-server-types)
|
||||
- [Configuration](#configuration-1)
|
||||
- [Usage Methods](#usage-methods)
|
||||
- [Feature Preview](#feature-preview)
|
||||
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
|
||||
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
|
||||
- [Troubleshooting](#troubleshooting)
|
||||
- [Common Issues](#common-issues)
|
||||
- [Optimization Recommendations](#optimization-recommendations)
|
||||
|
||||
## What Does This Agent Do?
|
||||
|
||||
@@ -68,7 +69,14 @@ Router Agent ──┐
|
||||
### Installation
|
||||
|
||||
```bash
|
||||
uv pip install -e .
|
||||
# Using uv is recommended
|
||||
uv pip install -r requirements.txt
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### Configuration
|
||||
|
||||
@@ -4,30 +4,31 @@
|
||||
|
||||
## 📋 目录
|
||||
|
||||
- [📋 目录](#-目录)
|
||||
- [这个智能体做了什么?](#这个智能体做了什么)
|
||||
- [架构](#架构)
|
||||
- [快速开始](#快速开始)
|
||||
- [系统要求](#系统要求)
|
||||
- [安装](#安装)
|
||||
- [配置](#配置)
|
||||
- [使用](#使用)
|
||||
- [智能体介绍](#智能体介绍)
|
||||
- [数据处理智能体](#数据处理智能体)
|
||||
- [代码开发智能体](#代码开发智能体)
|
||||
- [高级功能](#高级功能)
|
||||
- [算子检索](#算子检索)
|
||||
- [检索模式](#检索模式)
|
||||
- [使用](#使用-1)
|
||||
- [MCP 智能体](#mcp-智能体)
|
||||
- [MCP 服务器类型](#mcp-服务器类型)
|
||||
- [配置](#配置-1)
|
||||
- [使用方法](#使用方法)
|
||||
- [功能预览](#功能预览)
|
||||
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
|
||||
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
|
||||
- [常见问题](#常见问题)
|
||||
- [优化建议](#优化建议)
|
||||
- [DataJuicer 智能体](#datajuicer-智能体)
|
||||
- [📋 目录](#-目录)
|
||||
- [这个智能体做了什么?](#这个智能体做了什么)
|
||||
- [架构](#架构)
|
||||
- [快速开始](#快速开始)
|
||||
- [系统要求](#系统要求)
|
||||
- [安装](#安装)
|
||||
- [配置](#配置)
|
||||
- [使用](#使用)
|
||||
- [智能体介绍](#智能体介绍)
|
||||
- [数据处理智能体](#数据处理智能体)
|
||||
- [代码开发智能体](#代码开发智能体)
|
||||
- [高级功能](#高级功能)
|
||||
- [算子检索](#算子检索)
|
||||
- [检索模式](#检索模式)
|
||||
- [使用](#使用-1)
|
||||
- [MCP 智能体](#mcp-智能体)
|
||||
- [MCP 服务器类型](#mcp-服务器类型)
|
||||
- [配置](#配置-1)
|
||||
- [使用方法](#使用方法)
|
||||
- [功能预览](#功能预览)
|
||||
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
|
||||
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
|
||||
- [常见问题](#常见问题)
|
||||
- [优化建议](#优化建议)
|
||||
|
||||
## 这个智能体做了什么?
|
||||
|
||||
@@ -67,7 +68,14 @@ Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态
|
||||
### 安装
|
||||
|
||||
```bash
|
||||
uv pip install -e .
|
||||
# 推荐使用uv
|
||||
uv pip install -r requirements.txt
|
||||
```
|
||||
|
||||
或
|
||||
|
||||
```bash
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
### 配置
|
||||
|
||||
@@ -7,11 +7,16 @@ from agentscope.model import DashScopeChatModel
|
||||
from agentscope.formatter import DashScopeChatFormatter
|
||||
from agentscope.memory import InMemoryMemory
|
||||
from agentscope.agent import UserAgent
|
||||
from agentscope.tool import Toolkit
|
||||
|
||||
from agent_factory import create_agent
|
||||
from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
|
||||
from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit
|
||||
from .agent_factory import create_agent
|
||||
from .prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
|
||||
from .tools import (
|
||||
dj_toolkit,
|
||||
dj_dev_toolkit,
|
||||
mcp_tools,
|
||||
get_mcp_toolkit,
|
||||
agents2toolkit,
|
||||
)
|
||||
|
||||
# Create shared configuration
|
||||
model = DashScopeChatModel(
|
||||
@@ -145,10 +150,14 @@ async def main(
|
||||
if __name__ == "__main__":
|
||||
# Example tasks
|
||||
# project_root = os.path.abspath(os.path.dirname(__file__))
|
||||
# task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl,筛选掉样本中,文本字段长度小于5的样本,以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。"
|
||||
# task = (
|
||||
# f"The data is stored in {project_root}/data/demo-dataset-images.jsonl. "
|
||||
# "Among the samples, the text field length is less than 5 "
|
||||
# "and the image size is less than 100Kb. "
|
||||
# "And save the output results to the ./outputs path."
|
||||
# )
|
||||
#
|
||||
# DJ Development example task:
|
||||
# task = "我想开发一个新的DataJuicer过滤算子,用于过滤掉没有人声的音频文件"
|
||||
# task = "I want to develop a new DataJuicer filter operator to filter out audio files without vocals"
|
||||
#
|
||||
# MCP Agent will be automatically selected for advanced processing tasks
|
||||
fire.Fire(main)
|
||||
|
||||
@@ -1,12 +0,0 @@
|
||||
[project]
|
||||
name = "data-juicer-agent"
|
||||
version = "0.1.0"
|
||||
description = "A data processing agent with data juicer"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"agentscope>=1.0.5",
|
||||
"faiss-cpu>=1.12.0",
|
||||
"langchain-community",
|
||||
"py-data-juicer>=1.4.2",
|
||||
]
|
||||
5
data_juicer_agent/requirements.txt
Normal file
5
data_juicer_agent/requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
agentscope>=1.0.5
|
||||
py-data-juicer>=1.4.2
|
||||
faiss-cpu>=1.12.0
|
||||
fire>=0.7.1
|
||||
langchain-community
|
||||
220
tests/data_juicer_agent_test.py
Normal file
220
tests/data_juicer_agent_test.py
Normal file
@@ -0,0 +1,220 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import os
|
||||
import pytest
|
||||
from unittest.mock import AsyncMock, Mock, patch
|
||||
from agentscope.agent import ReActAgent
|
||||
from agentscope.model import DashScopeChatModel
|
||||
from agentscope.tool import Toolkit
|
||||
from agentscope.message import Msg
|
||||
from agentscope.formatter import DashScopeChatFormatter
|
||||
from agentscope.memory import InMemoryMemory
|
||||
from agentscope.tool import (
|
||||
view_text_file,
|
||||
write_text_file,
|
||||
)
|
||||
|
||||
# Import the main function and related components
|
||||
from data_juicer_agent.main import main
|
||||
from data_juicer_agent.agent_factory import create_agent
|
||||
from data_juicer_agent.tools import (
|
||||
dj_toolkit,
|
||||
dj_dev_toolkit,
|
||||
dj_tools,
|
||||
dj_dev_tools,
|
||||
mcp_tools,
|
||||
execute_safe_command,
|
||||
query_dj_operators,
|
||||
get_basic_files,
|
||||
get_operator_example,
|
||||
configure_data_juicer_path,
|
||||
)
|
||||
|
||||
class TestDataJuicerAgent:
    """Test suite for the data_juicer_agent functionality.

    The fixtures build ``unittest.mock`` stand-ins for the agentscope
    components (toolkit, model, formatter, memory, MCP client, agent) so the
    tests run without network access or a real DashScope API key.
    """

    @pytest.fixture
    def mock_toolkit(self):
        """Return a ``Mock`` constrained to the ``Toolkit`` interface."""
        return Mock(spec=Toolkit)

    @pytest.fixture
    def mock_model(self):
        """Return a ``Mock`` of ``DashScopeChatModel``.

        Its ``call`` attribute is an ``AsyncMock`` that resolves to a canned
        assistant message.
        """
        model = Mock(spec=DashScopeChatModel)
        model.call = AsyncMock(
            return_value=Msg("assistant", "test response", role="assistant"),
        )
        return model

    @pytest.fixture
    def mock_formatter(self):
        """Return a ``Mock`` constrained to ``DashScopeChatFormatter``."""
        return Mock(spec=DashScopeChatFormatter)

    @pytest.fixture
    def mock_memory(self):
        """Return a ``Mock`` constrained to ``InMemoryMemory``."""
        return Mock(spec=InMemoryMemory)

    @pytest.fixture
    def mock_mcp_client(self):
        """Return a mocked MCP client named ``DJ_recipe_flow``.

        ``connect``, ``close``, ``get_callable_function`` and ``list_tools``
        are replaced by ``AsyncMock`` objects so they can be awaited.
        """
        mock_client = Mock()
        mock_client.name = "DJ_recipe_flow"
        mock_client.connect = AsyncMock()
        mock_client.close = AsyncMock()
        mock_client.get_callable_function = AsyncMock()
        mock_client.list_tools = AsyncMock()
        return mock_client

    def create_named_mock_agent(self, name, mock_agent, *args, **kwargs):
        """Build a ``ReActAgent`` mock named ``name`` that mirrors ``mock_agent``.

        Args:
            name: Value assigned to the new mock's ``name`` attribute.
            mock_agent: Template mock whose ``model``, ``formatter``,
                ``toolkit``, ``memory`` and ``__call__`` attributes are
                copied onto the new mock.
            *args: Accepted and ignored, so this method can stand in for
                ``create_agent`` whatever extra arguments it is given.
            **kwargs: Accepted and ignored, for the same reason.

        Returns:
            The newly created ``Mock(spec=ReActAgent)``.
        """
        agent_instance = Mock(spec=ReActAgent)
        agent_instance.model = mock_agent.model
        agent_instance.formatter = mock_agent.formatter
        agent_instance.toolkit = mock_agent.toolkit
        agent_instance.memory = mock_agent.memory
        agent_instance.__call__ = mock_agent.__call__
        agent_instance.name = name
        return agent_instance

    @pytest.fixture
    def mock_agent(
        self,
        mock_model,
        mock_formatter,
        mock_toolkit,
        mock_memory,
    ):
        """Return a ``ReActAgent`` mock wired to the other mock fixtures."""
        agent = Mock(spec=ReActAgent)
        agent.model = mock_model
        agent.formatter = mock_formatter
        agent.toolkit = mock_toolkit
        agent.memory = mock_memory
        # NOTE(review): this stores an AsyncMock under the instance attribute
        # ``__call__``; it may not change what ``agent(...)`` itself returns,
        # since call syntax is resolved on the type - confirm if a test ever
        # needs to await the agent.
        agent.__call__ = AsyncMock(
            return_value=Msg("assistant", "test response", role="assistant"),
        )
        return agent

    @staticmethod
    def _created_agent_name(create_call):
        """Return the ``name`` passed in one recorded ``create_agent`` call.

        Looks at the first positional argument and falls back to the ``name``
        keyword, so the check works however the caller passed it.
        """
        if create_call.args:
            return create_call.args[0]
        return create_call.kwargs.get("name")

    def test_dj_toolkit_initialization(self):
        """Test DJ toolkit initialization and tool registration."""
        for tool_name in (
            "execute_safe_command",
            "view_text_file",
            "write_text_file",
            "query_dj_operators",
        ):
            assert dj_toolkit.tools.get(tool_name) is not None

        # Verify tool list contains exactly the expected tools
        expected_tools = [
            execute_safe_command,
            view_text_file,
            write_text_file,
            query_dj_operators,
        ]
        assert len(dj_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in dj_tools

    def test_dj_dev_toolkit_initialization(self):
        """Test DJ development toolkit initialization and tool registration."""
        for tool_name in (
            "view_text_file",
            "write_text_file",
            "get_basic_files",
            "get_operator_example",
            "configure_data_juicer_path",
        ):
            assert dj_dev_toolkit.tools.get(tool_name) is not None

        # Verify tool list contains exactly the expected tools
        expected_tools = [
            view_text_file,
            write_text_file,
            get_basic_files,
            get_operator_example,
            configure_data_juicer_path,
        ]
        assert len(dj_dev_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in dj_dev_tools

    @pytest.mark.asyncio
    async def test_mcp_tools_list(self, mock_mcp_client):
        """Test that the MCP tools list holds exactly the file view/write tools.

        The ``mock_mcp_client`` fixture is requested but not used by the
        assertions below.
        """
        expected_tools = [view_text_file, write_text_file]
        assert len(mcp_tools) == len(expected_tools)
        for tool in expected_tools:
            assert tool in mcp_tools

    @pytest.mark.asyncio
    async def test_agent_initialization(
        self,
        mock_model,
        mock_formatter,
        mock_toolkit,
        mock_memory,
    ):
        """Test that ``create_agent`` wires the given components into the agent."""
        with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}):
            agent = create_agent(
                name="DataJuicer",
                sys_prompt="You are {name}, a agent.",
                toolkit=mock_toolkit,
                model=mock_model,
                formatter=mock_formatter,
                memory=mock_memory,
            )

        assert agent.name == "DataJuicer"
        assert "DataJuicer" in agent.sys_prompt
        assert agent.model == mock_model
        assert agent.formatter == mock_formatter
        assert agent.toolkit == mock_toolkit
        assert agent.memory == mock_memory

    async def mock_user_func(self, msg=None):
        """Stand-in for the user agent: always reply with ``exit``."""
        return Msg("user", "exit", role="user")

    @pytest.mark.asyncio
    async def test_main_with_multiple_agents_loading(self, mock_agent, mock_mcp_client):
        """Test main function loads multiple agents successfully."""

        def build_agent(name, *args, **kwargs):
            # Replacement for create_agent: hand back a named mock agent.
            return self.create_named_mock_agent(name, mock_agent, *args, **kwargs)

        # One flat ``with`` instead of four nested ones; the patches are
        # entered in the same order as before.
        with patch.dict(
            os.environ,
            {"DASHSCOPE_API_KEY": "test_key"},
        ), patch(
            "data_juicer_agent.tools.mcp_tools._create_clients",
            return_value=[mock_mcp_client],
        ), patch(
            "data_juicer_agent.main.create_agent",
            side_effect=build_agent,
        ) as mock_create_agent, patch(
            "data_juicer_agent.main.user",
            side_effect=self.mock_user_func,
        ):
            await main(
                use_studio=False,
                available_agents=["dj", "dj_dev", "dj_mcp"],
                retrieval_mode="auto",
            )

        # Validate multiple agents are correctly created (dj, dj_dev, dj_mcp, and router)
        assert mock_create_agent.call_count == 4

        created_names = [
            self._created_agent_name(create_call)
            for create_call in mock_create_agent.call_args_list
        ]

        # Validate router agent is created
        assert "Router" in created_names, "Router agent should be created"

        # Validate dj_mcp agent is created
        assert "mcp_datajuicer_agent" in created_names, "MCP agent should be created"
|
||||
|
||||
if __name__ == "__main__":
    # Running this module directly executes its tests verbosely.
    # pytest.main returns the session's exit code; propagate it so a failing
    # run does not exit with status 0.
    raise SystemExit(pytest.main(["-v", __file__]))
|
||||
Reference in New Issue
Block a user