diff --git a/data_juicer_agent/.gitignore b/data_juicer_agent/.gitignore deleted file mode 100644 index fd0b546..0000000 --- a/data_juicer_agent/.gitignore +++ /dev/null @@ -1,126 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -pip-wheel-metadata/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -.python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -.idea/ - -# macOS -.DS_Store - -# Used to save loggings and files -*runs/ -agentscope.db -tmp*.json -.vscode/ -data_agent/ -outputs/ -tools/op_manager/cache_retrieve/ -tools/op_manager/vector_index_cache/ \ No newline at end of file diff --git a/data_juicer_agent/README.md b/data_juicer_agent/README.md index bd357e3..375ca4a 100644 --- a/data_juicer_agent/README.md +++ b/data_juicer_agent/README.md @@ -4,31 +4,32 @@ A multi-agent data processing system built on [AgentScope](https://github.com/mo ## 📋 Table of Contents -- [📋 Table of Contents](#-table-of-contents) -- [What Does This Agent Do?](#what-does-this-agent-do) -- [Architecture](#architecture) -- [Quick Start](#quick-start) - - [System Requirements](#system-requirements) - - [Installation](#installation) - - [Configuration](#configuration) - - [Usage](#usage) -- [Agent Introduction](#agent-introduction) - - [Data Processing Agent](#data-processing-agent) - - [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent) -- [Advanced Features](#advanced-features) - - [Operator Retrieval](#operator-retrieval) - - [Retrieval Modes](#retrieval-modes) - - [Usage](#usage-1) - - [MCP Agent](#mcp-agent) - - [MCP Server Types](#mcp-server-types) - - [Configuration](#configuration-1) - - [Usage Methods](#usage-methods) -- [Feature Preview](#feature-preview) - - [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available) - - [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development) -- 
[Troubleshooting](#troubleshooting) - - [Common Issues](#common-issues) - - [Optimization Recommendations](#optimization-recommendations) +- [DataJuicer Agent](#datajuicer-agent) + - [📋 Table of Contents](#-table-of-contents) + - [What Does This Agent Do?](#what-does-this-agent-do) + - [Architecture](#architecture) + - [Quick Start](#quick-start) + - [System Requirements](#system-requirements) + - [Installation](#installation) + - [Configuration](#configuration) + - [Usage](#usage) + - [Agent Introduction](#agent-introduction) + - [Data Processing Agent](#data-processing-agent) + - [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent) + - [Advanced Features](#advanced-features) + - [Operator Retrieval](#operator-retrieval) + - [Retrieval Modes](#retrieval-modes) + - [Usage](#usage-1) + - [MCP Agent](#mcp-agent) + - [MCP Server Types](#mcp-server-types) + - [Configuration](#configuration-1) + - [Usage Methods](#usage-methods) + - [Feature Preview](#feature-preview) + - [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available) + - [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development) + - [Troubleshooting](#troubleshooting) + - [Common Issues](#common-issues) + - [Optimization Recommendations](#optimization-recommendations) ## What Does This Agent Do? @@ -68,7 +69,14 @@ Router Agent ──┐ ### Installation ```bash -uv pip install -e . 
+# Recommended to use uv +uv pip install -r requirements.txt +``` + +or + +```bash +pip install -r requirements.txt ``` ### Configuration diff --git a/data_juicer_agent/README_ZH.md b/data_juicer_agent/README_ZH.md index 4aa2a62..9ecf63c 100644 --- a/data_juicer_agent/README_ZH.md +++ b/data_juicer_agent/README_ZH.md @@ -4,30 +4,31 @@ ## 📋 目录 -- [📋 目录](#-目录) -- [这个智能体做了什么?](#这个智能体做了什么) -- [架构](#架构) -- [快速开始](#快速开始) - - [系统要求](#系统要求) - - [安装](#安装) - - [配置](#配置) - - [使用](#使用) -- [智能体介绍](#智能体介绍) - - [数据处理智能体](#数据处理智能体) - - [代码开发智能体](#代码开发智能体) -- [高级功能](#高级功能) - - [算子检索](#算子检索) - - [检索模式](#检索模式) - - [使用](#使用-1) - - [MCP 智能体](#mcp-智能体) - - [MCP 服务器类型](#mcp-服务器类型) - - [配置](#配置-1) - - [使用方法](#使用方法) -- [功能预览](#功能预览) - - [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用) - - [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中) - - [常见问题](#常见问题) - - [优化建议](#优化建议) +- [DataJuicer 智能体](#datajuicer-智能体) + - [📋 目录](#-目录) + - [这个智能体做了什么?](#这个智能体做了什么) + - [架构](#架构) + - [快速开始](#快速开始) + - [系统要求](#系统要求) + - [安装](#安装) + - [配置](#配置) + - [使用](#使用) + - [智能体介绍](#智能体介绍) + - [数据处理智能体](#数据处理智能体) + - [代码开发智能体](#代码开发智能体) + - [高级功能](#高级功能) + - [算子检索](#算子检索) + - [检索模式](#检索模式) + - [使用](#使用-1) + - [MCP 智能体](#mcp-智能体) + - [MCP 服务器类型](#mcp-服务器类型) + - [配置](#配置-1) + - [使用方法](#使用方法) + - [功能预览](#功能预览) + - [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用) + - [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中) + - [常见问题](#常见问题) + - [优化建议](#优化建议) ## 这个智能体做了什么? @@ -67,7 +68,14 @@ Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态 ### 安装 ```bash -uv pip install -e . 
+# 推荐使用uv +uv pip install -r requirements.txt +``` + +或 + +```bash +pip install -r requirements.txt ``` ### 配置 diff --git a/data_juicer_agent/main.py b/data_juicer_agent/main.py index acdcf90..42aab76 100644 --- a/data_juicer_agent/main.py +++ b/data_juicer_agent/main.py @@ -7,11 +7,16 @@ from agentscope.model import DashScopeChatModel from agentscope.formatter import DashScopeChatFormatter from agentscope.memory import InMemoryMemory from agentscope.agent import UserAgent -from agentscope.tool import Toolkit -from agent_factory import create_agent -from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT -from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit +from .agent_factory import create_agent +from .prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT +from .tools import ( + dj_toolkit, + dj_dev_toolkit, + mcp_tools, + get_mcp_toolkit, + agents2toolkit, +) # Create shared configuration model = DashScopeChatModel( @@ -145,10 +150,14 @@ async def main( if __name__ == "__main__": # Example tasks # project_root = os.path.abspath(os.path.dirname(__file__)) - # task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl,筛选掉样本中,文本字段长度小于5的样本,以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。" + # task = ( + # f"The data is stored in {project_root}/data/demo-dataset-images.jsonl. " + # "Filter out the samples whose text field length is less than 5 " + # "and the samples whose image size is less than 100Kb. " + # "And save the output results to the ./outputs path." 
+ # ) # # DJ Development example task: - # task = "我想开发一个新的DataJuicer过滤算子,用于过滤掉没有人声的音频文件" + # task = "I want to develop a new DataJuicer filter operator to filter out audio files without vocals" # - # MCP Agent will be automatically selected for advanced processing tasks fire.Fire(main) diff --git a/data_juicer_agent/pyproject.toml b/data_juicer_agent/pyproject.toml deleted file mode 100644 index fd0659f..0000000 --- a/data_juicer_agent/pyproject.toml +++ /dev/null @@ -1,12 +0,0 @@ -[project] -name = "data-juicer-agent" -version = "0.1.0" -description = "A data processing agent with data juicer" -readme = "README.md" -requires-python = ">=3.11" -dependencies = [ - "agentscope>=1.0.5", - "faiss-cpu>=1.12.0", - "langchain-community", - "py-data-juicer>=1.4.2", -] diff --git a/data_juicer_agent/requirements.txt b/data_juicer_agent/requirements.txt new file mode 100644 index 0000000..a7d5ccf --- /dev/null +++ b/data_juicer_agent/requirements.txt @@ -0,0 +1,5 @@ +agentscope>=1.0.5 +py-data-juicer>=1.4.2 +faiss-cpu>=1.12.0 +fire>=0.7.1 +langchain-community \ No newline at end of file diff --git a/tests/data_juicer_agent_test.py b/tests/data_juicer_agent_test.py new file mode 100644 index 0000000..80f2068 --- /dev/null +++ b/tests/data_juicer_agent_test.py @@ -0,0 +1,220 @@ +# -*- coding: utf-8 -*- +import os +import pytest +from unittest.mock import AsyncMock, Mock, patch +from agentscope.agent import ReActAgent +from agentscope.model import DashScopeChatModel +from agentscope.tool import Toolkit +from agentscope.message import Msg +from agentscope.formatter import DashScopeChatFormatter +from agentscope.memory import InMemoryMemory +from agentscope.tool import ( + view_text_file, + write_text_file, +) + +# Import the main function and related components +from data_juicer_agent.main import main +from data_juicer_agent.agent_factory import create_agent +from data_juicer_agent.tools import ( + dj_toolkit, + dj_dev_toolkit, + dj_tools, + dj_dev_tools, + mcp_tools, + 
execute_safe_command, + query_dj_operators, + get_basic_files, + get_operator_example, + configure_data_juicer_path, +) + +class TestDataJuicerAgent: + """Test suite for the data_juicer_agent functionality""" + + @pytest.fixture + def mock_toolkit(self): + """Create a mocked Toolkit instance""" + return Mock(spec=Toolkit) + + @pytest.fixture + def mock_model(self): + """Create a mocked DashScopeChatModel""" + model = Mock(spec=DashScopeChatModel) + model.call = AsyncMock( + return_value=Msg("assistant", "test response", role="assistant"), + ) + return model + + @pytest.fixture + def mock_formatter(self): + """Create a mocked DashScopeChatFormatter""" + return Mock(spec=DashScopeChatFormatter) + + @pytest.fixture + def mock_memory(self): + """Create a mocked InMemoryMemory""" + return Mock(spec=InMemoryMemory) + + @pytest.fixture + def mock_mcp_client(self): + """Create a mocked MCP client""" + mock_client = Mock() + mock_client.name = "DJ_recipe_flow" + mock_client.connect = AsyncMock() + mock_client.close = AsyncMock() + mock_client.get_callable_function = AsyncMock() + mock_client.list_tools = AsyncMock() + return mock_client + + def create_named_mock_agent(self, name, mock_agent, *args, **kwargs): + """Create a named mock agent for testing""" + agent_instance = Mock(spec=ReActAgent) + agent_instance.model = mock_agent.model + agent_instance.formatter = mock_agent.formatter + agent_instance.toolkit = mock_agent.toolkit + agent_instance.memory = mock_agent.memory + agent_instance.__call__ = mock_agent.__call__ + agent_instance.name = name + return agent_instance + + @pytest.fixture + def mock_agent( + self, + mock_model, + mock_formatter, + mock_toolkit, + mock_memory, + ): + """Create a mocked ReActAgent instance""" + agent = Mock(spec=ReActAgent) + agent.model = mock_model + agent.formatter = mock_formatter + agent.toolkit = mock_toolkit + agent.memory = mock_memory + agent.__call__ = AsyncMock( + return_value=Msg("assistant", "test response", role="assistant"), 
+ ) + return agent + + def test_dj_toolkit_initialization(self): + """Test DJ toolkit initialization and tool registration""" + assert dj_toolkit.tools.get("execute_safe_command") is not None + assert dj_toolkit.tools.get("view_text_file") is not None + assert dj_toolkit.tools.get("write_text_file") is not None + assert dj_toolkit.tools.get("query_dj_operators") is not None + + # Verify tool list contains expected tools + expected_tools = [ + execute_safe_command, + view_text_file, + write_text_file, + query_dj_operators, + ] + assert len(dj_tools) == len(expected_tools) + for tool in expected_tools: + assert tool in dj_tools + + def test_dj_dev_toolkit_initialization(self): + """Test DJ development toolkit initialization and tool registration""" + assert dj_dev_toolkit.tools.get("view_text_file") is not None + assert dj_dev_toolkit.tools.get("write_text_file") is not None + assert dj_dev_toolkit.tools.get("get_basic_files") is not None + assert dj_dev_toolkit.tools.get("get_operator_example") is not None + assert dj_dev_toolkit.tools.get("configure_data_juicer_path") is not None + + # Verify tool list contains expected tools + expected_tools = [ + view_text_file, + write_text_file, + get_basic_files, + get_operator_example, + configure_data_juicer_path, + ] + assert len(dj_dev_tools) == len(expected_tools) + for tool in expected_tools: + assert tool in dj_dev_tools + + @pytest.mark.asyncio + async def test_mcp_tools_list(self, mock_mcp_client): + """Test MCP tools list contains expected tools and MCP client binding""" + expected_tools = [view_text_file, write_text_file] + assert len(mcp_tools) == len(expected_tools) + for tool in expected_tools: + assert tool in mcp_tools + + @pytest.mark.asyncio + async def test_agent_initialization( + self, + mock_model, + mock_formatter, + mock_toolkit, + mock_memory, + ): + """Test ReActAgent initialization""" + with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}): + agent = create_agent( + name="DataJuicer", + 
sys_prompt="You are {name}, a agent.", + toolkit=mock_toolkit, + model=mock_model, + formatter=mock_formatter, + memory=mock_memory, + ) + + assert agent.name == "DataJuicer" + assert "DataJuicer" in agent.sys_prompt + assert agent.model == mock_model + assert agent.formatter == mock_formatter + assert agent.toolkit == mock_toolkit + assert agent.memory == mock_memory + + async def mock_user_func(self, msg=None): + return Msg("user", "exit", role="user") + + @pytest.mark.asyncio + async def test_main_with_multiple_agents_loading(self, mock_agent, mock_mcp_client): + """Test main function loads multiple agents successfully""" + with patch.dict(os.environ, {"DASHSCOPE_API_KEY": "test_key"}): + mock_mcp_clients = [mock_mcp_client] + + with patch( + "data_juicer_agent.tools.mcp_tools._create_clients", + return_value=mock_mcp_clients, + ): + with patch( + "data_juicer_agent.main.create_agent", + side_effect=lambda name, *args, **kwargs: self.create_named_mock_agent( + name, mock_agent, *args, **kwargs + ), + ) as mock_create_agent: + with patch( + "data_juicer_agent.main.user", side_effect=self.mock_user_func + ): + + await main( + use_studio=False, + available_agents=["dj", "dj_dev", "dj_mcp"], + retrieval_mode="auto", + ) + + # Validate multiple agents are correctly created (dj, dj_dev, dj_mcp, and router) + assert mock_create_agent.call_count == 4 + + # Validate router agent is created + create_calls = mock_create_agent.call_args_list + router_agent_created = any( + call[0][0] == "Router" + for call in create_calls # First parameter is name + ) + assert router_agent_created, "Router agent should be created" + + # Validate dj_mcp agent is created + mcp_agent_created = any( + call[0][0] == "mcp_datajuicer_agent" + for call in create_calls # First parameter is name + ) + assert mcp_agent_created, "MCP agent should be created" + +if __name__ == "__main__": + pytest.main(["-v", __file__]) \ No newline at end of file