refactor(data_juicer_agent): update imports and add tests

This commit is contained in:
cmgzn
2025-10-30 15:36:25 +08:00
parent 55725959ae
commit 4377fe36cb
7 changed files with 308 additions and 196 deletions

View File

@@ -1,126 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
.idea/
# macOS
.DS_Store
# Used to save loggings and files
*runs/
agentscope.db
tmp*.json
.vscode/
data_agent/
outputs/
tools/op_manager/cache_retrieve/
tools/op_manager/vector_index_cache/

View File

@@ -4,31 +4,32 @@ A multi-agent data processing system built on [AgentScope](https://github.com/mo
## 📋 Table of Contents
- [📋 Table of Contents](#-table-of-contents)
- [What Does This Agent Do?](#what-does-this-agent-do)
- [Architecture](#architecture)
- [Quick Start](#quick-start)
- [System Requirements](#system-requirements)
- [Installation](#installation)
- [Configuration](#configuration)
- [Usage](#usage)
- [Agent Introduction](#agent-introduction)
- [Data Processing Agent](#data-processing-agent)
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
- [Advanced Features](#advanced-features)
- [Operator Retrieval](#operator-retrieval)
- [Retrieval Modes](#retrieval-modes)
- [Usage](#usage-1)
- [MCP Agent](#mcp-agent)
- [MCP Server Types](#mcp-server-types)
- [Configuration](#configuration-1)
- [Usage Methods](#usage-methods)
- [Feature Preview](#feature-preview)
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
- [Troubleshooting](#troubleshooting)
- [Common Issues](#common-issues)
- [Optimization Recommendations](#optimization-recommendations)
- [DataJuicer Agent](#datajuicer-agent)
- [📋 Table of Contents](#-table-of-contents)
- [What Does This Agent Do?](#what-does-this-agent-do)
- [Architecture](#architecture)
- [Quick Start](#quick-start)
- [System Requirements](#system-requirements)
- [Installation](#installation)
- [Configuration](#configuration)
- [Usage](#usage)
- [Agent Introduction](#agent-introduction)
- [Data Processing Agent](#data-processing-agent)
- [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent)
- [Advanced Features](#advanced-features)
- [Operator Retrieval](#operator-retrieval)
- [Retrieval Modes](#retrieval-modes)
- [Usage](#usage-1)
- [MCP Agent](#mcp-agent)
- [MCP Server Types](#mcp-server-types)
- [Configuration](#configuration-1)
- [Usage Methods](#usage-methods)
- [Feature Preview](#feature-preview)
- [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available)
- [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development)
- [Troubleshooting](#troubleshooting)
- [Common Issues](#common-issues)
- [Optimization Recommendations](#optimization-recommendations)
## What Does This Agent Do?
@@ -68,7 +69,14 @@ Router Agent ──┐
### Installation
```bash
uv pip install -e .
# Using uv is recommended
uv pip install -r requirements.txt
```
or
```bash
pip install -r requirements.txt
```
### Configuration

View File

@@ -4,30 +4,31 @@
## 📋 目录
- [📋 目录](#-目录)
- [这个智能体做了什么?](#这个智能体做了什么)
- [架构](#架构)
- [快速开始](#快速开始)
- [系统要求](#系统要求)
- [安装](#安装)
- [配置](#配置)
- [使用](#使用)
- [智能体介绍](#智能体介绍)
- [数据处理智能体](#数据处理智能体)
- [代码开发智能体](#代码开发智能体)
- [高级功能](#高级功能)
- [算子检索](#算子检索)
- [检索模式](#检索模式)
- [使用](#使用-1)
- [MCP 智能体](#mcp-智能体)
- [MCP 服务器类型](#mcp-服务器类型)
- [配置](#配置-1)
- [使用方法](#使用方法)
- [功能预览](#功能预览)
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
- [常见问题](#常见问题)
- [优化建议](#优化建议)
- [DataJuicer 智能体](#datajuicer-智能体)
- [📋 目录](#-目录)
- [这个智能体做了什么?](#这个智能体做了什么)
- [架构](#架构)
- [快速开始](#快速开始)
- [系统要求](#系统要求)
- [安装](#安装)
- [配置](#配置)
- [使用](#使用)
- [智能体介绍](#智能体介绍)
- [数据处理智能体](#数据处理智能体)
- [代码开发智能体](#代码开发智能体)
- [高级功能](#高级功能)
- [算子检索](#算子检索)
- [检索模式](#检索模式)
- [使用](#使用-1)
- [MCP 智能体](#mcp-智能体)
- [MCP 服务器类型](#mcp-服务器类型)
- [配置](#配置-1)
- [使用方法](#使用方法)
- [功能预览](#功能预览)
- [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用)
- [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中)
- [常见问题](#常见问题)
- [优化建议](#优化建议)
## 这个智能体做了什么?
@@ -67,7 +68,14 @@ Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态
### 安装
```bash
uv pip install -e .
# 推荐使用uv
uv pip install -r requirements.txt
```
或
```bash
pip install -r requirements.txt
```
### 配置

View File

@@ -7,11 +7,16 @@ from agentscope.model import DashScopeChatModel
from agentscope.formatter import DashScopeChatFormatter
from agentscope.memory import InMemoryMemory
from agentscope.agent import UserAgent
from agentscope.tool import Toolkit
from agent_factory import create_agent
from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit
from .agent_factory import create_agent
from .prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT
from .tools import (
dj_toolkit,
dj_dev_toolkit,
mcp_tools,
get_mcp_toolkit,
agents2toolkit,
)
# Create shared configuration
model = DashScopeChatModel(
@@ -145,10 +150,14 @@ async def main(
if __name__ == "__main__":
# Example tasks
# project_root = os.path.abspath(os.path.dirname(__file__))
# task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl筛选掉样本中文本字段长度小于5的样本以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。"
# task = (
# f"The data is stored in {project_root}/data/demo-dataset-images.jsonl. "
# "Filter out samples whose text field length is less than 5 "
# "and samples whose image size is less than 100Kb. "
# "Save the output results to the ./outputs path."
# )
#
# DJ Development example task:
# task = "我想开发一个新的DataJuicer过滤算子用于过滤掉没有人声的音频文件"
# task = "I want to develop a new DataJuicer filter operator to filter out audio files without vocals"
#
# MCP Agent will be automatically selected for advanced processing tasks
fire.Fire(main)

View File

@@ -1,12 +0,0 @@
[project]
name = "data-juicer-agent"
version = "0.1.0"
description = "A data processing agent with data juicer"
readme = "README.md"
requires-python = ">=3.11"
dependencies = [
"agentscope>=1.0.5",
"faiss-cpu>=1.12.0",
"langchain-community",
"py-data-juicer>=1.4.2",
]

View File

@@ -0,0 +1,5 @@
agentscope>=1.0.5
py-data-juicer>=1.4.2
faiss-cpu>=1.12.0
fire>=0.7.1
langchain-community