diff --git a/README.md b/README.md index d9b8f98..f16f8a5 100644 --- a/README.md +++ b/README.md @@ -90,6 +90,8 @@ It includes **agent deployment** and **secure sandboxed tool execution**, and ca │ ├── multiagent_concurrent/ # Concurrent multi-agent task execution │ └── meta_planner_agent/ # Planning agent with tool orchestration │ +├── data_juicer_agent/ # Data processing multi-agent system +│ ├── sample_template/ # Template for new sample contributions └── README.md ``` @@ -119,6 +121,23 @@ It includes **agent deployment** and **secure sandboxed tool execution**, and ca | | functionality/plan | ✅ | ❌ | Task planning with ReAct agent | | | functionality/rag | ✅ | ❌ | Retrieval-Augmented Generation (RAG) integration | | | functionality/stream_printing_messages | ✅ | ❌ | Real-time message streaming and printing | +| **Data Processing** | data_juicer_agent/ | ✅ | ❌ | Multi-agent data processing with Data-Juicer | + +------ + +## 🌟 Featured Examples + +### DataJuicer Agent + +A powerful multi-agent data processing system that leverages Data-Juicer's 200+ operators for intelligent data processing: + +- **Intelligent Query**: Find suitable operators from 200+ data processing operators +- **Automated Pipeline**: Generate Data-Juicer YAML configurations from natural language +- **Custom Development**: Create domain-specific operators with AI assistance +- **Multiple Retrieval Modes**: LLM-based and vector-based operator matching +- **MCP Integration**: Native Model Context Protocol support + +📖 **Documentation**: [English](data_juicer_agent/README.md) | [中文](data_juicer_agent/README_ZH.md) ------ diff --git a/README_zh.md b/README_zh.md index 84f9735..d3ed51a 100644 --- a/README_zh.md +++ b/README_zh.md @@ -90,6 +90,8 @@ AgentScope Runtime 是一个**全面的运行时框架**,主要解决部署和 │ ├── multiagent_concurrent/ # 多 Agent 并发任务执行 │ └── meta_planner_agent/ # 带工具编排的计划 Agent │ +├── data_juicer_agent/ # 数据处理多智能体系统 +│ ├── sample_template/ # 新样例贡献模板 └── README.md ``` @@ -119,6 +121,23 @@ AgentScope 
Runtime 是一个**全面的运行时框架**,主要解决部署和 | | functionality/plan | ✅ | ❌ | 使用 ReAct Agent 规划任务 | | | functionality/rag | ✅ | ❌ | 检索增强生成 (RAG) 集成 | | | functionality/stream_printing_messages | ✅ | ❌ | 实时信息流输出与打印 | +| **数据处理** | data_juicer_agent/ | ✅ | ❌ | 基于 Data-Juicer 的多智能体数据处理 | + +--- + +## 🌟 特色示例 + +### DataJuicer 智能体 + +一个强大的数据处理多智能体系统,利用 Data-Juicer 的 200+ 算子进行智能数据处理: + +- **智能查询**:从 200+ 数据处理算子中找到合适的算子 +- **自动化流程**:从自然语言描述生成 Data-Juicer YAML 配置 +- **自定义开发**:通过 AI 辅助创建领域特定的算子 +- **多种检索模式**:基于 LLM 和向量的算子匹配 +- **MCP 集成**:原生模型上下文协议支持 + +📖 **文档**:[English](data_juicer_agent/README.md) | [中文](data_juicer_agent/README_ZH.md) --- diff --git a/data_juicer_agent/.gitignore b/data_juicer_agent/.gitignore new file mode 100644 index 0000000..fd0b546 --- /dev/null +++ b/data_juicer_agent/.gitignore @@ -0,0 +1,126 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +.idea/ + +# macOS +.DS_Store + +# Used to save loggings and files +*runs/ +agentscope.db +tmp*.json +.vscode/ +data_agent/ +outputs/ +tools/op_manager/cache_retrieve/ +tools/op_manager/vector_index_cache/ \ No newline at end of file diff --git a/data_juicer_agent/README.md b/data_juicer_agent/README.md new file mode 100644 index 0000000..bd357e3 --- /dev/null +++ b/data_juicer_agent/README.md @@ -0,0 +1,254 @@ +# DataJuicer Agent + +A multi-agent data processing system built on [AgentScope](https://github.com/modelscope/agentscope) and [Data-Juicer (DJ)](https://github.com/modelscope/data-juicer). This project demonstrates how to leverage the natural language understanding capabilities of large language models, enabling non-expert users to easily harness the powerful data processing capabilities of Data-Juicer. 
+ +## 📋 Table of Contents + +- [📋 Table of Contents](#-table-of-contents) +- [What Does This Agent Do?](#what-does-this-agent-do) +- [Architecture](#architecture) +- [Quick Start](#quick-start) + - [System Requirements](#system-requirements) + - [Installation](#installation) + - [Configuration](#configuration) + - [Usage](#usage) +- [Agent Introduction](#agent-introduction) + - [Data Processing Agent](#data-processing-agent) + - [Code Development Agent (DJ Dev Agent)](#code-development-agent-dj-dev-agent) +- [Advanced Features](#advanced-features) + - [Operator Retrieval](#operator-retrieval) + - [Retrieval Modes](#retrieval-modes) + - [Usage](#usage-1) + - [MCP Agent](#mcp-agent) + - [MCP Server Types](#mcp-server-types) + - [Configuration](#configuration-1) + - [Usage Methods](#usage-methods) +- [Feature Preview](#feature-preview) + - [Data-Juicer Q\&A Agent (Demo Available)](#data-juicer-qa-agent-demo-available) + - [Data Analysis and Visualization Agent (In Development)](#data-analysis-and-visualization-agent-in-development) +- [Troubleshooting](#troubleshooting) + - [Common Issues](#common-issues) + - [Optimization Recommendations](#optimization-recommendations) + +## What Does This Agent Do? + +Data-Juicer (DJ) is a one-stop system for text and multimodal data processing for large language models. It provides nearly 200 core data processing operators, covering multimodal data such as text, images, and videos, and supports the full pipeline of data analysis, cleaning, and synthesis. 
+ +After running this example, you can: +- **Intelligent Query**: Find suitable operators from nearly 200 data processing operators for your data scenarios +- **Automated Pipeline**: Describe your data processing needs, automatically generate Data-Juicer YAML configurations and execute them +- **Custom Extension**: Quickly develop custom operators for specific scenarios + +## Architecture + +``` +User Query + ↓ +Router Agent ──┐ + ├── Data Processing Agent (DJ Agent) + | ├── General File Read/Write Tools + │ ├── query_dj_operators (Query DataJuicer operators) + │ └── execute_safe_command (Execute safe commands including dj-process, dj-analyze) + │ + └── Code Development Agent (DJ Dev Agent) + ├── General File Read/Write Tools + ├── get_basic_files (Get basic development knowledge) + ├── get_operator_example (Get operator source code examples related to requirements) + └── configure_data_juicer_path (Configure DataJuicer path) +``` + +## Quick Start + +### System Requirements + +- Python 3.8+ +- Valid DashScope API key +- Optional: Data-Juicer source code (for custom operator development) + +### Installation + +```bash +uv pip install -e . +``` + +### Configuration + +1. **Set API Key** + +```bash +export DASHSCOPE_API_KEY="your-dashscope-key" +``` + +2. 
**Optional: Configure Data-Juicer Path (for custom operator development)** + +```bash +export DATA_JUICER_PATH="your-data-juicer-path" +``` + +> **Tip**: You can also set this during runtime through conversation, for example: +> - "Help me set the DataJuicer path: /path/to/data-juicer" +> - "Help me update the DataJuicer path: /path/to/data-juicer" + +### Usage + +Choose the running mode using the `-u` or `--use_studio` parameter: + +```bash +# Use AgentScope Studio (provides interactive interface) +python main.py --use_studio true + +# Or use command-line mode (default) +python main.py +``` + +## Agent Introduction + +### Data Processing Agent + +Responsible for interacting with Data-Juicer and executing actual data processing tasks. Supports automatic operator recommendation from natural language descriptions, configuration generation, and execution. + +**Typical Use Cases:** +- **Data Cleaning**: Deduplication, removal of low-quality samples, format standardization +- **Multimodal Processing**: Process text, image, and video data simultaneously +- **Batch Conversion**: Format conversion, data augmentation, feature extraction + +
+View Complete Example Log (from AgentScope Studio) + +
+ +### Code Development Agent (DJ Dev Agent) + +Assists in developing custom data processing operators, powered by the `qwen3-coder-480b-a35b-instruct` model by default. + +**Typical Use Cases:** +- **Develop domain-specific filter or transformation operators** +- **Integrate proprietary data processing logic** +- **Extend Data-Juicer capabilities for specific scenarios** + +
+View Complete Example Log (from AgentScope Studio) + +
+ +## Advanced Features + +### Operator Retrieval + +DJ Agent implements an intelligent operator retrieval tool that quickly finds the most relevant operators from Data-Juicer's nearly 200 operators through an independent LLM query process. This is a key component enabling the data processing agent and code development agent to run accurately. + +We provide three retrieval modes to choose from based on different scenarios: + +#### Retrieval Modes + +**LLM Retrieval (default)** +- Uses the Qwen-Turbo model to match the most relevant operators +- Provides detailed matching reasons and relevance scores +- Suitable for scenarios requiring high-precision matching, but consumes more tokens + +**Vector Retrieval (vector)** +- Based on DashScope text embedding and FAISS similarity search +- Fast and efficient, suitable for large-scale retrieval scenarios + +**Auto Mode (auto)** +- Prioritizes LLM retrieval, automatically falls back to vector retrieval on failure + +#### Usage + +Specify the retrieval mode using the `-r` or `--retrieve_mode` parameter: + +```bash +python main.py --retrieve_mode vector +``` + +For more parameter descriptions, see `python main.py --help` + +### MCP Agent + +Data-Juicer provides MCP (Model Context Protocol) services that can directly obtain operator information and execute data processing through native interfaces, making it easy to migrate and integrate without separate LLM queries and command-line calls. 
+ +#### MCP Server Types + +Data-Juicer provides two MCP server modes: + +**Recipe-Flow (Data Recipe)** +- Filter by operator type and tags +- Support combining multiple operators into data recipes for execution + +**Granular-Operators (Fine-grained Operators)** +- Provide each operator as an independent tool +- Flexibly specify operator lists through environment variables +- Build fully customized data processing pipelines + +For detailed information, please refer to: [Data-Juicer MCP Service Documentation](https://modelscope.github.io/data-juicer/en/main/docs/DJ_service.html#mcp-server) + +> **Note**: The Data-Juicer MCP server is currently in early development, and features and tools may change with ongoing development. + +#### Configuration + +Configure the service address in `configs/mcp_config.json`: + +```json +{ + "mcpServers": { + "DJ_recipe_flow": { + "url": "http://127.0.0.1:8080/sse" + } + } +} +``` + +#### Usage Methods + +Enable MCP Agent to replace DJ Agent: + +```bash +# Enable MCP Agent and Dev Agent +python main.py --available_agents [dj_mcp, dj_dev] + +# Or use shorthand +python main.py -a [dj_mcp, dj_dev] +``` + +## Feature Preview + +The Data-Juicer agent ecosystem is rapidly expanding. Here are the new agents currently in development or planned: + +### Data-Juicer Q&A Agent (Demo Available) + +Provides users with detailed answers about Data-Juicer operators, concepts, and best practices. + + + +### Data Analysis and Visualization Agent (In Development) + +Generates data analysis and visualization results, expected to be released soon. + +## Troubleshooting + +### Common Issues + +**Q: How to get DashScope API key?** +A: Visit [DashScope official website](https://dashscope.aliyun.com/) to register an account and apply for an API key. + +**Q: Why does operator retrieval fail?** +A: Please check network connection and API key configuration, or try switching to vector retrieval mode. 
+ +**Q: How to debug custom operators?** +A: Ensure Data-Juicer path is configured correctly and check the example code provided by the code development agent. + +**Q: What to do if MCP service connection fails?** +A: Check if the MCP server is running and confirm the URL address in the configuration file is correct. + +### Optimization Recommendations + +- For large-scale data processing, it is recommended to use DataJuicer's distributed mode +- Set batch size appropriately to balance memory usage and processing speed +- For more advanced data processing features (synthesis, Data-Model Co-Development), please refer to DataJuicer [documentation](https://modelscope.github.io/data-juicer/en/main/index.html) + +--- + +**Contributing**: Welcome to submit Issues and Pull Requests to improve AgentScope, DataJuicer Agent, and [DataJuicer](https://modelscope.github.io/data-juicer/en/main/index.html#contribution-and-acknowledgements). If you encounter problems during use or have feature suggestions, please feel free to contact us. 
\ No newline at end of file diff --git a/data_juicer_agent/README_ZH.md b/data_juicer_agent/README_ZH.md new file mode 100644 index 0000000..4aa2a62 --- /dev/null +++ b/data_juicer_agent/README_ZH.md @@ -0,0 +1,253 @@ +# DataJuicer 智能体 + +基于 [AgentScope](https://github.com/modelscope/agentscope) 和 [Data-Juicer (DJ)](https://github.com/modelscope/data-juicer) 构建的数据处理多智能体系统。该项目展示了如何利用大模型的自然语言理解能力,让非专家用户也能轻松使用 Data-Juicer 的强大数据处理能力。 + +## 📋 目录 + +- [📋 目录](#-目录) +- [这个智能体做了什么?](#这个智能体做了什么) +- [架构](#架构) +- [快速开始](#快速开始) + - [系统要求](#系统要求) + - [安装](#安装) + - [配置](#配置) + - [使用](#使用) +- [智能体介绍](#智能体介绍) + - [数据处理智能体](#数据处理智能体) + - [代码开发智能体](#代码开发智能体) +- [高级功能](#高级功能) + - [算子检索](#算子检索) + - [检索模式](#检索模式) + - [使用](#使用-1) + - [MCP 智能体](#mcp-智能体) + - [MCP 服务器类型](#mcp-服务器类型) + - [配置](#配置-1) + - [使用方法](#使用方法) +- [功能预览](#功能预览) + - [Data-Juicer 问答智能体 (演示可用)](#data-juicer-问答智能体-演示可用) + - [数据分析与可视化智能体 (开发中)](#数据分析与可视化智能体-开发中) + - [常见问题](#常见问题) + - [优化建议](#优化建议) + +## 这个智能体做了什么? + +Data-Juicer (DJ) 是一个一站式系统,面向大模型的文本及多模态数据处理。它提供了近200个核心数据处理算子,覆盖文本、图像、视频等多模态数据,支持数据分析、清洗、合成等全流程。 + +运行本示例后,您可以: +- **智能查询**:从近200个数据处理算子中找到适合您数据场景的算子 +- **自动化流程**:描述数据处理需求,自动生成 Data-Juicer YAML 配置并执行 +- **自定义扩展**:为特定场景快速开发自定义算子 + +## 架构 + +``` +用户查询 + ↓ +路由智能体 ──┐ + ├── 数据处理智能体 (DJ 智能体) + | ├── 通用文件读写工具 + │ ├── query_dj_operators (查询DataJuicer算子) + │ └── execute_safe_command (执行包含dj-process, dj-analyze在内的安全命令) + │ + └── 代码开发智能体 (DJ Dev 智能体) + ├── 通用文件读写工具 + ├── get_basic_files (获取基础的开发知识) + ├── get_operator_example (获取与需求相关的算子源码示例) + └── configure_data_juicer_path (配置DataJuicer路径) +``` + +## 快速开始 + +### 系统要求 + +- Python 3.8+ +- 有效的 DashScope API 密钥 +- 可选:Data-Juicer 源码(用于自定义算子开发) + +### 安装 + +```bash +uv pip install -e . +``` + +### 配置 + +1. **设置 API 密钥** + +```bash +export DASHSCOPE_API_KEY="your-dashscope-key" +``` + +2. 
**可选:配置 Data-Juicer 路径(用于自定义算子开发)** + +```bash +export DATA_JUICER_PATH="your-data-juicer-path" +``` + +> **提示**:也可以在运行时通过对话设置,例如: +> - "帮我设置 DataJuicer 路径:/path/to/data-juicer" +> - "帮我更新 DataJuicer 路径:/path/to/data-juicer" + +### 使用 + +通过 `-u` 或 `--use_studio` 参数选择运行方式: + +```bash +# 使用 AgentScope Studio(提供交互式界面) +python main.py --use_studio true + +# 或使用命令行模式(默认) +python main.py +``` + +## 智能体介绍 + +### 数据处理智能体 + +负责与 Data-Juicer 交互,执行实际的数据处理任务。支持从自然语言描述自动推荐算子、生成配置并执行。 + +**典型用途:** +- **数据清洗**:去重、移除低质量样本、格式标准化 +- **多模态处理**:同时处理文本、图像、视频数据 +- **批量转换**:格式转换、数据增强、特征提取 + +
+查看完整示例日志(from AgentScope Studio) + +
+ +### 代码开发智能体 + +辅助开发自定义数据处理算子,默认使用 `qwen3-coder-480b-a35b-instruct` 模型驱动。 + +**典型用途:** +- **开发领域特定的过滤或转换算子** +- **集成自有的数据处理逻辑** +- **为特定场景扩展 Data-Juicer 能力** + +
+查看完整示例日志(from AgentScope Studio) + +
+ +## 高级功能 + +### 算子检索 + +DJ 智能体实现了一个智能算子检索工具,通过独立的 LLM 查询环节从 Data-Juicer 的近200个算子中快速找到最相关的算子。这是数据处理智能体和代码开发智能体能够准确运行的关键组件。 + +我们提供了三种检索模式,可根据不同场景选用: + +#### 检索模式 + +**LLM 检索 (默认)** +- 使用 Qwen-Turbo 模型匹配最相关算子 +- 提供详细的匹配理由和相关性评分 +- 适合需要高精度匹配的场景,但消耗更多 Token + +**向量检索 (vector)** +- 基于 DashScope 文本嵌入和 FAISS 相似度搜索 +- 快速且高效,适合大规模检索场景 + +**自动模式 (auto)** +- 优先尝试 LLM 检索,失败时自动降级到向量检索 + +#### 使用 + +通过 `-r` 或 `--retrieve_mode` 参数指定检索模式: + +```bash +python main.py --retrieve_mode vector +``` + +更多参数说明见 `python main.py --help` + +### MCP 智能体 + +Data-Juicer 提供了 MCP (Model Context Protocol) 服务,可直接通过原生接口获取算子信息、执行数据处理,易于迁移和集成,无需单独的 LLM 查询和命令行调用。 + +#### MCP 服务器类型 + +Data-Juicer 提供两种 MCP 服务器模式: + +**Recipe-Flow(数据菜谱)** +- 根据算子类型和标签进行筛选 +- 支持将多个算子组合成数据菜谱运行 + +**Granular-Operators(细粒度算子)** +- 将每个算子作为独立工具提供 +- 通过环境变量灵活指定算子列表 +- 构建完全定制化的数据处理管道 + +详细信息请参考:[Data-Juicer MCP 服务文档](https://modelscope.github.io/data-juicer/en/main/docs/DJ_service.html#mcp-server) + +> **注意**:Data-Juicer MCP 服务器目前处于早期开发阶段,功能和工具可能会随着持续开发而变化。 + +#### 配置 + +在 `configs/mcp_config.json` 中配置服务地址: + +```json +{ + "mcpServers": { + "DJ_recipe_flow": { + "url": "http://127.0.0.1:8080/sse" + } + } +} +``` + +#### 使用方法 + +启用 MCP 智能体替代 DJ 智能体: + +```bash +# 启用 MCP 智能体和开发智能体 +python main.py --available_agents [dj_mcp, dj_dev] + +# 或使用简写 +python main.py -a [dj_mcp, dj_dev] +``` + + +## 功能预览 + +Data-Juicer 智能体生态系统正在快速扩展,以下是当前正在开发或计划中的新智能体: + +### Data-Juicer 问答智能体 (演示可用) + +为用户提供关于 Data-Juicer 算子、概念和最佳实践的详细解答。 + + + +### 数据分析与可视化智能体 (开发中) + +生成数据分析和可视化结果,预计近期发布。 + +### 常见问题 + +**Q: 如何获取 DashScope API 密钥?** +A: 访问 [DashScope 官网](https://dashscope.aliyun.com/) 注册账号并申请 API 密钥。 + +**Q: 为什么算子检索失败?** +A: 请检查网络连接和 API 密钥配置,或尝试切换到向量检索模式。 + +**Q: 如何调试自定义算子?** +A: 确保 Data-Juicer 路径配置正确,并查看代码开发智能体提供的示例代码。 + +**Q: MCP 服务连接失败怎么办?** +A: 检查 MCP 服务器是否正在运行,确认配置文件中的 URL 地址正确。 + +### 优化建议 + +- 对于大规模数据处理,建议使用DataJuicer提供的分布式模式 +- 合理设置批处理大小以平衡内存使用和处理速度 +- 更多进阶数据处理(合成、Data-Model 
Co-Development)等特性能力请参考DataJuicer[文档页](https://modelscope.github.io/data-juicer/zh_CN/main/index_ZH) + + +--- + +**贡献指南**:欢迎提交 Issue 和 Pull Request 来改进agentscope、DataJuicer Agent及[DataJuicer](https://modelscope.github.io/data-juicer/zh_CN/main/index_ZH#id4)。如果您在使用过程中遇到问题或有功能建议,请随时联系我们。 diff --git a/data_juicer_agent/agent_factory.py b/data_juicer_agent/agent_factory.py new file mode 100644 index 0000000..830bfed --- /dev/null +++ b/data_juicer_agent/agent_factory.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" +Agent Factory + +Factory functions for creating and configuring agents with standardized toolkits. +""" + +import os +from typing import Optional +from agentscope.agent import ReActAgent +from agentscope.tool import Toolkit +from agentscope.formatter import FormatterBase, OpenAIChatFormatter +from agentscope.model import ChatModelBase, OpenAIChatModel +from agentscope.memory import InMemoryMemory, MemoryBase + + +# Default configurations +DEFAULT_MODEL_CONFIG = { + "model_name": "gpt-4o", + "stream": False, +} + + +def get_default_model() -> OpenAIChatModel: + """Create default OpenAI model instance.""" + api_key = os.environ.get("OPENAI_API_KEY") + if not api_key: + raise ValueError("OPENAI_API_KEY environment variable is required") + + return OpenAIChatModel(api_key=api_key, **DEFAULT_MODEL_CONFIG) + + +def create_agent( + name: str, + sys_prompt: str, + toolkit: Toolkit, + description: Optional[str] = None, + model: Optional[ChatModelBase] = None, + formatter: Optional[FormatterBase] = None, + memory: Optional[MemoryBase] = None, + max_iters: int = 10, + parallel_tool_calls: bool = False, + **kwargs, +) -> ReActAgent: + """ + Create a ReActAgent with standardized configuration. 
+ + Args: + name: Agent identifier + sys_prompt: System prompt template (supports {name} placeholder) + toolkit: Toolkit instance + model: Language model (defaults to GPT-4o) + formatter: Message formatter (defaults to OpenAIChatFormatter) + memory: Memory instance (defaults to InMemoryMemory) + max_iters: Maximum reasoning iterations + parallel_tool_calls: Enable parallel tool execution + **kwargs: Additional ReActAgent arguments + + Returns: + Configured ReActAgent instance + + Example: + >>> agent = create_agent( + ... name="sql_expert", + ... sys_prompt="You are {name}, a SQL database expert", + ... tools=sql_tools + ... ) + """ + # Set defaults + if model is None: + model = get_default_model() + if formatter is None: + formatter = OpenAIChatFormatter() + if memory is None: + memory = InMemoryMemory() + + # Create agent + agent = ReActAgent( + name=name, + sys_prompt=sys_prompt.format(name=name), + model=model, + formatter=formatter, + toolkit=toolkit, + memory=memory, + max_iters=max_iters, + parallel_tool_calls=parallel_tool_calls, + **kwargs, + ) + + agent.__doc__ = description + + return agent diff --git a/data_juicer_agent/assets/dj_agent_image.png b/data_juicer_agent/assets/dj_agent_image.png new file mode 100644 index 0000000..d8fb2d0 Binary files /dev/null and b/data_juicer_agent/assets/dj_agent_image.png differ diff --git a/data_juicer_agent/assets/dj_dev_agent_image.png b/data_juicer_agent/assets/dj_dev_agent_image.png new file mode 100644 index 0000000..48ea8c6 Binary files /dev/null and b/data_juicer_agent/assets/dj_dev_agent_image.png differ diff --git a/data_juicer_agent/configs/mcp_config.json b/data_juicer_agent/configs/mcp_config.json new file mode 100644 index 0000000..f8cdb62 --- /dev/null +++ b/data_juicer_agent/configs/mcp_config.json @@ -0,0 +1,7 @@ +{ + "mcpServers": { + "DJ_recipe_flow": { + "url": "http://127.0.0.1:8080/sse" + } + } +} \ No newline at end of file diff --git a/data_juicer_agent/data/demo-dataset-images.jsonl 
b/data_juicer_agent/data/demo-dataset-images.jsonl new file mode 100644 index 0000000..c91f716 --- /dev/null +++ b/data_juicer_agent/data/demo-dataset-images.jsonl @@ -0,0 +1,3 @@ +{"images":["./images/img1.png"], "text": "<__dj__image> A comfortable bed."} +{"images":["./images/img2.jpg"], "text": "<__dj__image> A bus."} +{"images":["./images/img3.jpg"], "text": "<__dj__image> Black and white photograph of a woman holding an umbrella."} diff --git a/data_juicer_agent/data/images/img1.png b/data_juicer_agent/data/images/img1.png new file mode 100644 index 0000000..8d9e70b Binary files /dev/null and b/data_juicer_agent/data/images/img1.png differ diff --git a/data_juicer_agent/data/images/img2.jpg b/data_juicer_agent/data/images/img2.jpg new file mode 100644 index 0000000..8595513 Binary files /dev/null and b/data_juicer_agent/data/images/img2.jpg differ diff --git a/data_juicer_agent/data/images/img3.jpg b/data_juicer_agent/data/images/img3.jpg new file mode 100644 index 0000000..e0de8b1 Binary files /dev/null and b/data_juicer_agent/data/images/img3.jpg differ diff --git a/data_juicer_agent/main.py b/data_juicer_agent/main.py new file mode 100644 index 0000000..acdcf90 --- /dev/null +++ b/data_juicer_agent/main.py @@ -0,0 +1,154 @@ +# -*- coding: utf-8 -*- +import os +import fire +from typing import List + +from agentscope.model import DashScopeChatModel +from agentscope.formatter import DashScopeChatFormatter +from agentscope.memory import InMemoryMemory +from agentscope.agent import UserAgent +from agentscope.tool import Toolkit + +from agent_factory import create_agent +from prompts import DJ_SYS_PROMPT, DJ_DEV_SYS_PROMPT, ROUTER_SYS_PROMPT, MCP_SYS_PROMPT +from tools import dj_toolkit, dj_dev_toolkit, mcp_tools, get_mcp_toolkit, agents2toolkit + +# Create shared configuration +model = DashScopeChatModel( + model_name="qwen-max", + api_key=os.environ["DASHSCOPE_API_KEY"], + stream=True, + enable_thinking=False, +) + +dev_model = DashScopeChatModel( + 
model_name="qwen3-coder-480b-a35b-instruct", + api_key=os.environ["DASHSCOPE_API_KEY"], + stream=True, + enable_thinking=False, +) + +formatter = DashScopeChatFormatter() +memory = InMemoryMemory() + +user = UserAgent("User") + + +async def main( + use_studio: bool = False, + available_agents: List[str] = ["dj", "dj_dev"], + retrieval_mode: str = "auto", +): + """ + Main function for running the agent. + + :param use_studio: Whether to use agentscope studio. + :param available_agents: List of available agents. Options: dj, dj_dev, dj_mcp + :param retrieval_mode: Retrieval mode for operators. Options: auto, vector, llm + """ + + if "dj" in available_agents: + # Set global retrieval mode for tools to use + os.environ["RETRIEVAL_MODE"] = retrieval_mode + print(f"Using retrieval mode: {retrieval_mode}") + + agents = [] + for agent_name in available_agents: + if agent_name == "dj": + # Create agents using unified create_agent function + dj_agent = create_agent( + "datajuicer_agent", + DJ_SYS_PROMPT, + dj_toolkit, + ( + "A professional data preprocessing AI assistant with the following core capabilities: \n" + "Tool Matching \n" + "- Query and validate suitable DataJuicer operators; \n" + "Configuration Generation \n" + "- Create YAML configuration files and preview data; \n" + "Task Execution - Run data processing pipelines and output results" + ), + model, + formatter, + memory, + ) + agents.append(dj_agent) + + if agent_name == "dj_dev": + # DJ Development Agent for operator development + dj_dev_agent = create_agent( + "dj_dev_agent", + DJ_DEV_SYS_PROMPT, + dj_dev_toolkit, + ( + "An expert DataJuicer development assistant specializing in creating new DataJuicer operators. \n" + "Core capabilities: \n" + "Reference Retrieval - fetch base classes and examples; \n" + "Environment Configuration - handle DATA_JUICER_PATH setup. 
if user provides a DataJuicer path requiring setup/update, please call this agent;\n; " + "Code Generation - write complete, convention-compliant operator code" + ), + dev_model, + formatter, + memory, + ) + agents.append(dj_dev_agent) + + if agent_name == "dj_mcp": + mcp_toolkit, _ = await get_mcp_toolkit() + for tool in mcp_tools: + mcp_toolkit.register_tool_function(tool) + + mcp_agent = create_agent( + "mcp_datajuicer_agent", + MCP_SYS_PROMPT, + mcp_toolkit, + ( + "DataJuicer MCP Agent powered by Recipe Flow MCP server. \n" + "Core capabilities: \n" + "- Filter operators by tags/categories using MCP protocol; \n" + "- Real-time data processing pipeline execution. \n" + ), + model, + formatter, + memory, + ) + agents.append(mcp_agent) + + # Router agent - uses agents2tools to dynamically generate tools from all agents + router_agent = create_agent( + "Router", + ROUTER_SYS_PROMPT, + agents2toolkit(agents), + "A router agent that intelligently routes tasks to specialized DataJuicer agents", + model, + formatter, + InMemoryMemory(), # Router uses its own memory instance + ) + + if use_studio is True: + import agentscope + + agentscope.init( + studio_url="http://localhost:3000", + project="data_agent", + ) + + msg = None + while True: + msg = await user(msg) + if msg.get_text_content() == "exit": + break + # Router agent handles the entire task with automatic multi-step routing + msg = await router_agent(msg) + + +if __name__ == "__main__": + # Example tasks + # project_root = os.path.abspath(os.path.dirname(__file__)) + # task = f"数据存储在{project_root}/data/demo-dataset-images.jsonl,筛选掉样本中,文本字段长度小于5的样本,以及图片size小于100Kb的样本。并将输出结果保存到./outputs路径下。" + # + # DJ Development example task: + # task = "我想开发一个新的DataJuicer过滤算子,用于过滤掉没有人声的音频文件" + # + # MCP Agent will be automatically selected for advanced processing tasks + fire.Fire(main) diff --git a/data_juicer_agent/prompts.py b/data_juicer_agent/prompts.py new file mode 100644 index 0000000..6def4cf --- /dev/null +++ 
b/data_juicer_agent/prompts.py @@ -0,0 +1,135 @@ +# -*- coding: utf-8 -*- + +DJ_SYS_PROMPT = """ +You are an expert data preprocessing assistant named {name}, specializing in handling multimodal data including text, images, videos, and other AI model-related data. + +You will strictly follow these steps sequentially: + +- Data Preview (optional but recommended): + Before generating the YAML, you may first use `view_text_file` to inspect a small subset of the raw data (e.g., the first 5–10 samples) so that you can: + 1. Verify the exact field names and formats; + 2. Decide appropriate values such as `text_keys`, `image_key`, and the parameters of subsequent operators. + If the user requests or needs more specific data analysis, use `dj-analyzer` to analyze the data: + 1. After creating the configuration file according to the requirements, run it (see Step 2 for the configuration file creation method): + dj-analyze --config configs/your_analyzer.yaml + 2. you can also use auto mode to avoid writing a recipe. It will analyze a small part (e.g. 1000 samples, specified by argument `auto_num`) of your dataset with all Filters that produce stats. + dj-analyze --auto --dataset_path xx.jsonl [--auto_num 1000] + +Step 1: Tool Discovery and Matching + - First, use the `query_dj_operators` tool to get relevant DataJuicer operators based on the user's task description + - Analyze the retrieved operators and verify if they have exact functional matches with the input query + - If no suitable operators are found, immediately terminate the task + - If partially supported operators exist, skip incompatible parts and proceed + +Step 2: Generate Configuration File + - Create a YAML configuration containing global parameters and tool configurations. Save it to a YAML file with yaml dump api. + After successful file creation, inform the user of the file location. File save failure indicates task failure. + a. 
Global Parameters: + - project_name: Project name + - dataset_path: Real data path (never fabricate paths. Set to `None` if unknown) + - export_path: Output path (use default if unspecified) + - text_keys: Text field names to process + - image_key: Image field name to process + - np: Multiprocessing count + Keep other parameters as defaults. + + b. Operator Configuration: + - Use the operators retrieved from Step 1 to configure the 'process' field + - Ensure precise functional matching with user requirements + +Step 3: Execute Processing Task + Pre-execution checks: + - dataset_path: Must be a valid user-provided path and the path must exist + - process: Operator configuration list must exist + Terminate immediately if any check fails and explain why. + + If all pre-execution checks are valid, run: `dj-process --config ${{YAML_config_file}}` + +Mandatory Requirements: +- Never ask me questions. Make reasonable assumptions for non-critical parameters +- Only generate the reply after the task has finished running +- Always start by retrieving relevant operators using the query_dj_operators tool + +Configuration Template: +```yaml +# global parameters +project_name: {{your project name}} +dataset_path: {{path to your dataset directory or file}} +text_keys: {{text key to be processed}} +image_key: {{image key to be processed}} +np: {{number of subprocess to process your dataset}} +skip_op_error: false # must set to false + +export_path: {{single file path to save processed data, must be a jsonl file path not a folder}} + +# process schedule +# a list of several process operators with their arguments +process: + - image_shape_filter: + min_width: 100 + min_height: 100 + - text_length_filter: + min_len: 5 + max_len: 10000 + - ... +``` + +Available Tools: +Function definitions: +``` +{{index}}. 
{{function name}}: {{function description}} +{{argument1 name}} ({{argument type}}): {{argument description}} +{{argument2 name}} ({{argument type}}): {{argument description}} +``` + +""" + +DJ_DEV_SYS_PROMPT = """ +You are an expert DataJuicer operator development assistant named {name}, specializing in helping developers create new DataJuicer operators. + +Development Workflow: +1. Understand user requirements and identify operator type (filter, mapper, deduplicator, etc.) +2. Call `get_basic_files()` to get base_op classes and development guidelines +3. Call `get_operator_example(operator_type)` to get relevant examples +4. If previous tools report `DATA_JUICER_PATH` not configured, **STOP** and request user input with a clear message asking for the value of `DATA_JUICER_PATH` +5. Once the user provides `DATA_JUICER_PATH`, call `configure_data_juicer_path(data_juicer_path)` with the provided value + **Do not attempt to set or infer `DATA_JUICER_PATH` on your own** + +Critical Requirements: +- NEVER guess or fabricate file paths or configuration values +- Always call get_basic_files() and get_operator_example() before writing code +- Write complete, runnable code following DataJuicer conventions +- Focus on practical implementation +""" + +MCP_SYS_PROMPT = """You are {name}, an advanced DataJuicer MCP Agent powered by MCP server, specializing in handling multimodal data including text, images, videos, and other AI model-related data. + +Analyze user requirements and use the tools provided to you for data processing. + +Before data processing, you can also try: +- Use `view_text_file` to inspect a small subset of the raw data (e.g., the first 2~5 samples) in order to: + 1. Verify the exact field names and formats + 2. Determine appropriate parameter values such as text length ranges, language types, confidence thresholds, etc. + 3. 
Understand data characteristics to optimize operator parameter configuration +""" + +ROUTER_SYS_PROMPT = """ +You are an AI routing agent named {name}. Your primary responsibility is to analyze user queries and route them to the most appropriate specialized agent for handling. + +Key responsibilities: +1. Understand the user's intent and requirements +2. Select the most suitable agent from available options +3. Handle user input requests from routed agents properly + +When routing to an agent that requires user input: +- If the routed agent returns a response indicating that additional input or configuration is required for user confirmation or submission, you must: + 1. Stop the current routing process + 2. Present the agent's request to the user directly + 3. Wait for user's response before continuing + 4. Pass the user's input back to the appropriate agent + +- NEVER fabricate or guess user input values (like paths, configurations, etc.) +- Always ask the user for the required information when an agent needs it + +Available agents and their capabilities will be provided as tools in your toolkit. +""" \ No newline at end of file diff --git a/data_juicer_agent/pyproject.toml b/data_juicer_agent/pyproject.toml new file mode 100644 index 0000000..fd0659f --- /dev/null +++ b/data_juicer_agent/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "data-juicer-agent" +version = "0.1.0" +description = "A data processing agent with data juicer" +readme = "README.md" +requires-python = ">=3.11" +dependencies = [ + "agentscope>=1.0.5", + "faiss-cpu>=1.12.0", + "langchain-community", + "py-data-juicer>=1.4.2", +] diff --git a/data_juicer_agent/tools/__init__.py b/data_juicer_agent/tools/__init__.py new file mode 100644 index 0000000..ecf012a --- /dev/null +++ b/data_juicer_agent/tools/__init__.py @@ -0,0 +1,89 @@ +# -*- coding: utf-8 -*- +""" +Tools package for data-agent. 
    "all_toolkit",
of file diff --git a/data_juicer_agent/tools/dj_dev_tools.py b/data_juicer_agent/tools/dj_dev_tools.py new file mode 100644 index 0000000..b7681e8 --- /dev/null +++ b/data_juicer_agent/tools/dj_dev_tools.py @@ -0,0 +1,235 @@ +# -*- coding: utf-8 -*- +""" +DataJuicer Development Tools + +Tools for developing DataJuicer operators, including access to basic documentation +and example code for different operator types. +""" + +import os +from pathlib import Path +from agentscope.message import TextBlock +from agentscope.tool import ToolResponse + +# DataJuicer home path - should be configured based on your environment +DATA_JUICER_PATH = os.getenv("DATA_JUICER_PATH", None) + +BASIC_LIST_RELATIVE = [ + "data_juicer/ops/base_op.py", + "docs/DeveloperGuide.md", + "docs/DeveloperGuide_ZH.md", +] + + +def get_basic_files() -> ToolResponse: + """Get basic DataJuicer development files content. + + Returns the content of essential files needed for DJ operator development: + - base_op.py: Base operator class + - DeveloperGuide.md: English developer guide + - DeveloperGuide_ZH.md: Chinese developer guide + + Returns: + ToolResponse: Combined content of all basic development files + """ + global DATA_JUICER_PATH, BASIC_LIST_RELATIVE + if DATA_JUICER_PATH is None: + return ToolResponse( + content=[ + TextBlock( + type="text", + text="DATA_JUICER_PATH is not configured. 
                    combined_content += f"## {filename}\n\n"
Please ask the user to provide the DATA_JUICER_PATH", + ) + ] + ) + + try: + # Import retrieve_ops from op_manager + from .op_manager.op_retrieval import retrieve_ops + + # Query relevant operators using the requirement description + # Use retrieval mode from environment variable if set + retrieval_mode = os.environ.get("RETRIEVAL_MODE", "auto") + tool_names = await retrieve_ops(requirement_description, limit=limit, mode=retrieval_mode) + + if not tool_names: + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"No relevant operators found for requirement: {requirement_description}\n" + f"Please try with more specific keywords or check if DATA_JUICER_PATH is properly configured.", + ) + ] + ) + + combined_content = ( + f"# Dynamic Operator Examples for: {requirement_description}\n\n" + ) + combined_content += ( + f"Found {len(tool_names)} relevant operators (limit: {limit})\n\n" + ) + + # Process each found operator + for i, tool_name in enumerate(tool_names[:limit]): + combined_content += f"## {i+1}. 
{tool_name}\n\n" + + op_type = tool_name.split("_")[-1] + + operator_path = f"data_juicer/ops/{op_type}/{tool_name}.py" + + # Try to find operator source file + + full_path = os.path.join(DATA_JUICER_PATH, operator_path) + if os.path.exists(full_path): + with open(full_path, "r", encoding="utf-8") as f: + operator_code = f.read() + + combined_content += f"### Source Code\n" + combined_content += "```python\n" + combined_content += operator_code + combined_content += "\n```\n\n" + else: + combined_content += ( + f"**Note:** Source code file not found for `{tool_name}`.\n\n" + ) + + test_path = f"tests/ops/{op_type}/test_{tool_name}.py" + + full_test_path = os.path.join(DATA_JUICER_PATH, test_path) + if os.path.exists(full_test_path): + with open(full_test_path, "r", encoding="utf-8") as f: + test_code = f.read() + + combined_content += f"### Test Code\n" + combined_content += f"**File Path:** `{test_path}`\n\n" + combined_content += "```python\n" + combined_content += test_code + combined_content += "\n```\n\n" + + else: + combined_content += ( + f"**Note:** Test file not found for `{tool_name}`.\n\n" + ) + + combined_content += "---\n\n" + + return ToolResponse(content=[TextBlock(type="text", text=combined_content)]) + + except Exception as e: + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Error occurred while getting operator examples: {str(e)}\n" + f"Please check the requirement description and try again.", + ) + ] + ) + + +def configure_data_juicer_path(data_juicer_path: str) -> ToolResponse: + """Configure DataJuicer path. + If the user provides the data_juicer_path, please use this method to configure it. 
+ + Args: + data_juicer_path (str): Path to DataJuicer installation + + Returns: + ToolResponse: Configuration result + """ + global DATA_JUICER_PATH + + data_juicer_path = os.path.expanduser(data_juicer_path) + + try: + if not os.path.exists(data_juicer_path): + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Specified DataJuicer path does not exist: {data_juicer_path}", + ) + ] + ) + + # Update global DATA_JUICER_PATH + DATA_JUICER_PATH = data_juicer_path + + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"DataJuicer path has been updated to: {DATA_JUICER_PATH}", + ) + ] + ) + + except Exception as e: + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Error occurred while configuring DataJuicer path: {str(e)}", + ) + ] + ) diff --git a/data_juicer_agent/tools/dj_tools.py b/data_juicer_agent/tools/dj_tools.py new file mode 100644 index 0000000..9cd5e76 --- /dev/null +++ b/data_juicer_agent/tools/dj_tools.py @@ -0,0 +1,224 @@ +import os +import os.path as osp +import json +import asyncio +from typing import Any +from agentscope.message import TextBlock +from agentscope.tool import ToolResponse +from .op_manager.op_retrieval import retrieve_ops + +# Load tool information for formatting +TOOLS_INFO_PATH = osp.join(osp.dirname(__file__), "op_manager", "dj_funcs_all.json") + +def _load_tools_info(): + """Load tools information from JSON file or create it if not exists""" + if osp.exists(TOOLS_INFO_PATH): + with open(TOOLS_INFO_PATH, "r", encoding="utf-8") as f: + return json.loads(f.read()) + else: + from .op_manager.create_dj_func_info import dj_func_info + with open(TOOLS_INFO_PATH, "w", encoding="utf-8") as f: + json.dump(dj_func_info, f) + return dj_func_info + +def _format_tool_names_to_class_entries(tool_names): + """Convert tool names list to formatted class entries string""" + if not tool_names: + return "" + + tools_info = _load_tools_info() + + # Create a mapping from class_name to tool info for 
quick lookup + tools_map = {tool['class_name']: tool for tool in tools_info} + + formatted_entries = [] + for i, tool_name in enumerate(tool_names): + if tool_name in tools_map: + tool_info = tools_map[tool_name] + class_entry = f"{i+1}. {tool_info['class_name']}: {tool_info['class_desc']}" + class_entry += "\n" + tool_info["arguments"] + formatted_entries.append(class_entry) + + return "\n".join(formatted_entries) + +async def query_dj_operators(query: str, limit: int = 20) -> ToolResponse: + """Query DataJuicer operators by natural language description. + + Retrieves relevant operators from DataJuicer library based on user query. + Supports matching by functionality, data type, and processing scenarios. + + Args: + query (str): Natural language operator query + limit (int): Maximum number of operators to return (default: 20) + + Returns: + ToolResponse: Tool response containing matched operators with names, descriptions, and parameters + """ + + try: + # Retrieve operator names using existing functionality with limit + # Use retrieval mode from environment variable if set + retrieval_mode = os.environ.get("RETRIEVAL_MODE", "auto") + tool_names = await retrieve_ops(query, limit=limit, mode=retrieval_mode) + + if not tool_names: + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"No matching DataJuicer operators found for query: {query}\n" + f"Suggestions:\n" + f"1. Use more specific keywords like 'text filter', 'image processing'\n" + f"2. Check spelling and try alternative terms\n" + f"3. 
Try English keywords for better matching", + ) + ], + ) + + # Format tool names to class entries + retrieved_operators = _format_tool_names_to_class_entries(tool_names) + + # Format response + result_text = f"🔍 DataJuicer Operator Query Results\n" + result_text += f"Query: {query}\n" + result_text += f"Limit: {limit} operators\n" + result_text += f"{'='*50}\n\n" + result_text += retrieved_operators + + return ToolResponse( + content=[ + TextBlock( + type="text", + text=result_text, + ) + ], + ) + + except Exception as e: + return ToolResponse( + content=[ + TextBlock( + type="text", + text=f"Error querying DataJuicer operators: {str(e)}\n" + f"Please verify query parameters and retry.", + ) + ], + ) + + +async def execute_safe_command( + command: str, + timeout: int = 300, + **kwargs: Any, +) -> ToolResponse: + """Execute safe commands including DataJuicer commands and other safe system commands. + Returns the return code, standard output and error within , + and tags. + + Args: + command (`str`): + The command to execute. Allowed commands include: + - DataJuicer commands: dj-process, dj-analyze + - File system commands: mkdir, ls, pwd, cat, echo, cp, mv, rm + - Text processing: grep, head, tail, wc, sort, uniq + - Archive commands: tar, zip, unzip + - Other safe commands: which, whoami, date, find + timeout (`float`, defaults to `300`): + The maximum time (in seconds) allowed for the command to run. + + Returns: + `ToolResponse`: + The tool response containing the return code, standard output, and + standard error of the executed command. 
    try:
        # communicate() drains stdout/stderr while waiting, avoiding a deadlock
        # when the subprocess output exceeds the OS pipe buffer.
        stdout, stderr = await asyncio.wait_for(proc.communicate(), timeout=timeout)
+ ) + returncode = -1 + try: + proc.terminate() + stdout, stderr = await proc.communicate() + stdout_str = stdout.decode("utf-8") + stderr_str = stderr.decode("utf-8") + if stderr_str: + stderr_str += f"\n{stderr_suffix}" + else: + stderr_str = stderr_suffix + except ProcessLookupError: + stdout_str = "" + stderr_str = stderr_suffix + + return ToolResponse( + content=[ + TextBlock( + type="text", + text=( + f"{returncode}" + f"{stdout_str}" + f"{stderr_str}" + ), + ), + ], + ) \ No newline at end of file diff --git a/data_juicer_agent/tools/mcp_tools.py b/data_juicer_agent/tools/mcp_tools.py new file mode 100644 index 0000000..c441134 --- /dev/null +++ b/data_juicer_agent/tools/mcp_tools.py @@ -0,0 +1,120 @@ +import json +import os +import logging +from typing import Optional, List +import string + +from agentscope.tool import Toolkit +from agentscope.mcp import HttpStatefulClient, HttpStatelessClient, StdIOStatefulClient + +# Configure logging +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +root_path = os.path.abspath(os.path.dirname(os.path.dirname(__file__))) + +def _load_config(config_path: str) -> dict: + """Load MCP configuration from file""" + try: + if os.path.exists(config_path): + with open(config_path, "r", encoding="utf-8") as f: + config = json.load(f) + logger.info(f"Loaded MCP configuration from {config_path}") + return config + else: + logger.warning( + f"Configuration file {config_path} not found, using default settings" + ) + return _create_default_config() + except Exception as e: + logger.error(f"Error loading configuration: {e}") + return _create_default_config() + +def _create_default_config() -> dict: + """Create default configuration""" + return { + "mcpServers": { + "dj_recipe_flow": { + "command": "python", + "args": ["/home/test/data_juicer/tools/DJ_mcp_recipe_flow.py"], + "env": {"SERVER_TRANSPORT": "stdio"}, + } + } + } + +def _expand_env_vars(value: str) -> str: + """Expand environment variables in 
async def get_mcp_toolkit(config_path: Optional[str] = None) -> tuple:
    """Get the toolkit with all MCP tools registered, plus the connected clients."""
_load_config(config_path) + toolkit = Toolkit() + + clients = await _create_clients(config, toolkit) + + return toolkit, clients diff --git a/data_juicer_agent/tools/op_manager/__init__.py b/data_juicer_agent/tools/op_manager/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/data_juicer_agent/tools/op_manager/create_dj_func_info.py b/data_juicer_agent/tools/op_manager/create_dj_func_info.py new file mode 100644 index 0000000..d7abc3a --- /dev/null +++ b/data_juicer_agent/tools/op_manager/create_dj_func_info.py @@ -0,0 +1,34 @@ +import inspect +from data_juicer.tools.op_search import OPSearcher + +searcher = OPSearcher(include_formatter=False) + +all_ops = searcher.search() + +dj_func_info = [] +for i, op in enumerate(all_ops): + class_entry = {"index": i, "class_name": op["name"], "class_desc": op["desc"]} + param_desc = op["param_desc"] + param_desc_map = {} + args = "" + for item in param_desc.split(":param"): + _item = item.split(":") + if len(_item) < 2: + continue + param_desc_map[_item[0].strip()] = ":".join(_item[1:]).strip() + + if op["sig"]: + for param_name, param in op["sig"].parameters.items(): + if param_name in ["self", "args", "kwargs"]: + continue + if param.kind in ( + inspect.Parameter.VAR_POSITIONAL, + inspect.Parameter.VAR_KEYWORD, + ): + continue + if param_name in param_desc_map: + args += f" {param_name} ({param.annotation}): {param_desc_map[param_name]}\n" + else: + args += f" {param_name} ({param.annotation})\n" + class_entry["arguments"] = args + dj_func_info.append(class_entry) diff --git a/data_juicer_agent/tools/op_manager/dj_funcs_all.json b/data_juicer_agent/tools/op_manager/dj_funcs_all.json new file mode 100644 index 0000000..44f06f1 --- /dev/null +++ b/data_juicer_agent/tools/op_manager/dj_funcs_all.json @@ -0,0 +1 @@ +[{"index": 0, "class_name": "nested_aggregator", "class_desc": "Aggregates nested content from multiple samples into a single summary.\n\n This operator uses a recursive summarization approach to 
aggregate content from multiple\n samples. It processes the input text, which is split into sub-documents, and generates a\n summary that maintains the average length of the original documents. The aggregation is\n performed using an API model, guided by system prompts and templates. The operator\n supports retrying the API call in case of errors and allows for customization of the\n summarization process through various parameters. The default system prompt and\n templates are provided in Chinese, but they can be customized. The operator uses a\n Hugging Face tokenizer to handle tokenization.", "arguments": " api_model (): API model name.\n input_key (): The input key in the meta field of the samples.\n It is \"event_description\" in default.\n output_key (): The output key in the aggregation field in the\n samples. It is same as the input_key in default.\n max_token_num (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The max token num of the total tokens of the\n sub documents. Without limitation if it is None.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): The system prompt.\n sub_doc_template (typing.Optional[str]): The template for input text in each sample.\n input_template (typing.Optional[str]): The input template.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 1, "class_name": "entity_attribute_aggregator", "class_desc": "Summarizes a given attribute of an entity from a set of documents.\n\n The operator extracts and summarizes the specified attribute of a given entity from the\n 
provided documents. It uses a system prompt, example prompt, and input template to\n generate the summary. The output is formatted as a markdown-style summary with the\n entity and attribute clearly labeled. The summary is limited to a specified number of\n words (default is 100). The operator uses a Hugging Face tokenizer to handle token\n limits and splits documents if necessary. If the input key or required fields are\n missing, the operator logs a warning and returns the sample unchanged. The summary is\n stored in the batch metadata under the specified output key. The system prompt, input\n template, example prompt, and output pattern can be customized.", "arguments": " api_model (): API model name.\n entity (): The given entity.\n attribute (): The given attribute.\n input_key (): The input key in the meta field of the samples.\n It is \"event_description\" in default.\n output_key (): The output key in the aggregation field of the\n samples. It is \"entity_attribute\" in default.\n word_limit (typing.Annotated[int, Gt(gt=0)]): Prompt the output length.\n max_token_num (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The max token num of the total tokens of the\n sub documents. 
Without limitation if it is None.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt_template (typing.Optional[str]): The system prompt template.\n example_prompt (typing.Optional[str]): The example part in the system prompt.\n input_template (typing.Optional[str]): The input template.\n output_pattern_template (typing.Optional[str]): The output template.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 2, "class_name": "meta_tags_aggregator", "class_desc": "Merge similar meta tags into a single, unified tag.\n\n This operator aggregates and consolidates similar meta tags from the input data. It can\n handle two scenarios:\n - When a set of target tags is provided, it maps the original tags to these predefined\n categories. If a \"miscellaneous\" or \"other\" category is included, any tags that do not\n fit into the specified categories are grouped under this label.\n - When no target tags are provided, it generates reasonable categories based on the\n similarity and frequency of the input tags.\n\n The operator uses a language model (default: gpt-4o) to analyze and merge the tags. The\n system prompt, input template, and output pattern can be customized. 
The aggregated tags\n are then updated in the input sample's metadata.", "arguments": " api_model (): API model name.\n meta_tag_key (): The key of the meta tag to be mapped.\n target_tags (typing.Optional[typing.List[str]]): The tags that is supposed to be mapped to.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): The system prompt.\n input_template (typing.Optional[str]): The input template.\n target_tag_template (typing.Optional[str]): The tap template for target tags.\n tag_template (typing.Optional[str]): The tap template for each tag and its\n frequency.\n output_pattern (typing.Optional[str]): The output pattern.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 3, "class_name": "most_relevant_entities_aggregator", "class_desc": "Extracts and ranks entities closely related to a given entity from provided texts.\n\n The operator uses a language model API to identify and rank entities, filtering out\n entities of the same type as the given entity. The ranked list is sorted in descending\n order of importance. Input texts are aggregated and passed to the model, with an\n optional token limit. The output is parsed using a regular expression to extract the\n relevant entities. Results are stored in the batch metadata under the key\n 'most_relevant_entities'. The operator retries the API call up to a specified number of\n times in case of errors. 
The system prompt, input template, and output pattern can be\n customized.", "arguments": " api_model (): API model name.\n entity (): The given entity.\n query_entity_type (): The type of queried relevant entities.\n input_key (): The input key in the meta field of the samples.\n It is \"event_description\" in default.\n output_key (): The output key in the aggregation field of the\n samples. It is \"most_relevant_entities\" in default.\n max_token_num (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The max token num of the total tokens of the\n sub documents. Without limitation if it is None.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt_template (typing.Optional[str]): The system prompt template.\n input_template (typing.Optional[str]): The input template.\n output_pattern (typing.Optional[str]): The output pattern.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 4, "class_name": "document_deduplicator", "class_desc": "Deduplicates samples at the document level using exact matching.\n\n This operator computes an MD5 hash for each sample's text. It can optionally convert the\n text to lowercase and ignore non-alphabet characters, including whitespaces, digits, and\n punctuation. The deduplication is based on the computed hash values, where samples with\n identical hashes are considered duplicates. The `compute_hash` method adds a 'hash' key\n to each sample, storing its MD5 hash. During processing, the first occurrence of each\n unique hash is kept, and subsequent duplicates are filtered out. 
If the `show_num`\n parameter is set, the operator also returns a specified number of duplicate pairs for\n inspection.", "arguments": " lowercase (): Whether to convert sample text to lower case\n ignore_non_character (): Whether to ignore non-alphabet\n characters, including whitespaces, digits, and punctuations\n"}, {"index": 5, "class_name": "document_minhash_deduplicator", "class_desc": "Deduplicates samples at the document level using MinHash LSH.\n\n This operator computes MinHash values for each sample and uses Locality-Sensitive\n Hashing (LSH) to identify and remove near-duplicate documents. The Jaccard similarity\n threshold determines when two documents are considered duplicates. The tokenization\n method can be customized, and a Hugging Face tokenizer can be used for 'sentencepiece'\n tokenization. The minhash values are stored as bytes and are not kept in the final\n dataset. The number of bands and rows per band in LSH can be set manually or determined\n by an optimal parameter computation algorithm. Important notes:\n - If using 'punctuation' tokenization with an ignore pattern, ensure the pattern does\n not include punctuations.\n - For 'sentencepiece' tokenization, a tokenizer model path is required.\n - The deduplication process involves clustering and filtering, and only unique samples\n or the first sample in a cluster are retained.", "arguments": " tokenization (): tokenization method for sample texts. It\n should be one of [space, punctuation, character,\n sentencepiece]. For English-like languages, we recommend\n to use 'space', for Chinese-like languages, we recommend\n to use 'character', and for multiple languages, we recommend\n to use 'sentencepiece'. 
If using 'sentencepiece', please\n provided the model path in the 'tokenizer_model' field.\n window_size (typing.Annotated[int, Gt(gt=0)]): window size of shingling\n lowercase (): whether to convert text to lower case first\n ignore_pattern (typing.Optional[str]): whether to ignore sub-strings with\n specific pattern when computing minhash\n num_permutations (typing.Annotated[int, Gt(gt=0)]): number of permutations in minhash\n computing\n jaccard_threshold (typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]): the min jaccard similarity threshold\n in near-duplicate detection. When the jaccard similarity of\n two sample texts is >= this threshold, they are regarded as\n similar samples and this op will only keep one of them after\n deduplication\n num_bands (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): number of bands in LSH. Default it's None, and\n it will be determined by an optimal params computation\n algorithm by minimize the weighted sum of probs of False\n Positives and False Negatives\n num_rows_per_band (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): number of rows in each band in LSH.\n Default it's None, and it will be determined by an optimal\n params computation algorithm\n tokenizer_model (typing.Optional[str]): path for the sentencepiece model, used for\n sentencepiece tokenization.\n"}, {"index": 6, "class_name": "document_simhash_deduplicator", "class_desc": "Deduplicates samples at the document level using SimHash.\n\n This operator computes SimHash values for each sample and removes duplicates based on a\n specified Hamming distance threshold. It supports different tokenization methods:\n 'space', 'punctuation', and 'character'. The SimHash is computed over shingles of a\n given window size, and the deduplication process clusters similar documents and retains\n only one from each cluster. The default mode converts text to lowercase and can ignore\n specific patterns. 
The key metric, Hamming distance, is used to determine similarity\n between SimHash values. Important notes:\n - The `ignore_pattern` parameter can be used to exclude certain substrings during\n SimHash computation.\n - For punctuation-based tokenization, the `ignore_pattern` should not include\n punctuations to avoid conflicts.\n - The `hamming_distance` must be less than the number of blocks (`num_blocks`).\n - Only the first sample in each cluster is retained by default.", "arguments": " tokenization (): tokenization method for\n sample texts.\n\n It should be one of [space, punctuation, character]. For\n English-like languages, we recommend to use 'space'. And for\n Chinese-like languages, we recommend to use 'character'\n window_size (typing.Annotated[int, Gt(gt=0)]): window size of shingling\n lowercase (): whether to convert text to lower case first\n ignore_pattern (typing.Optional[str]): whether to ignore sub-strings with\n specific pattern when computing simhash\n num_blocks (typing.Annotated[int, Gt(gt=0)]): number of blocks in simhash computing\n hamming_distance (typing.Annotated[int, Gt(gt=0)]): the max hamming distance threshold in\n near-duplicate detection. When the hamming distance of two\n sample texts is <= this threshold, they are regarded as\n similar samples and this op will only keep one of them after\n deduplication. This threshold should be always less than\n num_blocks\n"}, {"index": 7, "class_name": "general_fused_op", "class_desc": "An explicitly fused operator designed to execute multiple sequential operations (OPs) on\n the same batch, enabling fine-grained control over data processing.\n\n This operator allows for the chaining of multiple data processing steps, such as mappers\n and filters, into a single pass. It processes each batch of samples sequentially through\n the defined operations, ensuring that all specified transformations are applied in\n order. 
The operator supports both mappers, which transform data, and filters, which\n remove or keep samples based on computed statistics. Context variables can be passed\n between operations if needed. The accelerator is set to 'cuda' if any of the fused\n operations use it. The number of processes is determined by the minimum value among all\n fused operations. After processing, any temporary context variables, such as those used\n for video containers, are cleaned up.", "arguments": " batch_size (): the batch size of the input samples.\n fused_op_list (typing.Optional[typing.List]): a list of OPs to be fused.\n"}, {"index": 8, "class_name": "image_deduplicator", "class_desc": "Deduplicates samples at the document level by exact matching of images.\n\n This operator compares images across documents to identify and remove duplicates.\n - It uses a specified hash method (default is 'phash') to compute image hashes.\n - If `consider_text` is set, it also considers text content for deduplication,\n using a text deduplicator in conjunction with the image hashes.\n - The key metric, `imagehash`, is computed for each sample. If `consider_text`\n is enabled, an additional `hash` field is used.\n - Duplicates are identified by comparing these hash values. Samples with\n identical hashes are considered duplicates.\n - When `show_num` is greater than 0, the operator also returns a subset of\n duplicate pairs for tracing purposes.\n - The operator caches the `imagehash` and, if applicable, the `hash` fields.", "arguments": " method (): hash method for image\n consider_text (): whether to consider text hash together with image\n hash when applying deduplication.\n"}, {"index": 9, "class_name": "ray_bts_minhash_deduplicator", "class_desc": "A MinhashLSH deduplicator that operates in Ray distributed mode.\n\n This operator uses the MinHash LSH technique to identify and remove near-duplicate\n samples from a dataset. 
It supports various tokenization methods, including space,\n punctuation, character, and sentencepiece. The Jaccard similarity threshold is used to\n determine if two samples are considered duplicates. If the Jaccard similarity of two\n samples is greater than or equal to the specified threshold, one of the samples is\n filtered out. The operator computes the MinHash values for each sample and uses a union-\n find algorithm to group similar samples. The key metric, Jaccard similarity, is computed\n based on the shingling of the text. The operator can run on both CPU and GPU, with\n specific batch size and memory configurations for each.", "arguments": " tokenization (): tokenization method for sample texts. It\n should be one of [space, punctuation, character,\n sentencepiece]. For English-like languages, we recommend\n to use 'space', for Chinese-like languages, we recommend\n to use 'character', and for multiple languages, we recommend\n to use 'sentencepiece'. If using 'sentencepiece', please\n provided the model path in the 'tokenizer_model' field.\n window_size (typing.Annotated[int, Gt(gt=0)]): window size of shingling\n lowercase (): whether to convert text to lower case first\n ignore_pattern (typing.Optional[str]): whether to ignore sub-strings with\n specific pattern when computing minhash\n num_permutations (typing.Annotated[int, Gt(gt=0)]): number of permutations in minhash\n computing\n jaccard_threshold (typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]): the min jaccard similarity threshold\n in near-duplicate detection. When the jaccard similarity of\n two sample texts is >= this threshold, they are regarded as\n similar samples and this op will only keep one of them after\n deduplication\n num_bands (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): number of bands in LSH. 
Default it's None, and\n it will be determined by an optimal params computation\n algorithm by minimize the weighted sum of probs of False\n Positives and False Negatives\n num_rows_per_band (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): number of rows in each band in LSH.\n Default it's None, and it will be determined by an optimal\n params computation algorithm\n tokenizer_model (typing.Optional[str]): path for the sentencepiece model, used for\n sentencepiece tokenization.\n union_find_parallel_num (typing.Union[int, str]): number of parallel workers for\n union-find algorithm. Default it's 'auto', and it will be\n determined by half of the number of CPUs.\n union_threshold (typing.Optional[int]): threshold for minhash values group to\n perform union-find algorithm. Default it's 256.\n max_pending_edge_buffer_task (typing.Optional[int]): max number of pending edge buffer\n ray tasks. Default it's 20.\n num_edge_buffer_task_returns (typing.Optional[int]): number of edge buffer tasks for\n `ray.wait` to return. Default it's 10.\n max_pending_filter_tasks (typing.Optional[int]): max number of pending filter ray\n tasks. Default it's 20.\n num_filter_task_returns (typing.Optional[int]): number of filter tasks for `ray.wait`\n to return. Default it's 10.\n merge_batch_size (typing.Optional[int]): batch size for BTS operations. Default\n it's 1000.\n minhash_batch_size (typing.Union[int, str, NoneType]): batch size for MinHash computation. 
If \"auto\",\n it will be set to default value on CPU(1024), or auto calculated per\n available GPU memory and memory_per_sample setting for GPU.\n memory_per_sample (typing.Optional[float]): estimated memory needed per sample in MB.\n Used to calculate batch size based on available GPU memory.\n Default is 0.1 MB per sample.\n"}, {"index": 10, "class_name": "ray_document_deduplicator", "class_desc": "Deduplicates samples at the document level using exact matching in Ray distributed mode.\n\n This operator computes a hash for each document and filters out duplicates based on\n exact matches. The hash is calculated from the text content, which can be optionally\n converted to lowercase and stripped of non-alphabet characters. The key metric used for\n deduplication is the MD5 hash of the processed text. If the `lowercase` parameter is\n set, the text is converted to lowercase before hashing. If `ignore_non_character` is\n enabled, all non-alphabet characters, including whitespaces, digits, and punctuation,\n are removed. The operator supports two backends: 'ray_actor' and 'redis', with the\n default being 'ray_actor'.", "arguments": " backend (): the backend for dedup, either 'ray_actor' or 'redis'\n redis_address (): the address of redis server\n lowercase (): Whether to convert sample text to lower case\n ignore_non_character (): Whether to ignore non-alphabet\n characters, including whitespaces, digits, and punctuations\n"}, {"index": 11, "class_name": "ray_image_deduplicator", "class_desc": "Deduplicates samples at the document level using exact matching of images in Ray distributed mode.\n\n This operator uses a specified hash method to compute image hashes and identifies\n duplicates by comparing these hashes. It operates in Ray distributed mode, supporting\n 'ray_actor' or 'redis' backends for deduplication. The hash method can be set during\n initialization, with supported methods listed in `HASH_METHOD`. 
If a sample does not\n contain an image, it is assigned an empty hash value. The operator loads images from the\n specified keys and computes their combined hash for comparison.", "arguments": " backend (): the backend for dedup, either 'ray_actor' or 'redis'\n redis_address (): the address of redis server\n method (): the hash method to use\n"}, {"index": 12, "class_name": "ray_video_deduplicator", "class_desc": "Deduplicates samples at document-level using exact matching of videos in Ray distributed mode.\n\n This operator computes the MD5 hash of video streams in each sample and compares them to\n identify duplicates. It uses Ray distributed mode for parallel processing. The hash is\n computed by demuxing the video streams and updating the MD5 hash with each video packet.\n If a sample does not contain a valid video, it is assigned an empty hash value. The\n operator supports 'ray_actor' or 'redis' backends for deduplication.", "arguments": " backend (): the backend for dedup, either 'ray_actor' or 'redis'\n redis_address (): the address of redis server\n"}, {"index": 13, "class_name": "video_deduplicator", "class_desc": "Deduplicates samples at the document level using exact matching of videos.\n\n This operator computes a hash for each video in the sample and uses it to identify and\n remove duplicate documents. If `consider_text` is set to True, it also considers the\n text hash alongside the video hash for deduplication. The video hash is computed by\n hashing the video data, including all video streams in the container. The operator\n supports sampling and tracing of duplicate pairs when the `show_num` parameter is\n greater than 0. 
Important fields used for caching include 'videohash' and optionally\n 'hash' if text is considered.", "arguments": " consider_text (): whether to consider text hash together with video\n hash when applying deduplication.\n"}, {"index": 14, "class_name": "alphanumeric_filter", "class_desc": "Filter to keep samples with an alphabet/numeric ratio within a specific range.\n\n This operator filters samples based on the ratio of alphanumeric characters or tokens.\n It keeps samples where the ratio of alphanumeric characters (or tokens) to the total\n number of characters (or tokens) is within the specified range. The ratio is computed\n either character-based or token-based, depending on the `tokenization` parameter. If\n `tokenization` is True, it uses a Hugging Face tokenizer to count tokens. The key metric\n used for filtering is 'alpha_token_ratio' if tokenization is enabled, otherwise\n 'alnum_ratio'. The operator caches these metrics in the stats field for each sample.", "arguments": " tokenization (): Whether to count the ratio of alphanumeric\n to the total number of tokens. if tokenization=False, it\n will count the ratio of alphanumeric to the total number of\n characters.\n min_ratio (): The min filter ratio in alphanumeric op,\n samples will be filtered if their alphabet/numeric ratio is\n below this parameter.\n max_ratio (): The max filter ratio in alphanumeric op,\n samples will be filtered if their alphabet/numeric ratio\n exceeds this parameter.\n"}, {"index": 15, "class_name": "audio_duration_filter", "class_desc": "Keep data samples whose audio durations are within a specified range.\n\n This operator filters data samples based on the duration of their audio files. It keeps\n samples where the audio duration is between a minimum and maximum value, in seconds. The\n operator supports two strategies for keeping samples: 'any' (keep if any audio meets the\n condition) or 'all' (keep only if all audios meet the condition). 
The audio duration is\n computed using the `librosa` library. If the audio duration has already been computed,\n it is retrieved from the sample's stats under the key 'audio_duration'. If no audio is\n present in the sample, an empty array is stored in the stats.", "arguments": " min_duration (): The min audio duration to keep samples in seconds.\n It's 0 by default.\n max_duration (): The max audio duration to keep samples in seconds.\n It's sys.maxsize by default.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all audios. 'any': keep this sample if any audios meet the\n condition. 'all': keep this sample only if all audios meet the\n condition.\n"}, {"index": 16, "class_name": "audio_nmf_snr_filter", "class_desc": "Keep data samples whose audio Signal-to-Noise Ratios (SNRs) are within a specified\n range.\n\n This operator computes the SNR of each audio in a sample using Non-negative Matrix\n Factorization (NMF). It then filters the samples based on whether their SNRs fall within\n the given minimum and maximum thresholds. The SNR is computed for each audio, and the\n filtering strategy can be set to either 'any' or 'all'. In 'any' mode, a sample is kept\n if at least one of its audios meets the SNR criteria. In 'all' mode, all audios must\n meet the criteria for the sample to be kept. The NMF computation uses a specified number\n of iterations. If no audio is present in the sample, the SNR is recorded as an empty\n array. The key metric is stored in the 'audio_nmf_snr' field.", "arguments": " min_snr (): The min audio SNR to keep samples in dB. It's 0 by\n default.\n max_snr (): The max audio SNR to keep samples in dB. It's\n sys.maxsize by default.\n nmf_iter_num (typing.Annotated[int, Gt(gt=0)]): The max number of iterations to run NMF. It's 500\n in default.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all audios. 'any': keep this sample if any audios meet the\n condition. 
'all': keep this sample only if all audios meet the\n condition.\n"}, {"index": 17, "class_name": "audio_size_filter", "class_desc": "Keep data samples based on the size of their audio files.\n\n This operator filters data samples by checking if the size of their audio files falls\n within a specified range. The size can be in bytes, kilobytes, megabytes, or any other\n unit. The key metric used is 'audio_sizes', which is an array of file sizes in bytes. If\n no audio files are present, the 'audio_sizes' field will be an empty array. The operator\n supports two strategies for keeping samples: 'any' and 'all'. In 'any' mode, a sample is\n kept if at least one of its audio files meets the size criteria. In 'all' mode, all\n audio files must meet the size criteria for the sample to be kept.", "arguments": " min_size (): The min audio size to keep samples. set to be \"0\" by\n default for no size constraint\n max_size (): The max audio size to keep samples. set to be\n \"1Tb\" by default, an approximate for un-limited case\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all audios. 'any': keep this sample if any audios meet the\n condition. 'all': keep this sample only if all audios meet the\n condition.\n"}, {"index": 18, "class_name": "average_line_length_filter", "class_desc": "Filter to keep samples with average line length within a specific range.\n\n This operator filters out samples based on their average line length. It keeps samples\n where the average line length is between the specified minimum and maximum values. The\n average line length is calculated as the total text length divided by the number of\n lines. If the context is provided, it uses precomputed lines from the context. 
The\n computed average line length is stored in the 'avg_line_length' key in the stats field.", "arguments": " min_len (): The min filter length in this op, samples will\n be filtered if their average line length is below this\n parameter.\n max_len (): The max filter length in this op, samples will\n be filtered if their average line length exceeds this\n parameter.\n"}, {"index": 19, "class_name": "character_repetition_filter", "class_desc": "Filter to keep samples with character-level n-gram repetition ratio within a specific\n range.\n\n This operator calculates the character-level n-gram repetition ratio for each sample and\n filters out samples that do not fall within the specified range. The repetition ratio is\n computed based on the frequency of n-grams in the text. The key metric 'char_rep_ratio'\n is cached in the stats field. Samples are kept if their 'char_rep_ratio' is between the\n specified min and max ratios. The n-gram length, minimum, and maximum ratios are\n configurable.", "arguments": " rep_len (typing.Annotated[int, Gt(gt=0)]): Repetition length for char-level n-gram.\n min_ratio (): The min filter ratio in this op, samples will\n be filtered if their char-level n-gram repetition ratio is\n below this parameter.\n max_ratio (): The max filter ratio in this op, samples will\n be filtered if their char-level n-gram repetition ratio\n exceeds this parameter.\n"}, {"index": 20, "class_name": "flagged_words_filter", "class_desc": "Filter to keep samples with flagged-word ratio in a specified range.\n\n This operator filters out samples based on the ratio of flagged words. It uses a list of\n flagged words, which can be language-specific or combined from multiple languages. The\n flagged-word ratio is computed as the number of flagged words divided by the total\n number of words in the sample. If tokenization is enabled, a Hugging Face tokenizer is\n used to split the text into words. 
The operator supports word augmentation for certain\n languages, which can be configured. The key metric, 'flagged_words_ratio', is cached and\n reused if already computed. Samples are kept if their flagged-word ratio falls within\n the specified min and max ratio.", "arguments": " lang (): Consider flagged words in what language. If lang ==\n \"all\", we will adopt the one merged from all the available\n languages\n tokenization (): Whether to use model to tokenize documents\n min_ratio (): The min filter ratio in this op.\n max_ratio (): The max filter ratio in this op.\n flagged_words_dir (): The directory storing the\n flagged_words file(s) whose name includes \"flagged_words\"\n and in json format\n use_words_aug (): Whether to augment words, especially for\n Chinese and Vietnamese\n words_aug_group_sizes (typing.List[typing.Annotated[int, Gt(gt=0)]]): The group size of words to augment\n words_aug_join_char (): The join char between words to\n augment\n"}, {"index": 21, "class_name": "general_field_filter", "class_desc": "Filter to keep samples based on a general field filter condition.\n\n The filter condition is a string that can include logical operators (and/or) and chain\n comparisons. For example: \"10 < num <= 30 and text != 'nothing here' and __dj__meta__.a\n == 3\". The condition is evaluated for each sample, and only samples that meet the\n condition are kept. The result of the filter condition is stored in the sample's stats\n under the key 'general_field_filter_condition'. 
If the filter condition is empty or\n already computed, the sample is not re-evaluated.", "arguments": " filter_condition (): The filter condition as a string.\n It can include logical operators (and/or) and chain comparisons.\n For example: \"10 < num <= 30 and text != 'nothing here' and __dj__meta__.a == 3\".\n"}, {"index": 22, "class_name": "image_aesthetics_filter", "class_desc": "Filter to keep samples with aesthetics scores within a specific range.\n\n This operator uses a Hugging Face model to predict the aesthetics score of images. It\n keeps samples where the predicted scores fall within the specified min and max score\n range. The operator supports two strategies: 'any' (keep if any image meets the\n condition) and 'all' (keep only if all images meet the condition). Aesthetics scores are\n cached in the 'image_aesthetics_scores' field. If no images are present, the sample is\n kept. Scores are normalized by dividing by 10 if the model name includes\n 'shunk031/aesthetics-predictor'.", "arguments": " hf_scorer_model (): Huggingface model name for the aesthetics\n predictor. By default, we will use\n 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE',\n refer to pypi.org/project/simple-aesthetics-predictor\n trust_remote_code ()\n min_score (): Min score for the predicted aesthetics in an image.\n max_score (): Max score for the predicted aesthetics in an image.\n any_or_all (): Keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 23, "class_name": "image_aspect_ratio_filter", "class_desc": "Filter to keep samples with image aspect ratio within a specific range.\n\n The operator computes the aspect ratio for each image in the sample, defined as the\n width divided by the height (W / H). It caches the computed aspect ratios in the\n 'aspect_ratios' field. 
Samples are kept if their images' aspect ratios fall within the\n specified minimum and maximum range. The 'any_or_all' parameter determines the strategy:\n 'any' keeps samples if at least one image meets the criteria, while 'all' requires all\n images to meet the criteria. If no images are present in a sample, the sample is not\n filtered out.", "arguments": " min_ratio (): The min aspect ratio to keep samples.\n max_ratio (): The max aspect ratio to keep samples.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 24, "class_name": "image_face_count_filter", "class_desc": "Filter to keep samples with the number of faces within a specific range.\n\n This operator uses an OpenCV classifier for face detection. It filters samples based on\n the number of faces detected in the images, keeping only those with a face count within\n the specified range. The operator supports two strategies: 'any' (keep if any image\n meets the condition) and 'all' (keep only if all images meet the condition). The face\n counts are cached in the 'face_counts' field. If no images are present in the sample,\n the face count is set to an empty array.", "arguments": " cv_classifier (): OpenCV classifier path for face detection.\n By default, we will use 'haarcascade_frontalface_alt.xml'.\n min_face_count (): Minimum number of faces required for samples.\n max_face_count (): Maximum number of faces required for samples.\n any_or_all (): Keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 
'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 25, "class_name": "image_face_ratio_filter", "class_desc": "Filter to keep samples with face area ratios within a specific range.\n\n This operator filters samples based on the ratio of the largest face area to the total\n image area. It uses an OpenCV classifier for face detection. The key metric,\n 'face_ratios', is computed for each image in the sample. Samples are kept if the face\n area ratios fall within the specified min and max ratio range. The filtering strategy\n can be set to 'any' (keep if any image meets the condition) or 'all' (keep only if all\n images meet the condition). If no images are present in the sample, the sample is\n retained.", "arguments": " cv_classifier (): OpenCV classifier path for face detection.\n By default, we will use 'haarcascade_frontalface_alt.xml'.\n min_ratio (): Min ratio for the largest face area in an image.\n max_ratio (): Max ratio for the largest face area in an image.\n any_or_all (): Keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 26, "class_name": "image_nsfw_filter", "class_desc": "Filter to keep samples whose images have nsfw scores in a specified range.\n\n This operator uses a Hugging Face model to compute the nsfw scores for each image in a\n sample. It keeps samples based on the specified `min_score` and `max_score` thresholds.\n The operator supports two strategies: 'any' (keep the sample if any image meets the\n condition) or 'all' (keep the sample only if all images meet the condition). 
The nsfw\n scores are cached in the 'image_nsfw_score' field of the sample's stats.", "arguments": " hf_nsfw_model (): nsfw detection model name on huggingface.\n trust_remote_code ()\n min_score (): the min nsfw score threshold for samples.\n range from 0 to 1.\n max_score (): the max nsfw score threshold for samples.\n range from 0 to 1.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 27, "class_name": "image_pair_similarity_filter", "class_desc": "Filter to keep image pairs with similarities between images within a specific range.\n\n This operator uses a Hugging Face CLIP model to compute the cosine similarity between\n two images in each sample. It retains samples where the similarity score falls within\n the specified minimum and maximum thresholds. The 'any' strategy keeps a sample if any\n of the image pairs meet the condition, while the 'all' strategy requires all image pairs\n to meet the condition. The similarity scores are cached in the 'image_pair_similarity'\n field. Each sample must include exactly two distinct images.", "arguments": " hf_clip (): clip model name on huggingface to compute\n the similarity between image and text.\n trust_remote_code ()\n min_score (): The min similarity to keep samples.\n max_score (): The max similarity to keep samples.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 28, "class_name": "image_shape_filter", "class_desc": "Filter to keep samples with image shape (width, height) within specific ranges.\n\n This operator filters samples based on the width and height of images. It keeps samples\n where the image dimensions fall within the specified ranges. 
The operator supports two\n strategies: 'any' and 'all'. In 'any' mode, a sample is kept if at least one image meets\n the criteria. In 'all' mode, all images in the sample must meet the criteria for the\n sample to be kept. The image width and height are stored in the 'image_width' and\n 'image_height' fields of the sample's stats. If no images are present in the sample, the\n corresponding stats fields will be empty arrays.", "arguments": " min_width (): The min width to keep samples.\n max_width (): The max width to keep samples.\n min_height (): The min height to keep samples.\n max_height (): The max height to keep samples.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 29, "class_name": "image_size_filter", "class_desc": "Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range.\n\n This operator filters data samples based on the size of their images. It keeps samples\n if the image sizes fall within the specified minimum and maximum size range. The operator\n supports two strategies: 'any'(keep the sample if any image meets the size condition) and\n 'all' (keep the sample only if all images meet the size condition). If no images are\n present in the sample, the 'image_sizes' field will be an empty array.", "arguments": " min_size (): The min image size to keep samples. set to be \"0\" by\n default for no size constraint\n max_size (): The max image size to keep samples. set to be\n \"1TB\" by default, an approximate for un-limited case\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 
'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 30, "class_name": "image_text_matching_filter", "class_desc": "Filter to keep samples with image-text matching scores within a specific range.\n\n This operator uses a Hugging Face BLIP model to compute the matching score between\n images and text. It keeps samples where the matching score falls within the specified\n `min_score` and `max_score` range. The key metric, `image_text_matching_score`, is\n computed for each image-text pair. If multiple images are associated with a single text,\n the scores can be reduced using 'avg', 'max', or 'min' modes. The operator supports\n horizontal and vertical flipping of images. Samples are kept based on either 'any' or\n 'all' strategy: 'any' keeps the sample if any image meets the condition, while 'all'\n keeps the sample only if all images meet the condition.", "arguments": " hf_blip (): blip model name on huggingface to compute\n the matching score between image and text.\n trust_remote_code ()\n min_score (): The min matching score to keep samples.\n max_score (): The max matching score to keep samples.\n horizontal_flip (): Flip image horizontally (left to right).\n vertical_flip (): Flip image vertically (top to bottom).\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n reduce_mode (): reduce mode when one text corresponds to\n multiple images in a chunk.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n"}, {"index": 31, "class_name": "image_text_similarity_filter", "class_desc": "Filter to keep samples with image-text similarity within a specified range.\n\n This operator uses a Hugging Face CLIP model to compute the similarity between images\n and text. 
It retains samples where the similarity scores fall within the given range.\n The similarity score is computed for each image-text pair, and the final score can be\n reduced using 'avg', 'max', or 'min' modes. The 'any' or 'all' strategy determines if at\n least one or all image-text pairs must meet the similarity criteria. The key metric\n 'image_text_similarity' is cached in the sample's stats. Images can be flipped\n horizontally or vertically before computing the similarity.", "arguments": " hf_clip (): clip model name on huggingface to compute\n the similarity between image and text.\n trust_remote_code ()\n min_score (): The min similarity to keep samples.\n max_score (): The max similarity to keep samples.\n horizontal_flip (): Flip image horizontally (left to right).\n vertical_flip (): Flip image vertically (top to bottom).\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n reduce_mode (): reduce mode when one text corresponds to\n multiple images in a chunk.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n"}, {"index": 32, "class_name": "image_watermark_filter", "class_desc": "Filter to keep samples whose images have no watermark with high probability.\n\n This operator uses a Hugging Face watermark detection model to filter samples based on\n the presence of watermarks in their images. It keeps samples where the predicted\n watermark probability is below a specified threshold. The operator supports two\n strategies: 'any' (keep if any image meets the condition) and 'all' (keep only if all\n images meet the condition). The key metric 'image_watermark_prob' is computed for each\n image, representing the probability that the image contains a watermark. 
If no images\n are present in the sample, the metric is set to an empty array.", "arguments": " hf_watermark_model (): watermark detection model name on\n huggingface.\n trust_remote_code ()\n prob_threshold (): the predicted watermark probability threshold\n for samples. range from 0 to 1. Samples with watermark probability\n less than this threshold will be kept.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 33, "class_name": "llm_perplexity_filter", "class_desc": "Filter to keep samples with perplexity scores within a specified range, computed using a\n specified LLM.\n\n This operator computes the perplexity score for each sample using a Hugging Face LLM. It\n then filters the samples based on whether their perplexity scores fall within the\n specified minimum and maximum score range. The perplexity score is calculated as the\n exponential of the loss value from the LLM. The operator uses a query and response\n template to format the input text for the LLM. If the perplexity score is not already\n cached in the sample's stats under the key 'llm_perplexity', it will be computed.", "arguments": " hf_model (): huggingface embedding model name.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the API model.\n min_score (): Minimum perplexity score.\n max_score (): Maximum perplexity score.\n query_template (typing.Optional[str]): Template for building the query string.\n response_template (typing.Optional[str]): Template for building the response string.\n"}, {"index": 34, "class_name": "in_context_influence_filter", "class_desc": "Filter to keep texts based on their in-context influence on a validation set.\n\n This operator calculates the in-context influence of each sample by comparing\n perplexities with and without the sample as context. 
The influence score is computed as\n the ratio of these perplexities. If `valid_as_demo` is True, the score is L(A|Q) /\n L(A|task_desc, Q_v, A_v, Q). Otherwise, it is L(A_v|Q) / L(A_v|task_desc, Q, A, Q_v).\n The operator retains samples whose in-context influence score is within a specified\n range. The in-context influence score is stored in the 'in_context_influence' field of\n the sample's stats. The validation set must be prepared using the\n `prepare_valid_feature` method if not provided during initialization.", "arguments": " valid_dataset (typing.Optional[typing.List[typing.Dict]]): The dataset to use for validation.\n If None, 'self.prepare_valid_feature' should be manually called before applying the filter.\n task_desc (): The description of the validation task.\n valid_as_demo (): If true, score = L(A|Q) / L(A|task_desc, Q_v, A_v, Q);\n If false, score = L(A_v|Q) / L(A_v|task_desc, Q, A, Q_v).\n n_shot (typing.Optional[int]): The number of shots in validation.\n"}, {"index": 35, "class_name": "instruction_following_difficulty_filter", "class_desc": "Filter to keep texts based on their instruction following difficulty (IFD,\n https://arxiv.org/abs/2308.12032) score.\n\n This operator computes the IFD score for each sample, which is the ratio of the loss\n with and without the query. It keeps samples where the IFD score falls within a\n specified range. The IFD score is calculated using a Hugging Face tokenizer and model.\n If the IFD score is already cached in the 'ifd_score' field, it will be reused. 
The\n operator decides to keep or filter samples based on the provided minimum and maximum IFD\n score thresholds.", "arguments": " hf_model (): huggingface embedding model name.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the API model.\n min_score (): Minimum perplexity score.\n max_score (): Maximum perplexity score.\n query_template (typing.Optional[str]): Template for building the query string.\n response_template (typing.Optional[str]): Template for building the response string.\n"}, {"index": 36, "class_name": "language_id_score_filter", "class_desc": "Filter to keep samples in a specific language with a confidence score above a threshold.\n\n This operator uses a FastText model to identify the language of each sample. It keeps\n samples that are in the specified language(s) and have a language identification\n confidence score greater than or equal to the minimum score. If no specific language is\n provided, it only filters based on the confidence score. The language ID and its\n confidence score are stored in the 'lang' and 'lang_score' fields of the sample's stats,\n respectively.", "arguments": " lang (typing.Union[str, typing.List[str]]): Samples in which languages to keep.\n min_score (): The min language identification confidence\n scores of samples to keep.\n"}, {"index": 37, "class_name": "llm_analysis_filter", "class_desc": "Base filter class for leveraging LLMs to analyze and filter data samples.\n\n This operator uses an LLM to score and tag each sample across multiple quality\n dimensions. It supports both API-based and Hugging Face models. The LLM evaluates the\n sample on clarity, relevance, usefulness, and fluency, providing scores from 1 to 5.\n Tags are assigned to categorize the sample, and a recommendation is made to keep,\n review, or discard the sample. The average score is computed based on the required\n dimension keys. 
Samples are kept if their average score falls within the specified min\n and max score thresholds. The key metric 'llm_analysis_score' is cached in the sample's\n stats.", "arguments": " api_or_hf_model (): API or huggingface model name.\n min_score (): The min score threshold to keep the sample.\n max_score (): The max score threshold to keep the sample.\n is_hf_model (): If true, use Transformers for loading hugging face or\n local llm.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n input_keys (typing.List[str]): Sub set of keys in the sample. Support data with\n multi fields such as 'query', 'analysis' and 'answer' in RFT data.\n field_names (typing.List[str]): Corresponding field names for input keys.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n field_template (typing.Optional[str]): Template for each field in the prompt.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): If true, use VLLM for loading hugging face or\n local llm.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n dim_required_keys (typing.Optional[typing.List[str]]): A list of keys used to calculate the average\n dimension score, only the dimension scores associated with these\n keys are used in the average calculation.\n"}, {"index": 38, "class_name": "llm_difficulty_score_filter", "class_desc": "Filter to keep samples with high difficulty scores estimated by an LLM.\n\n This operator uses a Hugging Face LLM to evaluate the difficulty of each sample. 
The LLM\n analyzes the sample across multiple dimensions, including linguistic complexity,\n conceptual depth, prior knowledge, step complexity, and ambiguity. Each dimension is\n scored on a 1-5 scale, with 5 being the highest difficulty. The final difficulty score\n is computed as the average of these dimension scores. Samples are kept if their\n difficulty score falls within the specified range (min_score to max_score). The key\n metric 'llm_difficulty_score' is stored in the sample's stats, along with detailed\n records and flags.", "arguments": " api_or_hf_model (): API or huggingface model name.\n min_score (): The min score threshold to keep the sample.\n max_score (): The max score threshold to keep the sample.\n is_hf_model (): If true, use Transformers for loading hugging face or\n local llm.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n input_keys (typing.List[str]): Sub set of keys in the sample. 
Support data with\n multi fields such as 'query', 'analysis' and 'answer' in RFT data.\n field_names (typing.List[str]): Corresponding field names for input keys.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n field_template (typing.Optional[str]): Template for each field in the prompt.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): If true, use VLLM for loading hugging face or\n local llm.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n dim_required_keys (typing.Optional[typing.List[str]]): A list of keys used to calculate the average\n dimension score, only the dimension scores associated with these\n keys are used in the average calculation.\n"}, {"index": 39, "class_name": "llm_quality_score_filter", "class_desc": "Filter to keep samples with a high quality score estimated by a language model.\n\n This operator uses a language model to evaluate the quality of each sample across\n multiple dimensions, including accuracy, grammar, informativeness, and coherence. The\n LLM provides a numerical score for each dimension on a 1-5 scale, where 1 is the lowest\n and 5 is the highest. The overall quality score is used to decide whether to keep or\n filter out the sample based on the specified minimum and maximum score thresholds. The\n evaluation results are cached in the 'llm_quality_score' and 'llm_quality_record'\n fields. 
Important flags and tags from the LLM's analysis may also be stored in the\n sample's stats.", "arguments": " api_or_hf_model (): API or huggingface model name.\n min_score (): The min score threshold to keep the sample.\n max_score (): The max score threshold to keep the sample.\n is_hf_model (): If true, use Transformers for loading hugging face or\n local llm.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n input_keys (typing.List[str]): Sub set of keys in the sample. Support data with\n multi fields such as 'query', 'analysis' and 'answer' in RFT data.\n field_names (typing.List[str]): Corresponding field names for input keys.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n field_template (typing.Optional[str]): Template for each field in the prompt.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): If true, use VLLM for loading hugging face or\n local llm.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n dim_required_keys (typing.Optional[typing.List[str]]): A list of keys used to calculate the average\n dimension score, only the dimension scores associated with these\n keys are used in the average calculation.\n"}, {"index": 40, "class_name": "llm_task_relevance_filter", "class_desc": "Filter to keep samples with high relevance scores to validation tasks estimated by an\n LLM.\n\n This operator evaluates the relevance of each sample to a specified validation task\n using an LLM. 
The LLM scores the sample on multiple dimensions, including topical\n relevance, linguistic style match, task match, knowledge alignment, and potential\n utility. Each dimension is scored on a 1-5 scale, with 5 being the highest. The key\n metric, 'llm_task_relevance', is the average score across these dimensions. Samples are\n kept if their average score meets or exceeds the specified minimum threshold. The\n operator uses either an API or a Hugging Face model for evaluation. If no validation\n dataset or task description is provided, the 'prepare_valid_feature' method must be\n called manually before applying the filter.", "arguments": " api_or_hf_model (): API or huggingface model name.\n min_score (): The lowest score threshold to keep the sample.\n is_hf_model (): Indicates if the model is from HuggingFace.\n valid_dataset (typing.Optional[typing.List[typing.Dict]]): The dataset to use for validation.\n task_desc (typing.Optional[str]): The description of the validation task.\n If valid_dataset=None and task_desc=None,\n 'self.prepare_valid_feature' should be manually called before applying the filter.\n n_shot (typing.Optional[int]): The number of shots in validation.\n"}, {"index": 41, "class_name": "maximum_line_length_filter", "class_desc": "Filter to keep samples with a maximum line length within a specified range.\n\n This operator filters out samples based on the length of their longest line. It retains\n samples where the maximum line length is within the specified `min_len` and `max_len`\n range. The maximum line length is computed by splitting the text into lines and\n measuring the length of each line. If the context is provided, it uses precomputed lines\n stored under the key 'lines' in the context. 
The maximum line length is cached in the\n 'max_line_length' field of the stats.", "arguments": " min_len (): The min filter length in this op, samples will\n be filtered if their maximum line length is below this\n parameter.\n max_len (): The max filter length in this op, samples will\n be filtered if their maximum line length exceeds this\n parameter.\n"}, {"index": 42, "class_name": "perplexity_filter", "class_desc": "Filter to keep samples with perplexity score in a specified range.\n\n This operator computes the perplexity of text samples using a Hugging Face tokenizer and\n a KenLM language model. It keeps samples with perplexity scores within the specified\n minimum and maximum values. The perplexity is calculated character-based by default. If\n the perplexity is already computed, it will be reused from the 'perplexity' field in the\n sample's stats. The operator supports batched operations for efficiency.", "arguments": " lang (): Compute perplexity for samples in which language.\n min_ppl (): The min filter perplexity in this op.\n max_ppl (): The max filter perplexity in this op.\n"}, {"index": 43, "class_name": "phrase_grounding_recall_filter", "class_desc": "Filter to keep samples based on the phrase grounding recall of phrases extracted from\n text in images.\n\n This operator uses a Hugging Face Owl-ViT model to locate phrases extracted from the\n text within the images. It keeps samples where the phrase grounding recall is within a\n specified range. The recall is computed by comparing the number of correctly located\n phrases to the total number of phrases. The operator can handle multiple images per text\n chunk and supports different strategies for reducing the recall values (e.g., average,\n max, min). It also allows for flipping images horizontally or vertically. The key metric\n 'phrase_grounding_recall' is computed and stored in the sample's stats. 
If no images are\n present, the recall is set to an empty array.", "arguments": " hf_owlvit (): Owl-ViT model name on huggingface to locate the\n phrases extracted from the text.\n trust_remote_code ()\n min_recall (): The min phrase grounding recall to keep samples.\n max_recall (): The max phrase grounding recall to keep samples.\n horizontal_flip (): Flip image horizontally (left to right).\n vertical_flip (): Flip image vertically (top to bottom).\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n reduce_mode (): reduce mode when one text corresponds to\n multiple images in a chunk.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n iou_thr (): the IoU threshold for NMS-like post-process. If two\n predicted bboxes are overlap with an IoU larger than this\n threshold, the bbox with less confidence will be removed. Default:\n 0.5.\n large_area_ratio_thr (): the area ratio threshold for filtering out\n those large predicted bboxes. If the area of a predicted bbox\n accounts for more than this ratio threshold of the whole image\n area, this bbox will be removed. Default: 0.95.\n conf_thr (): the confidence score threshold for removing\n low-confidence bboxes. If the confidence score of a predicted bbox\n is lower than the threshold, this bbox will be removed. Default: 0.\n"}, {"index": 44, "class_name": "special_characters_filter", "class_desc": "Filter to keep samples with special-character ratio within a specific range.\n\n This operator filters out samples based on the ratio of special characters in the text.\n It keeps samples where the special-character ratio is within the specified minimum and\n maximum thresholds. 
The special-character ratio is computed as the number of special\n characters divided by the total number of characters in the text. If the\n 'special_char_ratio' is already cached in the stats, it will be reused. Otherwise, it\n will be computed and stored in the 'special_char_ratio' field.", "arguments": " min_ratio (): The min filter ratio in this op, samples will\n be filtered if their special-char ratio is below this\n parameter.\n max_ratio (): The max filter ratio in this op, samples will\n be filtered if their special-char ratio exceeds this\n parameter.\n"}, {"index": 45, "class_name": "specified_field_filter", "class_desc": "Filter samples based on the specified field information.\n\n This operator checks if the value of a specified field in each sample is within a given\n target value range. If the field value is not within the target range, the sample is\n filtered out. The field can be a multi-level key, with levels separated by dots. The\n target value is a list of acceptable values for the field. If the field value is not a\n list or tuple, it is converted to a list for comparison. Samples are retained if all\n values in the field match any of the target values.\n\n - Uses the 'field_key' and 'target_value' parameters.\n - Supports multi-level field keys, e.g., 'level1.level2'.\n - Converts non-list/tuple field values to a list for comparison.", "arguments": " field_key (): Filter based on the specified value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n target_value (typing.List): The range of specified field information\n corresponding to the samples that need to be retained.\n"}, {"index": 46, "class_name": "specified_numeric_field_filter", "class_desc": "Filter samples based on a specified numeric field value.\n\n This operator filters out samples if the numeric value in the specified field is not\n within the given range. 
The field can be multi-level, with keys separated by dots. The\n sample is kept if the numeric value is between the minimum and maximum values,\n inclusive. If the field key is not provided, all samples are retained. The operator\n ensures that the field exists in the sample and that its value is numeric before\n performing the comparison.\n\n - Uses the 'min_value' and 'max_value' to define the acceptable range.\n - Supports multi-level fields using dot-separated keys.\n - Returns False for non-numeric or out-of-range values, filtering the sample.", "arguments": " field_key (): Filter based on the specified numeric value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n min_value (): The min filter value in SpecifiedNumericField\n op, samples will be filtered if their specified numeric\n field value is below this parameter.\n max_value (): The max filter value in SpecifiedNumericField\n op, samples will be filtered if their specified numeric\n field value exceeds this parameter.\n"}, {"index": 47, "class_name": "stopwords_filter", "class_desc": "Filter to keep samples with stopword ratio within a specified range.\n\n This operator calculates the ratio of stopwords in a sample and keeps samples where this\n ratio is between the specified minimum and maximum values. The stopword ratio is\n computed as the number of stopwords divided by the total number of words. If the\n `tokenization` parameter is set, a Hugging Face tokenizer is used to tokenize the text.\n The stopwords are loaded from a directory, and if the language is set to \"all\", it\n merges stopwords from all available languages. The key metric is `stopwords_ratio`,\n which is character-based by default. The operator also supports word augmentation for\n specific languages.", "arguments": " lang (): Consider stopwords in what language. 
If lang ==\n \"all\", we will adopt the one merged from all the available\n languages\n tokenization (): whether to use model to tokenize documents\n min_ratio (): The min filter ratio in this op.\n max_ratio (): The max filter ratio in this op.\n stopwords_dir (): The directory storing the stopwords\n file(s) whose name includes \"stopwords\" and in json format\n use_words_aug (): Whether to augment words, especially for\n Chinese and Vietnamese\n words_aug_group_sizes (typing.List[typing.Annotated[int, Gt(gt=0)]]): The group size of words to augment\n words_aug_join_char (): The join char between words to\n augment\n"}, {"index": 48, "class_name": "suffix_filter", "class_desc": "Filter to keep samples with specified suffix.\n\n This operator retains samples that have a suffix matching any of the provided suffixes.\n If no suffixes are specified, all samples are kept. The key metric 'keep' is computed\n based on whether the sample's suffix matches the specified list. The 'suffix' field of\n each sample is checked against the list of allowed suffixes. If the suffix matches, the\n sample is kept; otherwise, it is filtered out.", "arguments": " suffixes (typing.Union[str, typing.List[str]]): the suffix of text that will be keep.\n For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']\n"}, {"index": 49, "class_name": "text_action_filter", "class_desc": "Filter to keep texts that contain a minimum number of actions.\n\n This operator uses a Spacy model to detect actions in the text. It keeps samples if the\n number of detected actions meets or exceeds the specified minimum. The supported\n languages are English ('en') and Chinese ('zh'). The 'num_action' statistic is computed\n and cached for each sample. Actions are identified based on part-of-speech (POS) tags\n and specific tags for verbs.", "arguments": " lang (): language of the text in the samples. 
'en' for detection of\n actions in English and 'zh' for detection of actions in Chinese.\n min_action_num (): The min action number in the filtering. samples\n will be filtered if their action number in the text is below this\n parameter.\n"}, {"index": 50, "class_name": "text_embd_similarity_filter", "class_desc": "Filter to keep texts whose average embedding similarity to a set of given validation\n texts falls within a specific range.\n\n This operator computes the cosine similarity between the text embeddings and a set of\n validation text embeddings. It keeps samples where the average similarity score is\n within the specified range. The key metric, 'text_embd_similarity', is computed as the\n mean cosine similarity. The operator supports both API-based and Hugging Face model-\n based embeddings. If no valid dataset is provided, the `prepare_valid_feature` method\n must be called manually before applying the filter.", "arguments": " api_or_hf_model (): API or huggingface embedding model name.\n is_hf_model (): Indicates if the model is from HuggingFace.\n api_endpoint (): Embedding URL endpoint for the API.\n response_path (): Path to extract content from the API response.\n Defaults to 'data.0.embedding' for embedding model.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the API model.\n min_score (): The min average similarity to keep samples.\n max_score (): The max average similarity to keep samples.\n valid_dataset (typing.Optional[typing.List[typing.Dict]]): The dataset to use for validation.\n If None, 'self.prepare_valid_feature' should be manually called before applying the filter.\n ebd_dim (): The embedding's dimension via API.\n API specific parameter, i.e., if is_hf_model=True, this parameter will not take effect.\n pooling (typing.Optional[str]): strategy to extract embedding from the hidden states. 
https://arxiv.org/abs/2503.01807\n None: default option, the hidden state of the last token.\n \"mean\": uniform mean of hidden states.\n \"weighted_mean\": weighted mean of hidden states. https://arxiv.org/abs/2202.08904\n HF_MODEL specific parameter, i.e., if is_hf_model=False, this parameter will not take effect.\n input_template (typing.Optional[str]): Template for building the model input.\n"}, {"index": 51, "class_name": "text_entity_dependency_filter", "class_desc": "Identify and filter text samples based on entity dependencies.\n\n This operator uses a spaCy model to detect entities in the text and evaluates their\n dependency relationships. It filters out samples where entities have fewer than a\n specified number of dependency edges. The key metric is 'num_dependency_edges', which\n counts the number of edges for each entity in the dependency tree. Samples with no\n detected entities are omitted. The operator supports 'any' or 'all' strategies: 'any'\n keeps samples if at least one entity meets the dependency threshold, while 'all'\n requires all entities to meet the threshold. Supported languages are English ('en') and\n Chinese ('zh').", "arguments": " lang (): language of the text in the samples. 'en' for detection of\n entities in English and 'zh' for detection of entities in Chinese.\n min_dependency_num (): The min token number in the filtering.\n Objects is independent if their number of edges in the dependency\n tree is below this parameter.\n any_or_all (): keep this sample with 'any' or 'all' strategy.\n 'any': keep this sample if any object is dependent. 'all': keep\n this sample only if all images are dependent.\n"}, {"index": 52, "class_name": "text_length_filter", "class_desc": "Filter to keep samples with total text length within a specific range.\n\n This operator filters out samples based on their total text length. It retains samples\n where the text length is between the specified minimum and maximum lengths. 
The text\n length is computed as the number of characters in the sample's text. If the 'text_len'\n key is already present in the sample's stats, it will be reused; otherwise, it will be\n computed. The operator processes samples in batches for efficiency.", "arguments": " min_len (): The min text length in the filtering. samples\n will be filtered if their text length is below this\n parameter.\n max_len (): The max text length in the filtering. samples\n will be filtered if their text length exceeds this\n parameter.\n"}, {"index": 53, "class_name": "text_pair_similarity_filter", "class_desc": "Filter to keep text pairs with similarities within a specific range.\n\n This operator computes the similarity between two texts in a pair using a Hugging Face\n CLIP model. It keeps samples where the similarity score falls within the specified min\n and max thresholds. The key metric, 'text_pair_similarity', is computed as the cosine\n similarity between the text embeddings. The operator supports two strategies for keeping\n samples: 'any' (keep if any pair meets the condition) and 'all' (keep only if all pairs\n meet the condition). If the second text key is not provided, the operator will raise an\n error. The similarity scores are cached under the 'text_pair_similarity' field in the\n sample's stats.", "arguments": " hf_clip (): clip model name on huggingface to compute\n the similarity between image and text.\n trust_remote_code ()\n min_score (): The min similarity to keep samples.\n max_score (): The max similarity to keep samples.\n text_key_second (): used to store the other sentence\n in the text pair.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 
'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 54, "class_name": "token_num_filter", "class_desc": "Filter to keep samples with a total token number within a specified range.\n\n This operator uses a Hugging Face tokenizer to count the number of tokens in each\n sample. It keeps samples where the token count is between the minimum and maximum\n thresholds. The token count is stored in the 'num_token' field of the sample's stats. If\n the token count is not already computed, it will be calculated using the specified\n tokenizer.", "arguments": " hf_tokenizer (): the tokenizer name of Hugging Face tokenizers.\n min_num (): The min filter token number in this op, samples\n will be filtered if their token number is below this\n parameter.\n max_num (): The max filter token number in this op, samples\n will be filtered if their token number exceeds this\n parameter.\n"}, {"index": 55, "class_name": "video_aesthetics_filter", "class_desc": "Filter to keep data samples with aesthetics scores for specified frames in the videos\n within a specific range.\n\n This operator evaluates the aesthetic quality of video frames using a Hugging Face\n model. It keeps samples where the aesthetics scores of the specified frames fall within\n a given range. The key metric, 'video_frames_aesthetics_score', is computed by\n averaging, taking the max, or min of the frame scores, depending on the reduce mode.\n Frame sampling can be done uniformly or by extracting all keyframes. The filter applies\n a 'any' or 'all' strategy to decide if a sample should be kept based on the scores of\n multiple videos.", "arguments": " hf_scorer_model (): Huggingface model name for the aesthetics\n predictor. 
By default, we will use\n 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE',\n refer to pypi.org/project/simple-aesthetics-predictor\n trust_remote_code ()\n min_score (): Min score for the predicted aesthetics in a video.\n max_score (): Max score for the predicted aesthetics in a video.\n frame_sampling_method (): sampling method of extracting frame\n images from the videos.\n Should be one of [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames and the latter one extract\n specified number of frames uniformly from the video.\n Default: \"uniform\" with frame_num=3, considering that the number of\n keyframes can be large while their difference is usually small\n in terms of their aesthetics.\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n any_or_all (): Keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n reduce_mode (): reduce mode when one sample corresponds to\n multiple frames, must be one of ['avg','max', 'min'].\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n"}, {"index": 56, "class_name": "video_aspect_ratio_filter", "class_desc": "Filter to keep samples with video aspect ratio within a specific range.\n\n This operator filters samples based on the aspect ratios of their videos. It keeps\n samples where the video aspect ratios fall within a specified range. The aspect ratio is\n calculated as the width divided by the height (W / H). 
The operator supports two\n strategies for keeping samples: 'any' and 'all'. In 'any' mode, a sample is kept if at\n least one video meets the aspect ratio condition. In 'all' mode, all videos in the\n sample must meet the condition for the sample to be kept. The aspect ratios are computed\n and stored in the 'video_aspect_ratios' field of the sample's stats.", "arguments": " min_ratio (): The minimum aspect ratio to keep samples,\n supported format is a string, such as \"9:21\" or \"9/21\".\n max_ratio (): The maximum aspect ratio to keep samples,\n supported format is a string, such as \"21:9\" or \"21/9\".\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 57, "class_name": "video_duration_filter", "class_desc": "Keep data samples whose videos' durations are within a specified range.\n\n This operator filters data samples based on the duration of their associated videos. It\n keeps samples where the video durations fall within a specified minimum and maximum\n range. The filtering strategy can be set to 'any' or 'all':\n - 'any': Keep the sample if any of its videos meet the duration criteria.\n - 'all': Keep the sample only if all of its videos meet the duration criteria.\n The video durations are computed and stored in the 'video_duration' field of the\n sample's stats. If no videos are present, an empty array is stored.", "arguments": " min_duration (): The min video duration to keep samples in seconds.\n It's 0 by default.\n max_duration (): The max video duration to keep samples in seconds.\n It's sys.maxsize by default.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 
'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 58, "class_name": "video_frames_text_similarity_filter", "class_desc": "Filter to keep samples based on the similarity between video frame images and text\n within a specific range.\n\n This operator uses a Hugging Face CLIP model to compute the similarity between video\n frames and associated text. It keeps samples where the computed similarity scores fall\n within a specified range. The operator supports different frame sampling methods,\n including 'all_keyframes' and 'uniform', and allows for horizontal and vertical flipping\n of the frames. The similarity score is reduced using one of three modes: 'avg', 'max',\n or 'min'. The operator also supports two strategies for keeping samples: 'any' (keep if\n any video meets the condition) or 'all' (keep only if all videos meet the condition).\n The key metric is stored in the 'video_frames_text_similarity' field.", "arguments": " hf_clip (): clip model name on huggingface to compute\n the similarity between frame image and text. It's kind of\n language-related. For example, for Chinese datasets, ChineseCLIP\n might be a better choice.\n trust_remote_code ()\n min_score (): the min similarity to keep samples.\n max_score (): the max similarity to keep samples.\n frame_sampling_method (): sampling method of extracting frame\n images from the videos.\n Should be one of [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number of which depends\n on the duration of the video) and the latter one extract specified\n number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. 
If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n horizontal_flip (): flip frame image horizontally (left to right).\n vertical_flip (): flip frame image vertically (top to bottom).\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n reduce_mode (): reduce mode when one text corresponds to\n multiple video frame images in a chunk.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n"}, {"index": 59, "class_name": "video_motion_score_filter", "class_desc": "Filter to keep samples with video motion scores within a specific range.\n\n The operator uses Farneback's algorithm from OpenCV to compute dense optical flow. It\n calculates the average motion score for each video and retains samples based on the\n specified minimum and maximum score thresholds. The 'any' or 'all' strategy determines\n whether to keep a sample if any or all videos meet the criteria. The motion score is\n computed as the mean magnitude of the optical flow, which can be normalized relative to\n the frame's diagonal length. The stats are cached under the key 'video_motion_score'.", "arguments": " min_score (): The minimum motion score to keep samples.\n max_score (): The maximum motion score to keep samples.\n sampling_fps (typing.Annotated[float, Gt(gt=0)]): The sampling rate in frames_per_second for\n optical flow calculations.\n size (typing.Union[typing.Annotated[int, Gt(gt=0)], typing.Tuple[typing.Annotated[int, Gt(gt=0)]], typing.Tuple[typing.Annotated[int, Gt(gt=0)], typing.Annotated[int, Gt(gt=0)]], NoneType]): Resize frames before computing optical flow. If size is a\n sequence like (h, w), frame size will be matched to this. 
If size\n is an int, smaller edge of frames will be matched to this number.\n i.e, if height > width, then frame will be rescaled to (size *\n height / width, size). Default `None` to keep the original size.\n max_size (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The maximum allowed for the longer edge of resized\n frames. If the longer edge of frames is greater than max_size after\n being resized according to size, size will be overruled so that the\n longer edge is equal to max_size. As a result, the smaller edge may\n be shorter than size. This is only supported if size is an int.\n divisible (typing.Annotated[int, Gt(gt=0)]): The number that the dimensions must be divisible by.\n relative (): If `True`, the optical flow magnitude is normalized to\n a [0, 1] range, relative to the frame's diagonal length.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 60, "class_name": "video_motion_score_raft_filter", "class_desc": "Filter to keep samples with video motion scores within a specified range.\n\n This operator utilizes the RAFT (Recurrent All-Pairs Field Transforms) model from\n torchvision to predict optical flow between video frames. It keeps samples where the\n video motion score is within the given min and max score range. The motion score is\n computed based on the optical flow between frames, which is estimated using the RAFT\n model. 
The operator can sample frames at a specified FPS and apply transformations to\n the frames before computing the flow.\n\n - The RAFT model is used to estimate the optical flow.\n - Frames are preprocessed using a series of transformations including normalization and\n color channel flipping.\n - The motion score is calculated from the optical flow data.\n - The operator can be configured to filter based on any or all frames in the video.\n - The device for model inference (CPU or CUDA) is automatically detected and set.\n\n For further details, refer to the official torchvision documentation:\n https://pytorch.org/vision/main/models/raft.html\n\n The original paper on RAFT is available here:\n https://arxiv.org/abs/2003.12039\n ", "arguments": " min_score ()\n max_score ()\n sampling_fps (typing.Annotated[float, Gt(gt=0)])\n size (typing.Union[typing.Annotated[int, Gt(gt=0)], typing.Tuple[typing.Annotated[int, Gt(gt=0)]], typing.Tuple[typing.Annotated[int, Gt(gt=0)], typing.Annotated[int, Gt(gt=0)]], NoneType])\n max_size (typing.Optional[typing.Annotated[int, Gt(gt=0)]])\n divisible (typing.Annotated[int, Gt(gt=0)])\n relative ()\n any_or_all ()\n"}, {"index": 61, "class_name": "video_nsfw_filter", "class_desc": "Filter to keep samples whose videos have nsfw scores in a specified range.\n\n This operator uses a Hugging Face model to detect NSFW content in video frames. It keeps\n samples where the NSFW score is below a specified threshold. The operator supports two\n frame sampling methods: \"all_keyframes\" and \"uniform\". For \"uniform\", it extracts a\n specified number of frames. The NSFW scores are reduced using one of three modes: \"avg\",\n \"max\", or \"min\". The key metric, 'video_nsfw_score', is computed for each video and\n stored in the sample's stats. 
The operator can use either an \"any\" or \"all\" strategy to\n decide if a sample should be kept based on the NSFW scores of its videos.", "arguments": " hf_nsfw_model (): nsfw detection model name on huggingface.\n trust_remote_code ()\n min_score ()\n max_score (): the nsfw score threshold for samples.\n range from 0 to 1. Samples with nsfw score less than this threshold\n will be kept.\n frame_sampling_method (): sampling method of extracting frame\n images from the videos.\n Should be one of [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number of which depends\n on the duration of the video) and the latter one extract specified\n number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n reduce_mode (): reduce mode for multiple sampled video frames.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 62, "class_name": "video_ocr_area_ratio_filter", "class_desc": "Keep data samples whose detected text area ratios for specified frames in the video are\n within a specified range.\n\n This operator filters data based on the ratio of the detected text area to the total\n frame area. It uses EasyOCR to detect text in the specified languages and calculates the\n area ratio for each sampled frame. 
The operator then determines whether to keep a sample\n based on the `any` or `all` strategy, which checks if any or all of the videos meet the\n specified area ratio range. The key metric, `video_ocr_area_ratio`, is computed as the\n mean of the text area ratios across the sampled frames. The number of sampled frames and\n the specific frames to be sampled can be configured.", "arguments": " min_area_ratio (): The min ocr area ratio to keep samples. It's 0\n by default.\n max_area_ratio (): The max ocr area ratio to keep samples. It's 1.0\n by default.\n frame_sample_num (typing.Annotated[int, Gt(gt=0)]): The number of sampled frames to calculate the\n ocr area ratio. If it's 1, only middle frame will be selected. If\n it's 2, only the first and the last frames will be selected. If\n it's larger than 2, in addition to the first and the last frames,\n other frames will be sampled evenly within the video duration.\n languages_to_detect (typing.Union[str, typing.List[str]]): texts in which languages should be\n detected. Default: ['ch_sim', 'en']. Full language list can be\n found here: https://www.jaided.ai/easyocr/.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 63, "class_name": "video_resolution_filter", "class_desc": "Keep data samples whose videos' resolutions are within a specified range.\n\n This operator filters data samples based on the resolution of the videos they contain.\n It keeps samples if the video resolutions fall within the defined width and height\n ranges. The filtering strategy can be set to 'any' or 'all':\n - 'any': Keeps the sample if any video meets the resolution criteria.\n - 'all': Keeps the sample only if all videos meet the resolution criteria.\n\n The operator computes and caches the 'video_width' and 'video_height' for each video in\n the sample. 
If no videos are present, it sets these fields to empty arrays. These cached\n values are used to determine whether to keep or filter out the sample.", "arguments": " min_width (): The min horizontal resolution.\n max_width (): The max horizontal resolution.\n min_height (): The min vertical resolution.\n max_height (): The max vertical resolution.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 64, "class_name": "human_preference_annotation_mapper", "class_desc": "Operator for human preference annotation using Label Studio.\n\n This operator formats and presents pairs of answers to a prompt for human evaluation. It\n uses a default or custom Label Studio configuration to display the prompt and answer\n options. The operator processes the annotations to determine the preferred answer,\n updating the sample with the chosen and rejected answers. The operator requires specific\n keys in the samples for the prompt and answer options. If these keys are missing, it\n logs warnings and uses placeholder text. The annotated results are processed to update\n the sample with the chosen and rejected answers.", "arguments": " label_config_file (): Path to the label config file\n answer1_key (): Key for the first answer\n answer2_key (): Key for the second answer\n prompt_key (): Key for the prompt/question\n chosen_key (): Key for the chosen answer\n rejected_key (): Key for the rejected answer\n"}, {"index": 65, "class_name": "audio_add_gaussian_noise_mapper", "class_desc": "Mapper to add Gaussian noise to audio samples.\n\n This operator adds Gaussian noise to audio data with a specified probability. 
The\n amplitude of the noise is randomly chosen between `min_amplitude` and `max_amplitude`.\n If `save_dir` is provided, the modified audio files are saved in that directory;\n otherwise, they are saved in the same directory as the input files. The `p` parameter\n controls the probability of applying this transformation to each sample. If no audio is\n present in the sample, it is returned unchanged.", "arguments": " min_amplitude (): float unit: linear amplitude.\n Default: 0.001. Minimum noise amplification factor.\n max_amplitude (): float unit: linear amplitude.\n Default: 0.015. Maximum noise amplification factor.\n p (): float range: [0.0, 1.0]. Default: 0.5.\n The probability of applying this transform.\n save_dir: str. Default: None.\n The directory where generated audio files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n save_dir ()\n"}, {"index": 66, "class_name": "audio_ffmpeg_wrapped_mapper", "class_desc": "Wraps FFmpeg audio filters for processing audio files in a dataset.\n\n This operator applies specified FFmpeg audio filters to the audio files in the dataset.\n It supports passing custom filter parameters and global arguments to the FFmpeg command\n line. The processed audio files are saved to a specified directory or the same directory\n as the input files if no save directory is provided. The `DJ_PRODUCED_DATA_DIR`\n environment variable can also be used to set the save directory. If no filter name is\n provided, the audio files remain unmodified. 
The operator updates the source file paths\n in the dataset after processing.", "arguments": " filter_name (typing.Optional[str]): ffmpeg audio filter name.\n filter_kwargs (typing.Optional[typing.Dict]): keyword-arguments passed to ffmpeg filter.\n global_args (typing.Optional[typing.List[str]]): list-arguments passed to ffmpeg command-line.\n capture_stderr (): whether to capture stderr.\n overwrite_output (): whether to overwrite output file.\n save_dir (): The directory where generated audio files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 67, "class_name": "calibrate_qa_mapper", "class_desc": "Calibrates question-answer pairs based on reference text using an API model.\n\n This operator uses a specified API model to calibrate question-answer pairs, making them\n more detailed and accurate. It constructs the input prompt by combining the reference\n text and the question-answer pair, then sends it to the API for calibration. The output\n is parsed to extract the calibrated question and answer. The operator retries the API\n call and parsing up to a specified number of times in case of errors. The default system\n prompt, input templates, and output pattern can be customized. 
The operator supports\n additional parameters for model initialization and sampling.", "arguments": " api_model (): API model name.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the calibration task.\n input_template (typing.Optional[str]): Template for building the model input.\n reference_template (typing.Optional[str]): Template for formatting the reference text.\n qa_pair_template (typing.Optional[str]): Template for formatting question-answer pairs.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 68, "class_name": "calibrate_query_mapper", "class_desc": "Calibrate query in question-answer pairs based on reference text.\n\n This operator adjusts the query (question) in a question-answer pair to be more detailed\n and accurate, while ensuring it can still be answered by the original answer. It uses a\n reference text to inform the calibration process. The calibration is guided by a system\n prompt, which instructs the model to refine the question without adding extraneous\n information. 
The output is parsed to extract the calibrated query, with any additional\n content removed.", "arguments": " api_model (): API model name.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the calibration task.\n input_template (typing.Optional[str]): Template for building the model input.\n reference_template (typing.Optional[str]): Template for formatting the reference text.\n qa_pair_template (typing.Optional[str]): Template for formatting question-answer pairs.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 69, "class_name": "calibrate_response_mapper", "class_desc": "Calibrate response in question-answer pairs based on reference text.\n\n This mapper calibrates the 'response' part of a question-answer pair by using a\n reference text. It aims to make the response more detailed and accurate while ensuring\n it still answers the original question. The calibration process uses a default system\n prompt, which can be customized. 
The output is stripped of any leading or trailing\n whitespace.", "arguments": " api_model (): API model name.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the calibration task.\n input_template (typing.Optional[str]): Template for building the model input.\n reference_template (typing.Optional[str]): Template for formatting the reference text.\n qa_pair_template (typing.Optional[str]): Template for formatting question-answer pairs.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 70, "class_name": "chinese_convert_mapper", "class_desc": "Mapper to convert Chinese text between Traditional, Simplified, and Japanese Kanji.\n\n This operator converts Chinese text based on the specified mode. It supports conversions\n between Simplified Chinese, Traditional Chinese (including Taiwan and Hong Kong\n variants), and Japanese Kanji. The conversion is performed using a pre-defined set of\n rules. The available modes include 's2t' for Simplified to Traditional, 't2s' for\n Traditional to Simplified, and other specific variants like 's2tw', 'tw2s', 's2hk',\n 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', and 'jp2t'. 
The\n operator processes text in batches and applies the conversion to the specified text key\n in the samples.", "arguments": " mode (): Choose the mode to convert Chinese:\n\n s2t: Simplified Chinese to Traditional Chinese,\n\n t2s: Traditional Chinese to Simplified Chinese,\n\n s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),\n\n tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,\n\n s2hk: Simplified Chinese to Traditional Chinese\n (Hong Kong variant),\n\n hk2s: Traditional Chinese (Hong Kong variant) to Simplified\n Chinese,\n\n s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)\n with Taiwanese idiom,\n\n tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese\n with Mainland Chinese idiom,\n\n t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),\n\n tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,\n\n hk2t: Traditional Chinese (Hong Kong variant) to Traditional\n Chinese,\n\n t2hk: Traditional Chinese to Traditional Chinese\n (Hong Kong variant),\n\n t2jp: Traditional Chinese Characters (Ky\u016bjitai) to New Japanese\n Kanji,\n\n jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese\n Characters,\n"}, {"index": 71, "class_name": "clean_copyright_mapper", "class_desc": "Cleans copyright comments at the beginning of text samples.\n\n This operator removes copyright comments from the start of text samples. It identifies\n and strips multiline comments that contain the word \"copyright\" using a regular\n expression. It also greedily removes lines starting with comment markers like `//`, `#`,\n or `--` at the beginning of the text, as these are often part of copyright headers. 
The\n operator processes each sample individually but can handle batches for efficiency.", "arguments": ""}, {"index": 72, "class_name": "clean_email_mapper", "class_desc": "Cleans email addresses from text samples using a regular expression.\n\n This operator removes or replaces email addresses in the text based on a regular\n expression pattern. By default, it uses a standard pattern to match email addresses, but\n a custom pattern can be provided. The matched email addresses are replaced with a\n specified replacement string, which defaults to an empty string. The operation is\n applied to each text sample in the batch. If no email address is found in a sample, it\n remains unchanged.", "arguments": " pattern (typing.Optional[str]): regular expression pattern to search for within text.\n repl (): replacement string, default is empty string.\n"}, {"index": 73, "class_name": "clean_html_mapper", "class_desc": "Cleans HTML code from text samples, converting HTML to plain text.\n\n This operator processes text samples by removing HTML tags and converting HTML elements\n to a more readable format. Specifically, it replaces `
  • ` and `
      ` tags with newline\n and bullet points. The Selectolax HTML parser is used to extract the text content from\n the HTML. This operation is performed in a batched manner, making it efficient for large\n datasets.", "arguments": ""}, {"index": 74, "class_name": "clean_ip_mapper", "class_desc": "Cleans IPv4 and IPv6 addresses from text samples.\n\n This operator removes or replaces IPv4 and IPv6 addresses in the text. It uses a regular\n expression to identify and clean the IP addresses. By default, it replaces the IP\n addresses with an empty string, effectively removing them. The operator can be\n configured with a custom pattern and replacement string. If no pattern is provided, a\n default pattern for both IPv4 and IPv6 addresses is used. The operator processes samples\n in batches.\n\n - Uses a regular expression to find and clean IP addresses.\n - Replaces found IP addresses with a specified replacement string.\n - Default replacement string is an empty string, which removes the IP addresses.\n - Can use a custom regular expression pattern if provided.\n - Processes samples in batches for efficiency.", "arguments": " pattern (typing.Optional[str]): regular expression pattern to search for within text.\n repl (): replacement string, default is empty string.\n"}, {"index": 75, "class_name": "clean_links_mapper", "class_desc": "Mapper to clean links like http/https/ftp in text samples.\n\n This operator removes or replaces URLs and other web links in the text. It uses a\n regular expression pattern to identify and remove links. By default, it replaces the\n identified links with an empty string, effectively removing them. The operator can be\n customized with a different pattern and replacement string. It processes samples in\n batches and modifies the text in place. 
If no links are found in a sample, it is left\n unchanged.", "arguments": " pattern (typing.Optional[str]): regular expression pattern to search for within text.\n repl (): replacement string, default is empty string.\n"}, {"index": 76, "class_name": "dialog_intent_detection_mapper", "class_desc": "Generates user's intent labels in a dialog by analyzing the history, query, and\n response.\n\n This operator processes a dialog to identify and label the user's intent. It uses a\n predefined system prompt and templates to build input prompts for an API call. The API\n model (e.g., GPT-4) is used to analyze the dialog and generate intent labels and\n analysis. The results are stored in the meta field under 'dialog_intent_labels' and\n 'dialog_intent_labels_analysis'. The operator supports customizing the system prompt,\n templates, and patterns for parsing the API response. If the intent candidates are\n provided, they are included in the input prompt. The operator retries the API call up to\n a specified number of times if there are errors.", "arguments": " api_model (): API model name.\n intent_candidates (typing.Optional[typing.List[str]]): The output intent candidates. Use the\n intent labels of the open domain if it is None.\n max_round (typing.Annotated[int, Ge(ge=0)]): The max num of round in the dialog to build the\n prompt.\n labels_key (): The key name in the meta field to store the\n output labels. It is 'dialog_intent_labels' in default.\n analysis_key (): The key name in the meta field to store the\n corresponding analysis. 
It is 'dialog_intent_labels_analysis'\n in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n query_template (typing.Optional[str]): Template for query part to build the input\n prompt.\n response_template (typing.Optional[str]): Template for response part to build the\n input prompt.\n candidate_template (typing.Optional[str]): Template for intent candidates to\n build the input prompt.\n analysis_template (typing.Optional[str]): Template for analysis part to build the\n input prompt.\n labels_template (typing.Optional[str]): Template for labels to build the\n input prompt.\n analysis_pattern (typing.Optional[str]): Pattern to parse the return intent\n analysis.\n labels_pattern (typing.Optional[str]): Pattern to parse the return intent\n labels.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 77, "class_name": "dialog_sentiment_detection_mapper", "class_desc": "Generates sentiment labels and analysis for user queries in a dialog.\n\n This operator processes a dialog to detect and label the sentiments expressed by the\n user. It uses the provided history, query, and response keys to construct prompts for an\n API call. The API returns sentiment analysis and labels, which are then parsed and\n stored in the sample's metadata under the 'dialog_sentiment_labels' and\n 'dialog_sentiment_labels_analysis' keys. The operator supports custom templates and\n patterns for prompt construction and output parsing. 
If no sentiment candidates are\n provided, it uses open-domain sentiment labels. The operator retries the API call up to\n a specified number of times in case of errors.", "arguments": " api_model (): API model name.\n sentiment_candidates (typing.Optional[typing.List[str]]): The output sentiment candidates. Use\n open-domain sentiment labels if it is None.\n max_round (typing.Annotated[int, Ge(ge=0)]): The max num of round in the dialog to build the\n prompt.\n labels_key (): The key name in the meta field to store the\n output labels. It is 'dialog_sentiment_labels' in default.\n analysis_key (): The key name in the meta field to store the\n corresponding analysis. It is\n 'dialog_sentiment_labels_analysis' in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n query_template (typing.Optional[str]): Template for query part to build the input\n prompt.\n response_template (typing.Optional[str]): Template for response part to build the\n input prompt.\n candidate_template (typing.Optional[str]): Template for sentiment candidates to\n build the input prompt.\n analysis_template (typing.Optional[str]): Template for analysis part to build the\n input prompt.\n labels_template (typing.Optional[str]): Template for labels part to build the\n input prompt.\n analysis_pattern (typing.Optional[str]): Pattern to parse the return sentiment\n analysis.\n labels_pattern (typing.Optional[str]): Pattern to parse the return sentiment\n labels.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 
0.95}\n"}, {"index": 78, "class_name": "dialog_sentiment_intensity_mapper", "class_desc": "Mapper to predict user's sentiment intensity in a dialog, ranging from -5 to 5.\n\n This operator analyzes the sentiment of user queries in a dialog and outputs a list of\n sentiment intensities and corresponding analyses. The sentiment intensity ranges from -5\n (extremely negative) to 5 (extremely positive), with 0 indicating a neutral sentiment.\n The analysis is based on the provided history, query, and response keys. The default\n system prompt and templates guide the sentiment analysis process. The results are stored\n in the meta field under 'dialog_sentiment_intensity' for intensities and\n 'dialog_sentiment_intensity_analysis' for analyses. The operator uses an API model to\n generate the sentiment analysis, with configurable retry attempts and sampling\n parameters.", "arguments": " api_model (): API model name.\n max_round (typing.Annotated[int, Ge(ge=0)]): The max num of round in the dialog to build the\n prompt.\n intensities_key (): The key name in the meta field to store\n the output sentiment intensities. It is\n 'dialog_sentiment_intensity' in default.\n analysis_key (): The key name in the meta field to store the\n corresponding analysis. 
It is\n 'dialog_sentiment_intensity_analysis' in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n query_template (typing.Optional[str]): Template for query part to build the input\n prompt.\n response_template (typing.Optional[str]): Template for response part to build the\n input prompt.\n analysis_template (typing.Optional[str]): Template for analysis part to build the\n input prompt.\n intensity_template (typing.Optional[str]): Template for intensity part to build the\n input prompt.\n analysis_pattern (typing.Optional[str]): Pattern to parse the return sentiment\n analysis.\n intensity_pattern (typing.Optional[str]): Pattern to parse the return sentiment\n intensity.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 79, "class_name": "dialog_topic_detection_mapper", "class_desc": "Generates user's topic labels and analysis in a dialog.\n\n This operator processes a dialog to detect and label the topics discussed by the user.\n It takes input from `history_key`, `query_key`, and `response_key` and outputs lists of\n labels and analysis for each query in the dialog. The operator uses a predefined system\n prompt and templates to build the input prompt for the API call. It supports customizing\n the system prompt, templates, and patterns for parsing the API response. The results are\n stored in the `meta` field under the keys specified by `labels_key` and `analysis_key`.\n If these keys already exist in the `meta` field, the operator skips processing. 
The\n operator retries the API call up to `try_num` times in case of errors.", "arguments": " api_model (): API model name.\n topic_candidates (typing.Optional[typing.List[str]]): The output topic candidates. Use\n open-domain topic labels if it is None.\n max_round (typing.Annotated[int, Ge(ge=0)]): The max num of round in the dialog to build the\n prompt.\n labels_key (): The key name in the meta field to store the\n output labels. It is 'dialog_topic_labels' in default.\n analysis_key (): The key name in the meta field to store the\n corresponding analysis. It is 'dialog_topic_labels_analysis'\n in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n query_template (typing.Optional[str]): Template for query part to build the input\n prompt.\n response_template (typing.Optional[str]): Template for response part to build the\n input prompt.\n candidate_template (typing.Optional[str]): Template for topic candidates to\n build the input prompt.\n analysis_template (typing.Optional[str]): Template for analysis part to build the\n input prompt.\n labels_template (typing.Optional[str]): Template for labels part to build the\n input prompt.\n analysis_pattern (typing.Optional[str]): Pattern to parse the return topic\n analysis.\n labels_pattern (typing.Optional[str]): Pattern to parse the return topic\n labels.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 80, "class_name": "download_file_mapper", "class_desc": "Mapper to download URL files to local files or load them 
into memory.\n\n This operator downloads files from URLs and can either save them to a specified\n directory or load the contents directly into memory. It supports downloading multiple\n files concurrently and can resume downloads if the `resume_download` flag is set. The\n operator processes nested lists of URLs, flattening them for batch processing and then\n reconstructing the original structure in the output. If both `save_dir` and `save_field`\n are not specified, it defaults to saving the content under the key `image_bytes`. The\n operator logs any failed download attempts and provides error messages for\n troubleshooting.", "arguments": " download_field (): The filed name to get the url to download.\n save_dir (): The directory to save downloaded files.\n save_field (): The filed name to save the downloaded file content.\n resume_download (): Whether to resume download. if True, skip the sample if it exists.\n timeout (): Timeout for download.\n max_concurrent (): Maximum concurrent downloads.\n"}, {"index": 81, "class_name": "expand_macro_mapper", "class_desc": "Expands macro definitions in the document body of LaTeX samples.\n\n This operator processes LaTeX documents to expand user-defined macros in the text. It\n supports \\newcommand and \\def macros without arguments. Macros are identified and\n expanded in the text, ensuring they are not part of longer alphanumeric words. The\n operator currently does not support macros with arguments. The processed text is updated\n in the samples.", "arguments": ""}, {"index": 82, "class_name": "extract_entity_attribute_mapper", "class_desc": "Extracts attributes for given entities from the text and stores them in the sample's\n metadata.\n\n This operator uses an API model to extract specified attributes for given entities from\n the input text. It constructs prompts based on provided templates and parses the model's\n output to extract attribute descriptions and supporting text. 
The extracted data is\n stored in the sample's metadata under the specified keys. If the required metadata\n fields already exist, the operator skips processing for that sample. The operator\n retries the API call and parsing up to a specified number of times in case of errors.\n The default system prompt, input template, and parsing patterns are used if not\n provided.", "arguments": " api_model (): API model name.\n query_entities (typing.List[str]): Entity list to be queried.\n query_attributes (typing.List[str]): Attribute list to be queried.\n entity_key (): The key name in the meta field to store the\n given main entity for attribute extraction. It's \"entity\" in\n default.\n attribute_key ()\n attribute_desc_key (): The key name in the meta field to store\n the extracted attribute description. It's\n \"attribute_description\" in default.\n support_text_key (): The key name in the meta field to store\n the attribute support text extracted from the raw text.\n It's \"support_text\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt_template (typing.Optional[str]): System prompt template for the\n task. Need to be specified by given entity and attribute.\n input_template (typing.Optional[str]): Template for building the model input.\n attr_pattern_template (typing.Optional[str]): Pattern for parsing the attribute from\n output. 
Need to be specified by given attribute.\n demo_pattern (typing.Optional[str])\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 83, "class_name": "extract_entity_relation_mapper", "class_desc": "Extracts entities and relations from text to build a knowledge graph.\n\n - Identifies entities based on specified types and extracts their names, types, and\n descriptions.\n - Identifies relationships between the entities, including source and target entities,\n relationship descriptions, keywords, and strength scores.\n - Uses a Hugging Face tokenizer and a predefined prompt template to guide the extraction\n process.\n - Outputs entities and relations in a structured format, using delimiters for\n separation.\n - Caches the results in the sample's metadata under the keys 'entity' and 'relation'.\n - Supports multiple retries and gleaning to ensure comprehensive extraction.\n - The default entity types include 'organization', 'person', 'geo', and 'event'.", "arguments": " api_model (): API model name.\n entity_types (typing.List[str]): Pre-defined entity types for knowledge graph.\n entity_key (): The key name to store the entities in the meta\n field. It's \"entity\" in default.\n relation_key (): The field name to store the relations between\n entities. 
It's \"relation\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n prompt_template (typing.Optional[str]): The template of input prompt.\n tuple_delimiter (typing.Optional[str]): Delimiter to separate items in outputs.\n record_delimiter (typing.Optional[str]): Delimiter to separate records in outputs.\n completion_delimiter (typing.Optional[str]): To mark the end of the output.\n max_gleaning (typing.Annotated[int, Ge(ge=0)]): the extra max num to call LLM to glean entities\n and relations.\n continue_prompt (typing.Optional[str]): the prompt for gleaning entities and\n relations.\n if_loop_prompt (typing.Optional[str]): the prompt to determine whether to stop\n gleaning.\n entity_pattern (typing.Optional[str]): Regular expression for parsing entity record.\n relation_pattern (typing.Optional[str]): Regular expression for parsing relation\n record.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 84, "class_name": "extract_event_mapper", "class_desc": "Extracts events and relevant characters from the text.\n\n This operator uses an API model to summarize the text into multiple events and extract\n the relevant characters for each event. The summary and character extraction follow a\n predefined format. The operator retries the API call up to a specified number of times\n if there is an error. The extracted events and characters are stored in the meta field\n of the samples. If no events are found, the original samples are returned. 
The operator\n can optionally drop the original text after processing.", "arguments": " api_model (): API model name.\n event_desc_key (): The key name to store the event descriptions\n in the meta field. It's \"event_description\" in default.\n relevant_char_key (): The field name to store the relevant\n characters to the events in the meta field. It's\n \"relevant_characters\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 85, "class_name": "extract_keyword_mapper", "class_desc": "Generate keywords for the text.\n\n This operator uses a specified API model to generate high-level keywords that summarize\n the main concepts, themes, or topics of the input text. The generated keywords are\n stored in the meta field under the key specified by `keyword_key`. The operator retries\n the API call up to `try_num` times in case of errors. If `drop_text` is set to True, the\n original text is removed from the sample after processing. The operator uses a default\n prompt template and completion delimiter, which can be customized. 
The output is parsed\n using a regular expression to extract the keywords.", "arguments": " api_model (): API model name.\n keyword_key (): The key name to store the keywords in the meta\n field. It's \"keyword\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n prompt_template (typing.Optional[str]): The template of input prompt.\n completion_delimiter (typing.Optional[str]): To mark the end of the output.\n output_pattern (typing.Optional[str]): Regular expression for parsing keywords.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 86, "class_name": "extract_nickname_mapper", "class_desc": "Extracts nickname relationships in the text using a language model.\n\n This operator uses a language model to identify and extract nickname relationships from\n the input text. It follows specific instructions to ensure accurate extraction, such as\n identifying the speaker, the person being addressed, and the nickname used. The\n extracted relationships are stored in the meta field under the specified key. The\n operator uses a default system prompt, input template, and output pattern, but these can\n be customized. The results are parsed and validated to ensure they meet the required\n format. If the text already contains the nickname information, it is not processed\n again. 
The operator retries the API call a specified number of times if an error occurs.", "arguments": " api_model (): API model name.\n nickname_key (): The key name to store the nickname\n relationship in the meta field. It's \"nickname\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 87, "class_name": "extract_support_text_mapper", "class_desc": "Extracts a supporting sub-text from the original text based on a given summary.\n\n This operator uses an API model to identify and extract a segment of the original text\n that best matches the provided summary. It leverages a system prompt and input template\n to guide the extraction process. The extracted support text is stored in the specified\n meta field key. If the extraction fails or returns an empty string, the original summary\n is used as a fallback. The operator retries the extraction up to a specified number of\n times in case of errors.", "arguments": " api_model (): API model name.\n summary_key (): The key name to store the input summary in the\n meta field. It's \"event_description\" in default.\n support_text_key (): The key name to store the output\n support text for the summary in the meta field. 
It's\n \"support_text\" in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 88, "class_name": "extract_tables_from_html_mapper", "class_desc": "Extracts tables from HTML content and stores them in a specified field.\n\n This operator processes HTML content to extract tables. It can either retain or remove\n HTML tags based on the `retain_html_tags` parameter. If `retain_html_tags` is False, it\n can also include or exclude table headers based on the `include_header` parameter. The\n extracted tables are stored in the `tables_field_name` field within the sample's\n metadata. If no tables are found, an empty list is stored. 
If the tables have already\n been extracted, the operator will not reprocess the sample.", "arguments": " tables_field_name (): Field name to store the extracted tables.\n retain_html_tags (): If True, retains HTML tags in the tables;\n otherwise, removes them.\n include_header (): If True, includes the table header;\n otherwise, excludes it.\n This parameter is effective\n only when `retain_html_tags` is False\n and applies solely to the extracted table content.\n"}, {"index": 89, "class_name": "fix_unicode_mapper", "class_desc": "Fixes unicode errors in text samples.\n\n This operator corrects common unicode errors and normalizes the text to a specified\n Unicode normalization form. The default normalization form is 'NFC', but it can be set\n to 'NFKC', 'NFD', or 'NFKD' during initialization. It processes text samples in batches,\n applying the specified normalization to each sample. If an unsupported normalization\n form is provided, a ValueError is raised.", "arguments": " normalization (): the specified form of Unicode\n normalization mode, which can be one of\n ['NFC', 'NFKC', 'NFD', and 'NFKD'], default 'NFC'.\n"}, {"index": 90, "class_name": "generate_qa_from_examples_mapper", "class_desc": "Generates question and answer pairs from examples using a Hugging Face model.\n\n This operator generates QA pairs based on provided seed examples. The number of\n generated samples is determined by the length of the empty dataset configured in the\n YAML file. The operator uses a Hugging Face model to generate new QA pairs, which are\n then filtered based on their similarity to the seed examples. Samples with a similarity\n score below the specified threshold are kept. The similarity is computed using the\n ROUGE-L metric. The operator requires a seed file in chatml format, which provides the\n initial QA examples. 
The generated QA pairs must follow specific formatting rules, such\n as maintaining the same format as the input examples and ensuring that questions and\n answers are paired correctly.", "arguments": " hf_model (): Huggingface model ID.\n seed_file (): Path to the seed file in chatml format.\n example_num (typing.Annotated[int, Gt(gt=0)]): The number of selected examples.\n Randomly select N examples from \"seed_file\" and\n put them into prompt as QA examples.\n similarity_threshold (): The similarity score threshold\n between the generated samples and the seed examples.\n Range from 0 to 1. Samples with similarity score less than\n this threshold will be kept.\n system_prompt (typing.Optional[str]): System prompt for guiding the generation task.\n input_template (typing.Optional[str]): Template for building the input prompt. It must\n include one placeholder '{}', which will be replaced by\n `example_num` formatted examples defined by `example_template`.\n example_template (typing.Optional[str]): Template for formatting one QA example. It\n must include one placeholder '{}', which will be replaced by one\n formatted qa_pair.\n qa_pair_template (typing.Optional[str]): Template for formatting a single QA pair\n within each example. Must include two placeholders '{}' for the\n question and answer.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract questions\n and answers from model response.\n enable_vllm (): Whether to use vllm for inference acceleration.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 91, "class_name": "generate_qa_from_text_mapper", "class_desc": "Generates question and answer pairs from text using a specified model.\n\n This operator uses a Hugging Face model to generate QA pairs from the input text. 
It\n supports both Hugging Face and vLLM models for inference. The recommended models, such\n as 'alibaba-pai/pai-llama3-8b-doc2qa', are trained on Chinese data and are suitable for\n Chinese text. The operator can limit the number of generated QA pairs per text and\n allows custom output patterns for parsing the model's response. By default, it uses a\n regular expression to extract questions and answers from the model's output. If no QA\n pairs are extracted, a warning is logged.", "arguments": " hf_model (): Huggingface model ID.\n max_num (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The max num of returned QA sample for each text.\n Not limit if it is None.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract\n questions and answers from model response.\n enable_vllm (): Whether to use vllm for inference acceleration.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation,\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 92, "class_name": "image_blur_mapper", "class_desc": "Blurs images in the dataset with a specified probability and blur type.\n\n This operator blurs images using one of three types: mean, box, or Gaussian. The\n probability of an image being blurred is controlled by the `p` parameter. The blur\n effect is applied using a kernel with a specified radius. Blurred images are saved to a\n directory, which can be specified or defaults to the input directory. If the save\n directory is not provided, the `DJ_PRODUCED_DATA_DIR` environment variable can be used\n to set it. 
The operator ensures that the blur type is one of the supported options and\n that the radius is non-negative.", "arguments": " p (): Probability of the image being blurred.\n blur_type (): Type of blur kernel, including\n ['mean', 'box', 'gaussian'].\n radius (): Radius of blur kernel.\n save_dir (): The directory where generated image files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 93, "class_name": "image_captioning_from_gpt4v_mapper", "class_desc": "Generates text captions for images using the GPT-4 Vision model.\n\n This operator generates text based on the provided images and specified parameters. It\n supports different modes of text generation, including 'reasoning', 'description',\n 'conversation', and 'custom'. The generated text can be added to the original sample or\n replace it, depending on the `keep_original_sample` parameter. The operator uses a\n Hugging Face tokenizer and the GPT-4 Vision API to generate the text. The `any_or_all`\n parameter determines whether all or any of the images in a sample must meet the\n generation criteria for the sample to be kept. If `user_prompt_key` is set, it will use\n the prompt from the sample; otherwise, it will use the `user_prompt` parameter. If both\n are set, `user_prompt_key` takes precedence.", "arguments": " mode (): mode of text generated from images, can be one of\n ['reasoning', 'description', 'conversation', 'custom']\n api_key (): the API key to authenticate the request.\n max_token (): the maximum number of tokens to generate.\n Default is 500.\n temperature (typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]): controls the randomness of the output (range\n from 0 to 1). 
Default is 0.\n system_prompt (): a string prompt used to set the context of a\n conversation and provide global guidance or rules for the\n gpt4-vision so that it can generate responses in the expected way.\n If `mode` set to `custom`, the parameter will be used.\n user_prompt (): a string prompt to guide the generation of\n gpt4-vision for each samples. It's \"\" in default, which means no\n prompt provided.\n user_prompt_key (typing.Optional[str]): the key name of fields in samples to store\n prompts for each sample. It's used for set different prompts for\n different samples. If it's none, use prompt in parameter \"prompt\".\n It's None in default.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated text in the\n final datasets and the original text will be removed. It's True\n in default.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all images. 'any': keep this sample if any images meet the\n condition. 'all': keep this sample only if all images meet the\n condition.\n"}, {"index": 94, "class_name": "image_captioning_mapper", "class_desc": "Generates image captions using a Hugging Face model and appends them to samples.\n\n This operator generates captions for images in the input samples using a specified\n Hugging Face model. It can generate multiple captions per image and apply different\n strategies to retain the generated captions. The operator supports three retention\n modes: 'random_any', 'similar_one_simhash', and 'all'. In 'random_any' mode, a random\n caption is retained. In 'similar_one_simhash' mode, the most similar caption to the\n original text (based on SimHash) is retained. In 'all' mode, all generated captions are\n concatenated and retained. The operator can also keep or discard the original sample\n based on the `keep_original_sample` parameter. 
If both `prompt` and `prompt_key` are\n set, the `prompt_key` takes precedence.", "arguments": " hf_img2seq (): model name on huggingface to generate caption\n trust_remote_code ()\n caption_num (typing.Annotated[int, Gt(gt=0)]): how many candidate captions to generate\n for each image\n keep_candidate_mode (): retain strategy for the generated\n $caption_num$ candidates.\n\n 'random_any': Retain the random one from generated captions\n\n 'similar_one_simhash': Retain the generated one that is most\n similar to the original caption\n\n 'all': Retain all generated captions by concatenation\n\n Note:\n This is a batched_OP, whose input and output type are\n both list. Suppose there are $N$ list of input samples, whose batch\n size is $b$, and denote caption_num as $M$.\n The number of total samples after generation is $2Nb$ when\n keep_original_sample is True and $Nb$ when keep_original_sample is\n False. For 'random_any' and 'similar_one_simhash' mode,\n it's $(1+M)Nb$ for 'all' mode when keep_original_sample is True\n and $MNb$ when keep_original_sample is False.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated captions in the\n final datasets and the original captions will be removed. It's True\n in default.\n prompt (typing.Optional[str]): a string prompt to guide the generation of blip2 model\n for all samples globally. It's None in default, which means no\n prompt provided.\n prompt_key (typing.Optional[str]): the key name of fields in samples to store prompts\n for each sample. It's used for set different prompts for different\n samples. If it's none, use prompt in parameter \"prompt\". It's None\n in default.\n"}, {"index": 95, "class_name": "image_detection_yolo_mapper", "class_desc": "Perform object detection using YOLO on images and return bounding boxes and class\n labels.\n\n This operator uses a YOLO model to detect objects in images. 
It processes each image in\n the sample, returning the bounding boxes and class labels for detected objects. The\n operator sets the `bbox_tag` and `class_label_tag` fields in the sample's metadata. If\n no image is present or no objects are detected, it sets `bbox_tag` to an empty array and\n `class_label_tag` to -1. The operator uses a confidence score threshold and IoU\n (Intersection over Union) score threshold to filter detections.", "arguments": " imgsz (): resolution for image resizing\n conf (): confidence score threshold\n iou (): IoU (Intersection over Union) score threshold\n model_path (): the path to the YOLO model.\n"}, {"index": 96, "class_name": "image_diffusion_mapper", "class_desc": "Generate images using a diffusion model based on provided captions.\n\n This operator uses a Hugging Face diffusion model to generate images from given\n captions. It supports different modes for retaining generated samples, including random\n selection, similarity-based selection, and retaining all. The operator can also generate\n captions if none are provided, using a Hugging Face image-to-sequence model. The\n strength parameter controls the extent of transformation from the reference image, and\n the guidance scale influences how closely the generated images match the text prompt.\n Generated images can be saved in a specified directory or the same directory as the\n input files. This is a batched operation, processing multiple samples at once and\n producing a specified number of augmented images per sample.", "arguments": " hf_diffusion (): diffusion model name on huggingface to generate\n the image.\n trust_remote_code ()\n torch_dtype (): the floating point type used to load the diffusion\n model. Can be one of ['fp32', 'fp16', 'bf16']\n revision (): The specific model version to use. 
It can be a\n branch name, a tag name, a commit id, or any identifier allowed\n by Git.\n strength (typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]): Indicates extent to transform the reference image.\n Must be between 0 and 1. image is used as a starting point and\n more noise is added the higher the strength. The number of\n denoising steps depends on the amount of noise initially added.\n When strength is 1, added noise is maximum and the denoising\n process runs for the full number of iterations specified in\n num_inference_steps. A value of 1 essentially ignores image.\n guidance_scale (): A higher guidance scale value encourages the\n model to generate images closely linked to the text prompt at the\n expense of lower image quality. Guidance scale is enabled when\n guidance_scale > 1.\n aug_num (typing.Annotated[int, Gt(gt=0)]): The image number to be produced by stable-diffusion\n model.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated captions in the\n final datasets and the original captions will be removed. It's True\n by default.\n caption_key (typing.Optional[str]): the key name of fields in samples to store captions\n for each images. It can be a string if there is only one image in\n each sample. Otherwise, it should be a list. 
If it's none,\n ImageDiffusionMapper will produce captions for each images.\n hf_img2seq (): model name on huggingface to generate caption if\n caption_key is None.\n save_dir (): The directory where generated image files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 97, "class_name": "image_face_blur_mapper", "class_desc": "Mapper to blur faces detected in images.\n\n This operator uses an OpenCV classifier to detect faces in images and applies a\n specified blur type to the detected face regions. The blur types supported are 'mean',\n 'box', and 'gaussian'. The radius of the blur kernel can be adjusted. If no save\n directory is provided, the modified images will be saved in the same directory as the\n input files.", "arguments": " cv_classifier (): OpenCV classifier path for face detection.\n By default, we will use 'haarcascade_frontalface_alt.xml'.\n blur_type (): Type of blur kernel, including\n ['mean', 'box', 'gaussian'].\n radius (typing.Annotated[float, Ge(ge=0)]): Radius of blur kernel.\n save_dir (): The directory where generated image files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 98, "class_name": "image_remove_background_mapper", "class_desc": "Mapper to remove the background of images.\n\n This operator processes each image in the sample, removing its background. It uses the\n `rembg` library to perform the background removal. If `alpha_matting` is enabled, it\n applies alpha matting with specified thresholds and erosion size. The resulting images\n are saved in PNG format. The `bgcolor` parameter can be set to specify a custom\n background color for the cutout image. 
The processed images are stored in the directory\n specified by `save_dir`, or in the same directory as the input files if `save_dir` is\n not provided. The `source_file` field in the sample is updated to reflect the new file\n paths.", "arguments": " alpha_matting (): (bool, optional)\n Flag indicating whether to use alpha matting. Defaults to False.\n alpha_matting_foreground_threshold (): (int, optional)\n Foreground threshold for alpha matting. Defaults to 240.\n alpha_matting_background_threshold (): (int, optional)\n Background threshold for alpha matting. Defaults to 10.\n alpha_matting_erode_size (): (int, optional)\n Erosion size for alpha matting. Defaults to 10.\n bgcolor (typing.Optional[typing.Tuple[int, int, int, int]]): (Optional[Tuple[int, int, int, int]], optional)\n Background color for the cutout image. Defaults to None.\n save_dir (): The directory where generated image files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n *args (Optional[Any]): Additional positional arguments.\n **kwargs (Optional[Any]): Additional keyword arguments.\n"}, {"index": 99, "class_name": "image_segment_mapper", "class_desc": "Perform segment-anything on images and return the bounding boxes.\n\n This operator uses a FastSAM model to detect and segment objects in images, returning\n their bounding boxes. It processes each image in the sample, and stores the bounding\n boxes in the 'bbox_tag' field under the 'meta' key. If no images are present in the\n sample, an empty array is stored instead. The operator allows setting the image\n resolution, confidence threshold, and IoU (Intersection over Union) score threshold for\n the segmentation process. 
Bounding boxes are represented as N x M x 4 arrays, where N is\n the number of images, M is the number of detected boxes, and 4 represents the\n coordinates.", "arguments": " imgsz (): resolution for image resizing\n conf (): confidence score threshold\n iou (): IoU (Intersection over Union) score threshold\n model_path (): the path to the FastSAM model. Model name should be\n one of ['FastSAM-x.pt', 'FastSAM-s.pt'].\n"}, {"index": 100, "class_name": "image_tagging_mapper", "class_desc": "Generates image tags for each image in the sample.\n\n This operator processes images to generate descriptive tags. It uses a Hugging Face\n model to analyze the images and produce relevant tags. The tags are stored in the\n specified field, defaulting to 'image_tags'. If the tags are already present in the\n sample, the operator will not recompute them. For samples without images, an empty tag\n array is assigned. The generated tags are sorted by frequency and stored as a list of\n strings.", "arguments": " tag_field_name (): the field name to store the tags. It's\n \"image_tags\" in default.\n"}, {"index": 101, "class_name": "imgdiff_difference_area_generator_mapper", "class_desc": "Generates and filters bounding boxes for image pairs based on similarity, segmentation,\n and text matching.\n\n This operator processes image pairs to identify and filter regions with significant\n differences. 
It uses a sequence of operations:\n - Filters out image pairs with large differences.\n - Segments the images to identify potential objects.\n - Crops sub-images based on bounding boxes.\n - Determines if the sub-images contain valid objects using image-text matching.\n - Filters out sub-images that are too similar.\n - Removes overlapping bounding boxes.\n - Uses Hugging Face models for similarity and text matching, and FastSAM for\n segmentation.\n - Caches intermediate results in `DATA_JUICER_ASSETS_CACHE`.\n - Returns the filtered bounding boxes in the `MetaKeys.bbox_tag` field.", "arguments": " image_pair_similarity_filter_args (typing.Optional[typing.Dict]): Arguments for image pair similarity filter.\n Controls the similarity filtering between image pairs. Default empty dict will use\n fixed values: min_score_1=0.1, max_score_1=1.0, min_score_2=0.1, max_score_2=1.0,\n hf_clip=\"openai/clip-vit-base-patch32\", num_proc=1.\n image_segment_mapper_args (typing.Optional[typing.Dict]): Arguments for image segmentation mapper.\n Controls the image segmentation process. Default empty dict will use\n fixed values: imgsz=1024, conf=0.05, iou=0.5, model_path=\"FastSAM-x.pt\".\n image_text_matching_filter_args (typing.Optional[typing.Dict]): Arguments for image-text matching filter.\n Controls the matching between cropped image regions and text descriptions.\n Default empty dict will use fixed values: min_score=0.1, max_score=1.0,\n hf_blip=\"Salesforce/blip-itm-base-coco\", num_proc=1.\n"}, {"index": 102, "class_name": "imgdiff_difference_caption_generator_mapper", "class_desc": "Generates difference captions for bounding box regions in two images.\n\n This operator processes pairs of images and generates captions for the differences in\n their bounding box regions. 
It uses a multi-step process:\n - Describes the content of each bounding box region using a Hugging Face model.\n - Crops the bounding box regions from both images.\n - Checks if the cropped regions match the generated captions.\n - Determines if there are differences between the two captions.\n - Marks the difference area with a red box.\n - Generates difference captions for the marked areas.\n - The key metric is the similarity score between the captions, computed using a CLIP\n model.\n - If no valid bounding boxes or differences are found, it returns empty captions and\n zeroed bounding boxes.\n - Uses 'cuda' as the accelerator if any of the fused operations support it.\n - Caches temporary images during processing and clears them afterward.", "arguments": " mllm_mapper_args (typing.Optional[typing.Dict]): Arguments for multimodal language model mapper.\n Controls the generation of captions for bounding box regions. Default empty dict\n will use fixed values: max_new_tokens=256, temperature=0.2, top_p=None,\n num_beams=1, hf_model=\"llava-hf/llava-v1.6-vicuna-7b-hf\".\n image_text_matching_filter_args (typing.Optional[typing.Dict]): Arguments for image-text matching filter.\n Controls the matching between cropped regions and generated captions.\n Default empty dict will use fixed values: min_score=0.1, max_score=1.0,\n hf_blip=\"Salesforce/blip-itm-base-coco\", num_proc=1.\n text_pair_similarity_filter_args (typing.Optional[typing.Dict]): Arguments for text pair similarity filter.\n Controls the similarity comparison between caption pairs. Default empty dict\n will use fixed values: min_score=0.1, max_score=1.0,\n hf_clip=\"openai/clip-vit-base-patch32\", text_key_second=\"target_text\", num_proc=1.\n"}, {"index": 103, "class_name": "mllm_mapper", "class_desc": "Mapper to use MLLMs for visual question answering tasks. This operator uses a Hugging\n Face model to generate answers based on input text and images. 
It supports models like\n `llava-hf/llava-v1.6-vicuna-7b-hf` and `Qwen/Qwen2-VL-7B-Instruct`. The operator\n processes each sample, loading and processing images, and generating responses using the\n specified model. The generated responses are appended to the sample's text field. The\n key parameters include the model ID, maximum new tokens, temperature, top-p sampling,\n and beam search size, which control the generation process.", "arguments": " hf_model (): huggingface model id.\n max_new_tokens (): the maximum number of new tokens\n generated by the model.\n temperature (): used to control the randomness of generated text. The higher the temperature, the more random and creative the generated text will be.\n top_p (): randomly select the next word from the group of words whose cumulative probability reaches p.\n num_beams (): the larger the beam search size, the higher the quality of the generated text.\n"}, {"index": 104, "class_name": "nlpaug_en_mapper", "class_desc": "Augments English text samples using various methods from the nlpaug library.\n\n This operator applies a series of text augmentation techniques to generate new samples.\n It supports both word-level and character-level augmentations, such as deleting,\n swapping, and inserting words or characters. The number of augmented samples can be\n controlled, and the original samples can be kept or removed. When multiple augmentation\n methods are enabled, they can be applied sequentially or independently. Sequential\n application means each sample is augmented by all enabled methods in sequence, while\n independent application generates multiple augmented samples for each method. We\n recommend using 1-3 augmentation methods at a time to avoid significant changes in\n sample semantics.", "arguments": " sequential (): whether combine all augmentation methods to a\n sequence. If it's True, a sample will be augmented by all opened\n augmentation methods sequentially. 
If it's False, each opened\n augmentation method would generate its augmented samples\n independently.\n aug_num (typing.Annotated[int, Gt(gt=0)]): number of augmented samples to be generated. If\n `sequential` is True, there will be total aug_num augmented samples\n generated. If it's False, there will be (aug_num *\n #opened_aug_method) augmented samples generated.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated texts in the final\n datasets and the original texts will be removed. It's True in\n default.\n delete_random_word (): whether to open the augmentation method of\n deleting random words from the original texts. e.g. \"I love LLM\"\n --> \"I LLM\"\n swap_random_word (): whether to open the augmentation method of\n swapping random contiguous words in the original texts. e.g. \"I\n love LLM\" --> \"Love I LLM\"\n spelling_error_word (): whether to open the augmentation method of\n simulating the spelling error for words in the original texts. e.g.\n \"I love LLM\" --> \"Ai love LLM\"\n split_random_word (): whether to open the augmentation method of\n splitting words randomly with whitespaces in the original texts.\n e.g. \"I love LLM\" --> \"I love LL M\"\n keyboard_error_char (): whether to open the augmentation method of\n simulating the keyboard error for characters in the original texts.\n e.g. \"I love LLM\" --> \"I ;ov4 LLM\"\n ocr_error_char (): whether to open the augmentation method of\n simulating the OCR error for characters in the original texts.\n e.g. \"I love LLM\" --> \"I 10ve LLM\"\n delete_random_char (): whether to open the augmentation method of\n deleting random characters from the original texts. e.g. \"I love\n LLM\" --> \"I oe LLM\"\n swap_random_char (): whether to open the augmentation method of\n swapping random contiguous characters in the original texts.\n e.g. 
\"I love LLM\" --> \"I ovle LLM\"\n insert_random_char (): whether to open the augmentation method of\n inserting random characters into the original texts. e.g. \"I love\n LLM\" --> \"I ^lKove LLM\"\n"}, {"index": 105, "class_name": "nlpcda_zh_mapper", "class_desc": "Augments Chinese text samples using the nlpcda library.\n\n This operator applies various augmentation methods to Chinese text, such as replacing\n similar words, homophones, deleting random characters, swapping characters, and\n replacing equivalent numbers. The number of augmented samples generated can be\n controlled by the `aug_num` parameter. If `sequential` is set to True, the augmentation\n methods are applied in sequence; otherwise, they are applied independently. The original\n sample can be kept or removed based on the `keep_original_sample` flag. It is\n recommended to use 1-3 augmentation methods at a time to avoid significant changes in\n the semantics of the samples. Some augmentation methods may not work for special texts,\n resulting in no augmented samples being generated.", "arguments": " sequential (): whether combine all augmentation methods to a\n sequence. If it's True, a sample will be augmented by all opened\n augmentation methods sequentially. If it's False, each opened\n augmentation method would generate its augmented samples\n independently.\n aug_num (typing.Annotated[int, Gt(gt=0)]): number of augmented samples to be generated. If\n `sequential` is True, there will be total aug_num augmented samples\n generated. If it's False, there will be (aug_num *\n #opened_aug_method) augmented samples generated.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated texts in the final\n datasets and the original texts will be removed. It's True in\n default.\n replace_similar_word (): whether to open the augmentation method of\n replacing random words with their similar words in the original\n texts. e.g. 
\"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\" --> \"\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\"\n replace_homophone_char (): whether to open the augmentation method\n of replacing random characters with their homophones in the\n original texts. e.g. \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\" --> \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5\"\n delete_random_char (): whether to open the augmentation method of\n deleting random characters from the original texts. e.g.\n \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\" --> \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\"\n swap_random_char (): whether to open the augmentation method of\n swapping random contiguous characters in the original texts. e.g.\n \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\" --> \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5\"\n replace_equivalent_num (): whether to open the augmentation method\n of replacing random numbers with their equivalent representations\n in the original texts. **Notice**: Only for numbers for now. 
e.g.\n \"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\" --> \"\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5\"\n"}, {"index": 106, "class_name": "optimize_prompt_mapper", "class_desc": "\n Mapper to optimize prompts based on the existing ones.\n This OP will use the existing prompts in the same batch and newly optimized prompts as the examples to optimize\n the next ones.\n\n Reference: https://doc.agentscope.io/v0/en/build_tutorial/prompt_optimization.html\n ", "arguments": " api_or_hf_model (): API or huggingface model name.\n gen_num (typing.Annotated[int, Gt(gt=0)]): The number of new prompts to generate.\n max_example_num (typing.Annotated[int, Gt(gt=0)])\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated texts in the final\n datasets and the original texts will be removed. It's True in\n default.\n retry_num (): how many times to retry to generate the prompt if the\n parsed generated prompt is empty. It's 3 in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for guiding the generation task.\n input_template (typing.Optional[str]): Template for building the input prompt. It must\n include one placeholder '{}', which will be replaced by\n `example_num` formatted examples defined by `example_template`.\n example_template (typing.Optional[str]): Template for formatting one prompt example. It\n must include one placeholder '{}', which will be replaced by one\n formatted prompt.\n prompt_template (typing.Optional[str]): Template for formatting a single prompt\n within each example. 
Must include two placeholders '{}' for the\n question and answer.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract questions\n and answers from model response.\n enable_vllm (): Whether to use vllm for inference acceleration.\n is_hf_model (): If true, use Transformers for loading hugging face or\n local llm.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 107, "class_name": "optimize_qa_mapper", "class_desc": "Mapper to optimize question-answer pairs.\n\n This operator refines and enhances the quality of question-answer pairs. It uses a\n Hugging Face model to generate more detailed and accurate questions and answers. The\n input is formatted using a template, and the output is parsed using a regular\n expression. The system prompt, input template, and output pattern can be customized. If\n VLLM is enabled, the operator accelerates inference on CUDA devices.", "arguments": " api_or_hf_model (): API or huggingface model name.\n is_hf_model (): If true, use huggingface model. Otherwise, use API.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for guiding the optimization task.\n input_template (typing.Optional[str]): Template for building the input for the model.\n Please make sure the template contains one placeholder '{}', which\n corresponds to the question and answer pair generated by\n param `qa_pair_template`.\n qa_pair_template (typing.Optional[str]): Template for formatting the question and\n answer pair. 
Please make sure the template contains two\n '{}' to format question and answer.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract question\n and answer from model response.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): Whether to use VLLM for inference acceleration.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation (e.g.,\n {'temperature': 0.9, 'top_p': 0.95}).\n"}, {"index": 108, "class_name": "optimize_query_mapper", "class_desc": "Optimize queries in question-answer pairs to make them more specific and detailed.\n\n This mapper refines the questions in a QA pair, making them more specific and detailed\n while ensuring that the original answer can still address the optimized question. It\n uses a predefined system prompt for the optimization process. The optimized query is\n extracted from the raw output by stripping any leading or trailing whitespace. The\n mapper utilizes a CUDA accelerator for faster processing.", "arguments": " api_or_hf_model (): API or huggingface model name.\n is_hf_model (): If true, use huggingface model. Otherwise, use API.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for guiding the optimization task.\n input_template (typing.Optional[str]): Template for building the input for the model.\n Please make sure the template contains one placeholder '{}', which\n corresponds to the question and answer pair generated by\n param `qa_pair_template`.\n qa_pair_template (typing.Optional[str]): Template for formatting the question and\n answer pair. 
Please make sure the template contains two\n '{}' to format question and answer.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract question\n and answer from model response.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): Whether to use VLLM for inference acceleration.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation (e.g.,\n {'temperature': 0.9, 'top_p': 0.95}).\n"}, {"index": 109, "class_name": "optimize_response_mapper", "class_desc": "Optimize response in question-answer pairs to be more detailed and specific.\n\n This operator enhances the responses in question-answer pairs, making them more detailed\n and specific while ensuring they still address the original question. It uses a\n predefined system prompt for optimization. The optimized response is stripped of any\n leading or trailing whitespace before being returned. This mapper leverages a Hugging\n Face model for the optimization process, which is accelerated using CUDA.", "arguments": " api_or_hf_model (): API or huggingface model name.\n is_hf_model (): If true, use huggingface model. Otherwise, use API.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for guiding the optimization task.\n input_template (typing.Optional[str]): Template for building the input for the model.\n Please make sure the template contains one placeholder '{}', which\n corresponds to the question and answer pair generated by\n param `qa_pair_template`.\n qa_pair_template (typing.Optional[str]): Template for formatting the question and\n answer pair. 
Please make sure the template contains two\n '{}' to format question and answer.\n output_pattern (typing.Optional[str]): Regular expression pattern to extract question\n and answer from model response.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n enable_vllm (): Whether to use VLLM for inference acceleration.\n model_params (typing.Optional[typing.Dict]): Parameters for initializing the model.\n sampling_params (typing.Optional[typing.Dict]): Sampling parameters for text generation (e.g.,\n {'temperature': 0.9, 'top_p': 0.95}).\n"}, {"index": 110, "class_name": "pair_preference_mapper", "class_desc": "Mapper to construct paired preference samples by generating a rejected response and its\n reason.\n\n This operator uses an API model to generate a new response that is opposite in style,\n factuality, or stance to the original response. The generated response and the reason\n for its generation are stored in the sample. The default system prompt and input\n template are provided, but can be customized. The output is parsed using a regular\n expression to extract the new response and the reason. If parsing fails, the operator\n retries up to a specified number of times. The generated response and reason are stored\n in the sample under the keys 'rejected_response' and 'reason', respectively.", "arguments": " api_model (): API model name.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt (typing.Optional[str]): System prompt for guiding the generation task.\n input_template (typing.Optional[str]): Template for building the model input. 
It must\n contain placeholders '{query}' and '{response}', and can optionally\n include '{reference}'.\n output_pattern (typing.Optional[str]): Regular expression for parsing model output.\n rejected_key (): The field name in the sample to store the\n generated rejected response. Defaults to 'rejected_response'.\n reason_key (): The field name in the sample to store the reason for\n generating the response. Defaults to 'reason'.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retries for the API call in case of\n response parsing failure. Defaults to 3.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 111, "class_name": "punctuation_normalization_mapper", "class_desc": "Normalizes unicode punctuations to their English equivalents in text samples.\n\n This operator processes a batch of text samples and replaces any unicode punctuation\n with its corresponding English punctuation. The mapping includes common substitutions\n like \"\uff0c\" to \",\", \"\u3002\" to \".\", and \"\u201c\" to \". It iterates over each character in the text,\n replacing it if it is found in the predefined punctuation map. The result is a set of\n text samples with consistent punctuation formatting.", "arguments": ""}, {"index": 112, "class_name": "python_file_mapper", "class_desc": "Executes a Python function defined in a file on input data.\n\n This operator loads a specified Python function from a given file and applies it to the\n input data. The function must take exactly one argument and return a dictionary. The\n operator can process data either sample by sample or in batches, depending on the\n `batched` parameter. If the file path is not provided, the operator acts as an identity\n function, returning the input sample unchanged. The function is loaded dynamically, and\n its name and file path are configurable. 
Important notes:\n - The file must be a valid Python file (`.py`).\n - The function must be callable and accept exactly one argument.\n - The function's return value must be a dictionary.", "arguments": " file_path (): The path to the Python file containing the function\n to be executed.\n function_name (): The name of the function defined in the file\n to be executed.\n batched (): A boolean indicating whether to process input data in\n batches.\n"}, {"index": 113, "class_name": "python_lambda_mapper", "class_desc": "Mapper for applying a Python lambda function to data samples.\n\n This operator allows users to define a custom transformation using a Python lambda\n function. The lambda function is applied to each sample, and the result must be a\n dictionary. If the `batched` parameter is set to True, the lambda function will process\n a batch of samples at once. If no lambda function is provided, the identity function is\n used, which returns the input sample unchanged. The operator validates the lambda\n function to ensure it has exactly one argument and compiles it safely.", "arguments": " lambda_str (): A string representation of the lambda function to be\n executed on data samples. If empty, the identity function is used.\n batched (): A boolean indicating whether to process input data in\n batches.\n"}, {"index": 114, "class_name": "query_intent_detection_mapper", "class_desc": "Predicts the user's intent label and corresponding score for a given query. The operator\n uses a Hugging Face model to classify the intent of the input query. If the query is in\n Chinese, it can optionally be translated to English using another Hugging Face\n translation model before classification. The predicted intent label and its confidence\n score are stored in the meta field with the keys 'query_intent_label' and\n 'query_intent_score', respectively. 
If these keys already exist in the meta field, the\n operator will skip processing for those samples.", "arguments": " hf_model (): Huggingface model ID to predict intent label.\n zh_to_en_hf_model (typing.Optional[str]): Translation model from Chinese to English.\n If not None, translate the query from Chinese to English.\n model_params (typing.Dict): model param for hf_model.\n zh_to_en_model_params (typing.Dict): model param for zh_to_hf_model.\n label_key (): The key name in the meta field to store the\n output label. It is 'query_intent_label' in default.\n score_key (): The key name in the meta field to store the\n corresponding label score. It is 'query_intent_label_score'\n in default.\n"}, {"index": 115, "class_name": "query_sentiment_detection_mapper", "class_desc": "Predicts user's sentiment label ('negative', 'neutral', 'positive') in a query.\n\n This mapper takes input from the specified query key and outputs the predicted sentiment\n label and its corresponding score. The results are stored in the Data-Juicer meta field\n under 'query_sentiment_label' and 'query_sentiment_label_score'. It uses a Hugging Face\n model for sentiment detection. If a Chinese-to-English translation model is provided, it\n first translates the query from Chinese to English before performing sentiment analysis.", "arguments": " hf_model (): Huggingface model ID to predict sentiment label.\n zh_to_en_hf_model (typing.Optional[str]): Translation model from Chinese to English.\n If not None, translate the query from Chinese to English.\n model_params (typing.Dict): model param for hf_model.\n zh_to_en_model_params (typing.Dict): model param for zh_to_hf_model.\n label_key (): The key name in the meta field to store the\n output label. It is 'query_sentiment_label' in default.\n score_key (): The key name in the meta field to store the\n corresponding label score. 
It is 'query_sentiment_label_score'\n in default.\n"}, {"index": 116, "class_name": "query_topic_detection_mapper", "class_desc": "Predicts the topic label and its corresponding score for a given query. The input is\n taken from the specified query key. The output, which includes the predicted topic label\n and its score, is stored in the 'query_topic_label' and 'query_topic_label_score' fields\n of the Data-Juicer meta field. This operator uses a Hugging Face model for topic\n classification. If a Chinese to English translation model is provided, it will first\n translate the query from Chinese to English before predicting the topic.\n\n - Uses a Hugging Face model for topic classification.\n - Optionally translates Chinese queries to English using another Hugging Face\n model.\n - Stores the predicted topic label in 'query_topic_label'.\n - Stores the corresponding score in 'query_topic_label_score'.", "arguments": " hf_model (): Huggingface model ID to predict topic label.\n zh_to_en_hf_model (typing.Optional[str]): Translation model from Chinese to English.\n If not None, translate the query from Chinese to English.\n model_params (typing.Dict): model param for hf_model.\n zh_to_en_model_params (typing.Dict): model param for zh_to_hf_model.\n label_key (): The key name in the meta field to store the\n output label. It is 'query_topic_label' in default.\n score_key (): The key name in the meta field to store the\n corresponding label score. It is 'query_topic_label_score'\n in default.\n"}, {"index": 117, "class_name": "relation_identity_mapper", "class_desc": "Identify the relation between two entities in a given text.\n\n This operator uses an API model to analyze the relationship between two specified\n entities in the text. It constructs a prompt with the provided system and input\n templates, then sends it to the API model for analysis. The output is parsed using a\n regular expression to extract the relationship. 
If the two entities are the same, the\n relationship is identified as \"another identity.\" The result is stored in the meta field\n under the key 'role_relation' by default. The operator retries the API call up to a\n specified number of times in case of errors. If `drop_text` is set to True, the original\n text is removed from the sample after processing.", "arguments": " api_model (): API model name.\n source_entity (): The source entity of the relation to be\n identified.\n target_entity (): The target entity of the relation to be\n identified.\n output_key (): The output key in the meta field in the\n samples. It is 'role_relation' in default.\n api_endpoint (typing.Optional[str]): URL endpoint for the API.\n response_path (typing.Optional[str]): Path to extract content from the API response.\n Defaults to 'choices.0.message.content'.\n system_prompt_template (typing.Optional[str]): System prompt template for the task.\n input_template (typing.Optional[str]): Template for building the model input.\n output_pattern_template (typing.Optional[str]): Regular expression template for\n parsing model output.\n try_num (typing.Annotated[int, Gt(gt=0)]): The number of retry attempts when there is an API\n call error or output parsing error.\n drop_text (): If drop the text in the output.\n model_params (typing.Dict): Parameters for initializing the API model.\n sampling_params (typing.Dict): Extra parameters passed to the API call.\n e.g {'temperature': 0.9, 'top_p': 0.95}\n"}, {"index": 118, "class_name": "remove_bibliography_mapper", "class_desc": "Removes bibliography sections at the end of LaTeX documents.\n\n This operator identifies and removes bibliography sections in LaTeX documents. It uses a\n regular expression to match common bibliography commands such as \\appendix,\n \\begin{references}, \\begin{thebibliography}, and \\bibliography. The matched sections are\n removed from the text. 
The operator processes samples in batch mode for efficiency.", "arguments": ""}, {"index": 119, "class_name": "remove_comments_mapper", "class_desc": "Removes comments from documents, currently supporting only 'tex' format.\n\n This operator removes inline and multiline comments from text samples. It supports both\n inline and multiline comment removal, controlled by the `inline` and `multiline`\n parameters. Currently, it is designed to work with 'tex' documents. The operator\n processes each sample in the batch and applies regular expressions to remove comments.\n The processed text is then updated in the original samples.\n\n - Inline comments are removed using the pattern `[^\\]%.+$`.\n - Multiline comments are removed using the pattern `^%.*\n?`.\n\n Important notes:\n - Only 'tex' document type is supported at present.\n - The operator processes the text in place and updates the original samples.", "arguments": " doc_type (typing.Union[str, typing.List[str]]): Type of document to remove comments.\n inline (): Whether to remove inline comments.\n multiline (): Whether to remove multiline comments.\n"}, {"index": 120, "class_name": "remove_header_mapper", "class_desc": "Removes headers at the beginning of documents in LaTeX samples.\n\n This operator identifies and removes headers such as chapter, part, section, subsection,\n subsubsection, paragraph, and subparagraph. It uses a regular expression to match these\n headers. If a sample does not contain any headers and `drop_no_head` is set to True, the\n sample text will be removed. Otherwise, the sample remains unchanged. 
The operator\n processes samples in batches for efficiency.", "arguments": " drop_no_head (): whether to drop sample texts without\n headers.\n"}, {"index": 121, "class_name": "remove_long_words_mapper", "class_desc": "Mapper to remove long words within a specific range.\n\n This operator filters out words in the text that are either shorter than the specified\n minimum length or longer than the specified maximum length. Words are first checked with\n their original length, and if they do not meet the criteria, they are stripped of\n special characters and re-evaluated. The key metric used is the character-based length\n of each word. The processed text retains only the words that fall within the defined\n length range. This operator processes text in batches for efficiency.", "arguments": " min_len (): The min mapper word length in this op, words\n will be filtered if their length is below this parameter.\n max_len (): The max mapper word length in this op, words\n will be filtered if their length exceeds this parameter.\n"}, {"index": 122, "class_name": "remove_non_chinese_character_mapper", "class_desc": "Removes non-Chinese characters from text samples.\n\n This mapper removes all characters that are not part of the Chinese character set.\n - It can optionally keep alphabets, numbers, and punctuation based on the configuration.\n - The removal is done using a regular expression pattern.\n - The pattern is constructed to exclude or include alphabets, numbers, and punctuation\n as specified.\n - The key metric for this operation is the presence of non-Chinese characters, which are\n removed.\n - The operator processes samples in a batched manner.", "arguments": " keep_alphabet (): whether to keep alphabet\n keep_number (): whether to keep number\n keep_punc (): whether to keep punctuation\n"}, {"index": 123, "class_name": "remove_repeat_sentences_mapper", "class_desc": "Mapper to remove repeat sentences in text samples.\n\n This operator processes text samples to 
remove duplicate sentences. It splits the text\n into lines and then further splits each line into sentences. Sentences are considered\n duplicates if they are identical after optional case normalization and special character\n removal. The operator uses a hash set to track unique sentences. Sentences shorter than\n `min_repeat_sentence_length` are not deduplicated. If `ignore_special_character` is\n enabled, special characters (all except Chinese, letters, and numbers) are ignored when\n checking for duplicates. The resulting text is reassembled with unique sentences.", "arguments": " lowercase (): Whether to convert sample text to lower case\n ignore_special_character (): Whether to ignore special\n characters when judging repeated sentences. Special characters\n are all characters except Chinese characters, letters and\n numbers.\n min_repeat_sentence_length (): Sentences shorter than this\n length will not be deduplicated. If ignore_special_character is\n set to True, then special characters are not included in this\n length.\n"}, {"index": 124, "class_name": "remove_specific_chars_mapper", "class_desc": "Removes specific characters from text samples.\n\n This operator removes specified characters from the text. The characters to be removed\n can be provided as a string or a list of strings. If no characters are specified, the\n default set includes special and non-alphanumeric characters. The operator processes the\n text using a regular expression pattern that matches any of the specified characters and\n replaces them with an empty string. This is done in a batched manner for efficiency.", "arguments": " chars_to_remove (typing.Union[str, typing.List[str]]): a list or a string including all\n characters that need to be removed from text.\n"}, {"index": 125, "class_name": "remove_table_text_mapper", "class_desc": "Mapper to remove table texts from text samples.\n\n This operator uses regular expressions to identify and remove tables from the text. 
It\n targets tables with a specified range of columns, defined by the minimum and maximum\n number of columns. The operator iterates over each sample, applying the regex pattern to\n remove tables that match the column criteria. The processed text, with tables removed,\n is then stored back in the sample. This operation is batched for efficiency.", "arguments": " min_col (typing.Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])]): The min number of columns of table to remove.\n max_col (typing.Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])]): The max number of columns of table to remove.\n"}, {"index": 126, "class_name": "remove_words_with_incorrect_substrings_mapper", "class_desc": "Mapper to remove words containing specified incorrect substrings.\n\n This operator processes text by removing words that contain any of the specified\n incorrect substrings. By default, it removes words with substrings like \"http\", \"www\",\n \".com\", \"href\", and \"//\". The operator can operate in tokenized or non-tokenized mode.\n In tokenized mode, it uses a Hugging Face tokenizer to tokenize the text before\n processing. 
The key metric is not computed; this operator focuses on filtering out\n specific words.\n\n - If `tokenization` is True, the text is tokenized using a Hugging Face\n tokenizer, and words are filtered based on the specified substrings.\n - If `tokenization` is False, the text is split into sentences and words,\n and words are filtered based on the specified substrings.\n - The filtered text is then merged back into a single string.\n\n The operator processes samples in batches and updates the text in place.", "arguments": " lang (): sample in which language\n tokenization (): whether to use model to tokenize documents\n substrings (typing.Optional[typing.List[str]]): The incorrect substrings in words.\n"}, {"index": 127, "class_name": "replace_content_mapper", "class_desc": "Replaces content in the text that matches a specific regular expression pattern with a\n designated replacement string.\n\n This operator processes text by searching for patterns defined in `pattern` and\n replacing them with the corresponding `repl` string. If multiple patterns and\n replacements are provided, each pattern is replaced by its respective replacement. The\n operator supports both single and multiple patterns and replacements. The regular\n expressions are compiled with the `re.DOTALL` flag to match across multiple lines. If\n the length of the patterns and replacements do not match, a `ValueError` is raised. This\n operation is batched, meaning it processes multiple samples at once.", "arguments": " pattern (typing.Union[str, typing.List[str], NoneType]): regular expression pattern(s) to search for within text\n repl (typing.Union[str, typing.List[str]]): replacement string(s), default is empty string\n"}, {"index": 128, "class_name": "sdxl_prompt2prompt_mapper", "class_desc": "Generates pairs of similar images using the SDXL model.\n\n This operator uses a Hugging Face diffusion model to generate image pairs based on two\n text prompts. 
The quality and similarity of the generated images are controlled by\n parameters such as `num_inference_steps` and `guidance_scale`. The first and second text\n prompts are specified using `text_key` and `text_key_second`, respectively. The\n generated images are saved in the specified `output_dir` with unique filenames. The\n operator requires both text keys to be set for processing.", "arguments": " hf_diffusion (): diffusion model name on huggingface to generate\n the image.\n trust_remote_code ()\n torch_dtype (): the floating point type used to load the diffusion\n model.\n num_inference_steps (): The larger the value, the better the\n image generation quality; however, this also increases the time\n required for generation.\n guidance_scale (): A higher guidance scale value encourages the\n model to generate images closely linked to the text prompt at the\n expense of lower image quality. Guidance scale is enabled when\n text_key (): the key name used to store the first caption\n in the caption pair.\n text_key_second (): the key name used to store the second caption\n in the caption pair.\n output_dir (): the storage location of the generated images.\n"}, {"index": 129, "class_name": "sentence_augmentation_mapper", "class_desc": "Augments sentences by generating enhanced versions using a Hugging Face model. This\n operator enhances input sentences by generating new, augmented versions. It is designed\n to work best with individual sentences rather than full documents. For optimal results,\n ensure the input text is at the sentence level. The augmentation process uses a Hugging\n Face model, such as `lmsys/vicuna-13b-v1.5` or `Qwen/Qwen2-7B-Instruct`. The operator\n requires specifying both the primary and secondary text keys, where the augmented\n sentence will be stored in the secondary key. 
The generation process can be customized\n with parameters like temperature, top-p sampling, and beam search size.", "arguments": " hf_model (): Huggingface model id.\n system_prompt (): System prompt.\n task_sentence (): The instruction for the current task.\n max_new_tokens (): the maximum number of new tokens\n generated by the model.\n temperature (): used to control the randomness of\n generated text. The higher the temperature, the more\n random and creative the generated text will be.\n top_p (): randomly select the next word from the group\n of words whose cumulative probability reaches p.\n num_beams (): the larger the beam search size, the higher\n the quality of the generated text.\n text_key (): the key name used to store the first sentence\n in the text pair. (optional, defalut='text')\n text_key_second (): the key name used to store the second sentence\n in the text pair.\n"}, {"index": 130, "class_name": "sentence_split_mapper", "class_desc": "Splits text samples into individual sentences based on the specified language.\n\n This operator uses an NLTK-based tokenizer to split the input text into sentences. The\n language for the tokenizer is specified during initialization. The original text in each\n sample is replaced with a list of sentences. This operator processes samples in batches\n for efficiency. Ensure that the `lang` parameter is set to the appropriate language code\n (e.g., \"en\" for English) to achieve accurate sentence splitting.", "arguments": " lang (): split sentence of text in which language.\n"}, {"index": 131, "class_name": "text_chunk_mapper", "class_desc": "Split input text into chunks based on specified criteria.\n\n - Splits the input text into multiple chunks using a specified maximum length and a\n split pattern.\n - If `max_len` is provided, the text is split into chunks with a maximum length of\n `max_len`.\n - If `split_pattern` is provided, the text is split at occurrences of the pattern. 
If\n the length exceeds `max_len`, it will force a cut.\n - The `overlap_len` parameter specifies the overlap length between consecutive chunks if\n the split does not occur at the pattern.\n - Uses a Hugging Face tokenizer to calculate the text length in tokens if a tokenizer\n name is provided; otherwise, it uses the string length.\n - Caches the following stats: 'chunk_count' (number of chunks generated for each\n sample).\n - Raises a `ValueError` if both `max_len` and `split_pattern` are `None` or if\n `overlap_len` is greater than or equal to `max_len`.", "arguments": " max_len (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): Split text into multi texts with this max len if not\n None.\n split_pattern (typing.Optional[str]): Make sure split in this pattern if it is not None\n and force cut if the length exceeds max_len.\n overlap_len (typing.Annotated[int, Ge(ge=0)]): Overlap length of the split texts if not split in\n the split pattern.\n tokenizer (typing.Optional[str]): The tokenizer name of Hugging Face tokenizers.\n The text length will be calculate as the token num if it is\n offered. Otherwise, the text length equals to string length.\n Support tiktoken tokenizer (such as gpt-4o), dashscope tokenizer (\n such as qwen2.5-72b-instruct) and huggingface tokenizer.\n :trust_remote_code: for loading huggingface model\n trust_remote_code ()\n"}, {"index": 132, "class_name": "video_captioning_from_audio_mapper", "class_desc": "Mapper to caption a video according to its audio streams based on\n Qwen-Audio model.\n ", "arguments": " keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only captioned sample in the\n final datasets and the original sample will be removed. It's True\n in default.\n"}, {"index": 133, "class_name": "video_captioning_from_frames_mapper", "class_desc": "Generates video captions from sampled frames using an image-to-text model. 
Captions from\n different frames are concatenated into a single string.\n\n - Uses a Hugging Face image-to-text model to generate captions for sampled video frames.\n - Supports different frame sampling methods: 'all_keyframes' or 'uniform'.\n - Can apply horizontal and vertical flips to the frames before captioning.\n - Offers multiple strategies for retaining generated captions: 'random_any',\n 'similar_one_simhash', or 'all'.\n - Optionally keeps the original sample in the final dataset.\n - Allows setting a global prompt or per-sample prompts to guide caption generation.\n - Generates a specified number of candidate captions per video, which can be reduced\n based on the selected retention strategy.\n - The number of output samples depends on the retention strategy and whether original\n samples are kept.", "arguments": " hf_img2seq (): model name on huggingface to generate caption\n trust_remote_code ()\n caption_num (typing.Annotated[int, Gt(gt=0)]): how many candidate captions to generate\n for each video\n keep_candidate_mode (): retain strategy for the generated\n $caption_num$ candidates.\n\n 'random_any': Retain the random one from generated captions\n\n 'similar_one_simhash': Retain the generated one that is most\n similar to the original caption\n\n 'all': Retain all generated captions by concatenation\n\n Note:\n This is a batched_OP, whose input and output type are\n both list. Suppose there are $N$ list of input samples, whose batch\n size is $b$, and denote caption_num as $M$.\n The number of total samples after generation is $2Nb$ when\n keep_original_sample is True and $Nb$ when keep_original_sample is\n False. For 'random_any' and 'similar_one_simhash' mode,\n it's $(1+M)Nb$ for 'all' mode when keep_original_sample is True\n and $MNb$ when keep_original_sample is False.\n keep_original_sample (): whether to keep the original sample. 
If\n it's set to False, there will be only generated captions in the\n final datasets and the original captions will be removed. It's True\n in default.\n prompt (typing.Optional[str]): a string prompt to guide the generation of image-to-text\n model for all samples globally. It's None in default, which means\n no prompt provided.\n prompt_key (typing.Optional[str]): the key name of fields in samples to store prompts\n for each sample. It's used for set different prompts for different\n samples. If it's none, use prompt in parameter \"prompt\". It's None\n in default.\n frame_sampling_method (): sampling method of extracting frame\n videos from the videos. Should be one of\n [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number\n of which depends on the duration of the video) and the latter\n one extract specified number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. 
If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n horizontal_flip (): flip frame video horizontally (left to right).\n vertical_flip (): flip frame video vertically (top to bottom).\n"}, {"index": 134, "class_name": "video_captioning_from_summarizer_mapper", "class_desc": "\n Mapper to generate video captions by summarizing several kinds of generated\n texts (captions from video/audio/frames, tags from audio/frames, ...)\n ", "arguments": " hf_summarizer (): the summarizer model used to summarize texts\n generated by other methods.\n trust_remote_code ()\n consider_video_caption_from_video (): whether to consider the video\n caption generated from video directly in the summarization process.\n Default: True.\n consider_video_caption_from_audio (): whether to consider the video\n caption generated from audio streams in the video in the\n summarization process. Default: True.\n consider_video_caption_from_frames (): whether to consider the\n video caption generated from sampled frames from the video in the\n summarization process. Default: True.\n consider_video_tags_from_audio (): whether to consider the video\n tags generated from audio streams in the video in the summarization\n process. Default: True.\n consider_video_tags_from_frames (): whether to consider the video\n tags generated from sampled frames from the video in the\n summarization process. Default: True.\n vid_cap_from_vid_args (typing.Optional[typing.Dict]): the arg dict for video captioning from\n video directly with keys are the arg names and values are the arg\n values. Default: None.\n vid_cap_from_frm_args (typing.Optional[typing.Dict]): the arg dict for video captioning from\n sampled frames from the video with keys are the arg names and\n values are the arg values. 
Default: None.\n vid_tag_from_aud_args (typing.Optional[typing.Dict]): the arg dict for video tagging from audio\n streams in the video with keys are the arg names and values are the\n arg values. Default: None.\n vid_tag_from_frm_args (typing.Optional[typing.Dict]): the arg dict for video tagging from\n sampled frames from the video with keys are the arg names and\n values are the arg values. Default: None.\n keep_tag_num (typing.Annotated[int, Gt(gt=0)]): max number N of tags from sampled frames to keep.\n Too many tags might bring negative influence to summarized text, so\n we consider to only keep the N most frequent tags. Default: 5.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only summarized captions in the\n final datasets and the original captions will be removed. It's True\n in default.\n"}, {"index": 135, "class_name": "video_captioning_from_video_mapper", "class_desc": "Generates video captions using a Hugging Face video-to-text model and sampled video\n frames.\n\n This operator processes video samples to generate captions based on the provided video\n frames.\n It uses a Hugging Face video-to-text model, such as 'kpyu/video-blip-opt-2.7b-ego4d',\n to generate multiple caption candidates for each video. The number of generated\n captions and the strategy to keep or filter these candidates can be configured. The\n operator supports different frame sampling methods, including extracting all\n keyframes or uniformly sampling a specified number of frames. Additionally, it allows\n for horizontal and vertical flipping of the frames. 
The final output can include both\n the original sample and the generated captions, depending on the configuration.", "arguments": " hf_video_blip (): video-blip model name on huggingface\n to generate caption\n trust_remote_code ()\n caption_num (typing.Annotated[int, Gt(gt=0)]): how many candidate captions to generate\n for each video\n keep_candidate_mode (): retain strategy for the generated\n $caption_num$ candidates.\n\n 'random_any': Retain the random one from generated captions\n\n 'similar_one_simhash': Retain the generated one that is most\n similar to the original caption\n\n 'all': Retain all generated captions by concatenation\n\n Note:\n This is a batched_OP, whose input and output type are\n both list. Suppose there are $N$ list of input samples, whose batch\n size is $b$, and denote caption_num as $M$.\n The number of total samples after generation is $2Nb$ when\n keep_original_sample is True and $Nb$ when keep_original_sample is\n False. For 'random_any' and 'similar_one_simhash' mode,\n it's $(1+M)Nb$ for 'all' mode when keep_original_sample is True\n and $MNb$ when keep_original_sample is False.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only generated captions in the\n final datasets and the original captions will be removed. It's True\n in default.\n prompt (typing.Optional[str]): a string prompt to guide the generation of video-blip\n model for all samples globally. It's None in default, which means\n no prompt provided.\n prompt_key (typing.Optional[str]): the key name of fields in samples to store prompts\n for each sample. It's used for set different prompts for different\n samples. If it's none, use prompt in parameter \"prompt\". It's None\n in default.\n frame_sampling_method (): sampling method of extracting frame\n videos from the videos. 
Should be one of\n [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number\n of which depends on the duration of the video) and the latter\n one extract specified number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n horizontal_flip (): flip frame video horizontally (left to right).\n vertical_flip (): flip frame video vertically (top to bottom).\n"}, {"index": 136, "class_name": "video_extract_frames_mapper", "class_desc": "Mapper to extract frames from video files according to specified methods.\n\n Extracts frames from video files using either all keyframes or a uniform sampling\n method. The extracted frames are saved in a directory, and the mapping from video keys\n to frame directories is stored in the sample's metadata. 
The data format for the\n extracted frames is a dictionary mapping video keys to their respective frame\n directories:\n - \"video_key_1\": \"/${frame_dir}/video_key_1_filename/\"\n - \"video_key_2\": \"/${frame_dir}/video_key_2_filename/\"\n\n - **Frame Sampling Methods**:\n - \"all_keyframes\": Extracts all keyframes from the video.\n - \"uniform\": Extracts a specified number of frames uniformly from the video.\n - If `duration` is set, the video is segmented into multiple segments based on the\n duration, and frames are extracted from each segment.\n - The output directory for the frames can be specified; otherwise, a default directory\n is used.\n - The field name in the sample's metadata where the frame information is stored can be\n customized.", "arguments": " frame_sampling_method (): sampling method of extracting frame\n videos from the videos. Should be one of\n [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number\n of which depends on the duration of the video) and the latter\n one extract specified number of frames uniformly from the video.\n If \"duration\" > 0, frame_sampling_method acts on every segment.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. 
If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n If \"duration\" > 0, frame_num is the number of frames per segment.\n duration (): The duration of each segment in seconds.\n If 0, frames are extracted from the entire video.\n If duration > 0, the video is segmented into multiple segments\n based on duration, and frames are extracted from each segment.\n frame_dir (): Output directory to save extracted frames.\n If None, a default directory based on the video file path is used.\n frame_key (): The name of field to save generated frames info.\n"}, {"index": 137, "class_name": "video_face_blur_mapper", "class_desc": "Mapper to blur faces detected in videos.\n\n This operator uses an OpenCV classifier for face detection and applies a specified blur\n type to the detected faces. The default classifier is 'haarcascade_frontalface_alt.xml'.\n Supported blur types include 'mean', 'box', and 'gaussian'. The radius of the blur\n kernel can be adjusted. If a save directory is not provided, the processed videos will\n be saved in the same directory as the input files. 
The `DJ_PRODUCED_DATA_DIR`\n environment variable can also be used to specify the save directory.", "arguments": " cv_classifier (): OpenCV classifier path for face detection.\n By default, we will use 'haarcascade_frontalface_alt.xml'.\n blur_type (): Type of blur kernel, including\n ['mean', 'box', 'gaussian'].\n radius (): Radius of blur kernel.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 138, "class_name": "video_ffmpeg_wrapped_mapper", "class_desc": "Wraps FFmpeg video filters for processing video files in a dataset.\n\n This operator applies a specified FFmpeg video filter to each video file in the dataset.\n It supports passing keyword arguments to the filter and global arguments to the FFmpeg\n command line. The processed videos are saved in a specified directory or the same\n directory as the input files. If no filter name is provided, the videos remain\n unmodified. 
The operator updates the source file paths in the dataset to reflect any\n changes.", "arguments": " filter_name (typing.Optional[str]): ffmpeg video filter name.\n filter_kwargs (typing.Optional[typing.Dict]): keyword-arguments passed to ffmpeg filter.\n global_args (typing.Optional[typing.List[str]]): list-arguments passed to ffmpeg command-line.\n capture_stderr (): whether to capture stderr.\n overwrite_output (): whether to overwrite output file.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 139, "class_name": "video_remove_watermark_mapper", "class_desc": "Remove watermarks from videos based on specified regions.\n\n This operator removes watermarks from video frames by detecting and masking the\n watermark areas. It supports two detection methods: 'pixel_value' and 'pixel_diversity'.\n The regions of interest (ROIs) for watermark detection can be specified as either pixel\n coordinates or ratios of the frame dimensions. The operator extracts a set number of\n frames uniformly from the video to detect watermark pixels. A pixel is considered part\n of a watermark if it meets the detection criteria in a minimum number of frames. The\n cleaned video is saved in the specified directory or the same directory as the input\n file if no save directory is provided.", "arguments": " roi_strings (typing.List[str]): a given list of regions the watermarks locate.\n The format of each can be \"x1, y1, x2, y2\", \"(x1, y1, x2, y2)\",\n or \"[x1, y1, x2, y2]\".\n roi_type (): the roi string type. When the type is 'pixel', (x1,\n y1), (x2, y2) are the locations of pixels in the top left corner\n and the bottom right corner respectively. 
If the roi_type is\n 'ratio', the coordinates are normalized by widths and heights.\n roi_key (typing.Optional[str]): the key name of fields in samples to store roi_strings\n for each sample. It's used for set different rois for different\n samples. If it's none, use rois in parameter \"roi_strings\".\n It's None in default.\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video to detect the pixels of watermark.\n min_frame_threshold (typing.Annotated[int, Gt(gt=0)]): a coordination is considered as the\n location of a watermark pixel when it is that in no less\n min_frame_threshold frames.\n detection_method (): the method to detect the pixels of watermark.\n If it is 'pixel_value', we consider the distribution of pixel\n value in each frame. If it is 'pixel_diversity', we will consider\n the pixel diversity in different frames. The min_frame_threshold\n is useless and frame_num must be greater than 1 in\n 'pixel_diversity' mode.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 140, "class_name": "video_resize_aspect_ratio_mapper", "class_desc": "Resizes videos to fit within a specified aspect ratio range. This operator adjusts the\n dimensions of videos to ensure their aspect ratios fall within a defined range. It can\n either increase or decrease the video dimensions based on the specified strategy. The\n aspect ratio is calculated as width divided by height. If a video's aspect ratio is\n outside the given range, it will be resized to match the closest boundary (either the\n minimum or maximum ratio). The `min_ratio` and `max_ratio` should be provided as strings\n in the format \"9:21\" or \"9/21\". 
The resizing process uses the `ffmpeg` library to handle\n the actual video scaling. Videos that do not need resizing are left unchanged. The\n operator supports saving the modified videos to a specified directory or the same\n directory as the input files.", "arguments": " min_ratio (): The minimum aspect ratio to enforce videos with\n an aspect ratio below `min_ratio` will be resized to match\n this minimum ratio. The ratio should be provided as a string\n in the format \"9:21\" or \"9/21\".\n max_ratio (): The maximum aspect ratio to enforce videos with\n an aspect ratio above `max_ratio` will be resized to match\n this maximum ratio. The ratio should be provided as a string\n in the format \"21:9\" or \"21/9\".\n strategy (): The resizing strategy to apply when adjusting the\n video dimensions. It can be either 'decrease' to reduce the\n dimension or 'increase' to enlarge it. Accepted values are\n ['decrease', 'increase'].\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 141, "class_name": "video_resize_resolution_mapper", "class_desc": "Resizes video resolution based on specified width and height constraints.\n\n This operator resizes videos to fit within the provided minimum and maximum width and\n height limits. It can optionally maintain the original aspect ratio by adjusting the\n dimensions accordingly. The resized videos are saved in the specified directory or the\n same directory as the input if no save directory is provided. The key metric for\n resizing is the video's width and height, which are adjusted to meet the constraints\n while maintaining the aspect ratio if configured. 
The `force_divisible_by` parameter\n ensures that the output dimensions are divisible by a specified integer, which must be a\n positive even number when used with aspect ratio adjustments.", "arguments": " min_width (): Videos with width less than 'min_width' will be\n mapped to videos with equal or bigger width.\n max_width (): Videos with width more than 'max_width' will be\n mapped to videos with equal of smaller width.\n min_height (): Videos with height less than 'min_height' will be\n mapped to videos with equal or bigger height.\n max_height (): Videos with height more than 'max_height' will be\n mapped to videos with equal or smaller height.\n force_original_aspect_ratio (): Enable decreasing or increasing output video width or height if necessary to keep the original aspect ratio, including ['disable', 'decrease', 'increase'].\n force_divisible_by (typing.Annotated[int, Gt(gt=0)]): Ensures that both the output dimensions, width and height, are divisible by the given integer when used together with force_original_aspect_ratio, must be a positive even number.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 142, "class_name": "video_split_by_duration_mapper", "class_desc": "Splits videos into segments based on a specified duration.\n\n This operator splits each video in the dataset into smaller segments, each with a fixed\n duration. The last segment is discarded if its duration is less than the specified\n minimum last split duration. The original sample can be kept or removed based on the\n `keep_original_sample` parameter. The generated video files are saved in the specified\n directory or, if not provided, in the same directory as the input files. 
The key metric\n for this operation is the duration of each segment, which is character-based (seconds).\n\n - Splits videos into segments of a specified duration.\n - Discards the last segment if it is shorter than the minimum allowed duration.\n - Keeps or removes the original sample based on the `keep_original_sample` parameter.\n - Saves the generated video files in the specified directory or the input file's\n directory.\n - Uses the duration in seconds to determine the segment boundaries.", "arguments": " split_duration (): duration of each video split in seconds.\n min_last_split_duration (): The minimum allowable duration in\n seconds for the last video split. If the duration of the last\n split is less than this value, it will be discarded.\n keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only cut sample in the\n final datasets and the original sample will be removed. It's True\n in default.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 143, "class_name": "video_split_by_key_frame_mapper", "class_desc": "Splits a video into segments based on key frames.\n\n This operator processes video data by splitting it into multiple segments at key frame\n boundaries. It uses the key frames to determine where to make the splits. The original\n sample can be kept or discarded based on the `keep_original_sample` parameter. If\n `save_dir` is specified, the split video files will be saved in that directory;\n otherwise, they will be saved in the same directory as the input files. The operator\n processes each video in the sample and updates the sample with the new video keys and\n text placeholders. The `Fields.source_file` field is updated to reflect the new video\n segments. 
This operator works in batch mode, processing multiple samples at once.", "arguments": " keep_original_sample (): whether to keep the original sample. If\n it's set to False, there will be only split sample in the\n final datasets and the original sample will be removed. It's True\n in default.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 144, "class_name": "video_split_by_scene_mapper", "class_desc": "Splits videos into scene clips based on detected scene changes.\n\n This operator uses a specified scene detector to identify and split video scenes. It\n supports three types of detectors: ContentDetector, ThresholdDetector, and\n AdaptiveDetector. The operator processes each video in the sample, detects scenes, and\n splits the video into individual clips. The minimum length of a scene can be set, and\n progress can be shown during processing. The resulting clips are saved in the specified\n directory or the same directory as the input files if no save directory is provided. The\n operator also updates the text field in the sample to reflect the new video clips. If a\n video does not contain any scenes, it remains unchanged.", "arguments": " detector (): Algorithm from `scenedetect.detectors`. 
Should be one\n of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector`].\n threshold (typing.Annotated[float, Ge(ge=0)]): Threshold passed to the detector.\n min_scene_len (typing.Annotated[int, Ge(ge=0)]): Minimum length of any scene.\n show_progress (): Whether to show progress from scenedetect.\n save_dir (): The directory where generated video files will be stored.\n If not specified, outputs will be saved in the same directory as their corresponding input files.\n This path can alternatively be defined by setting the `DJ_PRODUCED_DATA_DIR` environment variable.\n"}, {"index": 145, "class_name": "video_tagging_from_audio_mapper", "class_desc": "Generates video tags from audio streams using the Audio Spectrogram Transformer.\n\n This operator extracts audio streams from videos and uses a Hugging Face Audio\n Spectrogram Transformer (AST) model to generate tags. The tags are stored in the\n specified metadata field, defaulting to 'video_audio_tags'. If no valid audio stream is\n found, the tag is set to 'EMPTY'. The operator resamples audio to match the model's\n required sampling rate if necessary. The tags are inferred based on the highest logit\n value from the model's output. If the tags are already present in the sample, the\n operator skips processing for that sample.", "arguments": " hf_ast (): path to the HF model to tag from audios.\n trust_remote_code (): whether to trust the remote code of HF models\n tag_field_name (): the field name to store the tags. It's\n \"video_audio_tags\" in default.\n"}, {"index": 146, "class_name": "video_tagging_from_frames_mapper", "class_desc": "Generates video tags from frames extracted from videos.\n\n This operator extracts frames from videos and generates tags based on the\n content of these frames. The frame extraction method can be either\n \"all_keyframes\" or \"uniform\". 
For \"all_keyframes\", all keyframes are\n extracted, while for \"uniform\", a specified number of frames are\n extracted uniformly across the video. The tags are generated using a\n pre-trained model and stored in the specified field name. If the tags\n are already present in the sample, the operator skips processing.\n Important notes:\n - Uses a Hugging Face tokenizer and a pre-trained model for tag generation.\n - If no video is present in the sample, an empty tag array is stored.\n - Frame tensors are processed to generate tags, which are then sorted by\n frequency and stored.", "arguments": " frame_sampling_method (): sampling method of extracting frame\n images from the videos. Should be one of\n [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number of which depends\n on the duration of the video) and the latter one extract specified\n number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n tag_field_name (): the field name to store the tags. It's\n \"video_frame_tags\" in default.\n"}, {"index": 147, "class_name": "whitespace_normalization_mapper", "class_desc": "Normalizes various types of whitespace characters to standard spaces in text samples.\n\n This mapper converts all non-standard whitespace characters, such as tabs and newlines,\n to the standard space character (' ', 0x20). It also trims leading and trailing\n whitespace from the text. This ensures consistent spacing across all text samples,\n improving readability and consistency. 
The normalization process is based on a\n comprehensive list of whitespace characters, which can be found at\n https://en.wikipedia.org/wiki/Whitespace_character.", "arguments": ""}, {"index": 148, "class_name": "video_tagging_from_frames_filter", "class_desc": "Filter to keep samples whose videos contain specified tags.\n\n This operator filters video samples based on the presence of given tags in the video\n frames. It uses a Hugging Face tokenizer to extract and tag frames. The filtering can be\n configured to require any or all of the specified tags to be present. The operator\n supports two frame sampling methods: \"all_keyframes\" and \"uniform\". When using\n \"uniform\", the number of frames to sample can be specified. The extracted tags are\n stored in the meta field with the key 'video_frame_tags' by default. The decision to\n keep a sample is based on whether any or all of the video frames meet the tag criteria,\n as specified by the 'any_or_all' parameter.", "arguments": " tags (typing.List[str]): a tag list to shift the videos, total tags can be found\n in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt # noqa: E501\n contain (): require the videos containing 'any' or 'all' tags.\n When tags equal to [], 'all' keeps all samples, 'any' keeps no\n sample.\n frame_sampling_method (): sampling method of extracting frame\n images from the videos. Should be one of\n [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number of which depends\n on the duration of the video) and the latter one extract specified\n number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. 
If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n tag_field_name (): the key name to store the tags in the meta\n field. It's \"video_frame_tags\" in default.\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 149, "class_name": "video_watermark_filter", "class_desc": "Filter to keep samples whose videos have no watermark with high probability.\n\n This operator uses a Hugging Face watermark detection model to predict the probability\n of watermarks in video frames. It keeps samples where the predicted watermark\n probability is below a specified threshold. The key metric, 'video_watermark_prob', is\n computed by extracting frames from the video using a specified sampling method and then\n averaging, maximizing, or minimizing the probabilities based on the reduce mode. If\n multiple videos are present, the operator can use either an 'any' or 'all' strategy to\n determine if the sample should be kept. The frame sampling method can be 'all_keyframes'\n or 'uniform', and the reduce mode can be 'avg', 'max', or 'min'.", "arguments": " hf_watermark_model (): watermark detection model name on\n huggingface.\n trust_remote_code ()\n prob_threshold (): the predicted watermark probability threshold\n for samples. range from 0 to 1. 
Samples with watermark probability\n less than this threshold will be kept.\n frame_sampling_method (): sampling method of extracting frame\n images from the videos.\n Should be one of [\"all_keyframes\", \"uniform\"].\n The former one extracts all key frames (the number of which depends\n on the duration of the video) and the latter one extract specified\n number of frames uniformly from the video.\n Default: \"all_keyframes\".\n frame_num (typing.Annotated[int, Gt(gt=0)]): the number of frames to be extracted uniformly from\n the video. Only works when frame_sampling_method is \"uniform\". If\n it's 1, only the middle frame will be extracted. If it's 2, only\n the first and the last frames will be extracted. If it's larger\n than 2, in addition to the first and the last frames, other frames\n will be extracted uniformly within the video duration.\n reduce_mode (): reduce mode for multiple sampled video frames.\n 'avg': Take the average of multiple values\n 'max': Take the max of multiple values\n 'min': Take the min of multiple values\n any_or_all (): keep this sample with 'any' or 'all' strategy of\n all videos. 'any': keep this sample if any videos meet the\n condition. 'all': keep this sample only if all videos meet the\n condition.\n"}, {"index": 150, "class_name": "word_repetition_filter", "class_desc": "Filter to keep samples with word-level n-gram repetition ratio within a specific range.\n\n This operator calculates the word-level n-gram repetition ratio for each sample and\n filters out those that do not fall within the specified range. The n-gram length and the\n min/max ratio thresholds are configurable. If tokenization is enabled, a Hugging Face\n tokenizer is used to tokenize the text. The key metric, `word_rep_ratio`, is computed as\n the ratio of repeated n-grams to the total number of n-grams. This ratio is then\n compared against the min and max ratio thresholds to decide whether to keep or filter\n the sample. 
If the ratio is outside the specified range, the sample is filtered out.", "arguments": " lang (): sample in which language.\n tokenization (): whether to use model to tokenize documents\n rep_len (typing.Annotated[int, Gt(gt=0)]): Repetition length for word-level n-gram.\n min_ratio (): The min filter ratio in this op, samples will\n be filtered if their word-level n-gram repetition ratio is\n below this parameter.\n max_ratio (): The max filter ratio in this op, samples will\n be filtered if their word-level n-gram repetition ratio\n exceeds this parameter.\n"}, {"index": 151, "class_name": "words_num_filter", "class_desc": "Filter to keep samples with a total word count within a specified range.\n\n This operator filters samples based on the number of words they contain. It retains\n samples if their word count is within the given minimum and maximum limits. If\n tokenization is enabled, it uses a Hugging Face tokenizer to count words. The key metric\n `num_words` is computed and stored in the sample's stats under the `num_words` field. If\n the word count is already cached, it reuses the cached value to avoid redundant\n computation.", "arguments": " lang (): sample in which language.\n tokenization (): whether to use model to tokenize documents\n min_num (): The min filter word number in this op, samples\n will be filtered if their word number is below this\n parameter.\n max_num (): The max filter word number in this op, samples\n will be filtered if their word number exceeds this\n parameter.\n"}, {"index": 152, "class_name": "naive_grouper", "class_desc": "Group all samples in a dataset into a single batched sample.\n\n This operator takes a dataset and combines all its samples into one batched sample. If\n the input dataset is empty, it returns an empty dataset. 
The resulting batched sample is\n a dictionary where each key corresponds to a list of values from all samples in the\n dataset.", "arguments": ""}, {"index": 153, "class_name": "key_value_grouper", "class_desc": "Groups samples into batches based on values in specified keys.\n\n This operator groups samples by the values of the given keys, which can be nested. If no\n keys are provided, it defaults to using the text key. It uses a naive grouping strategy\n to batch samples with identical key values. The resulting dataset is a list of batched\n samples, where each batch contains samples that share the same key values. This is\n useful for organizing data by specific attributes or features.", "arguments": " group_by_keys (typing.Optional[typing.List[str]]): group samples according values in the keys.\n Support for nested keys such as \"__dj__stats__.text_len\".\n It is [self.text_key] in default.\n"}, {"index": 154, "class_name": "naive_reverse_grouper", "class_desc": "Split batched samples into individual samples.\n\n This operator processes a dataset by splitting each batched sample into\n individual samples. It also handles and optionally exports batch metadata.\n - If a sample contains 'batch_meta', it is separated and can be exported\n to a specified path.\n - The operator converts the remaining data from a dictionary of lists\n to a list of dictionaries, effectively unbatching the samples.\n - If `batch_meta_export_path` is provided, the batch metadata is written\n to this file in JSON format, one entry per line.\n - If no samples are present in the dataset, the original dataset is returned.", "arguments": " batch_meta_export_path (): the path to export the batch meta.\n Just drop the batch meta if it is None.\n"}, {"index": 155, "class_name": "frequency_specified_field_selector", "class_desc": "Selector to filter samples based on the frequency of a specified field.\n\n This operator selects samples based on the frequency of values in a specified field. 
The\n field can be multi-level, with keys separated by dots. It supports filtering by either a\n top ratio or a fixed number (topk) of the most frequent values. If both top_ratio and\n topk are provided, the one resulting in fewer samples is used. The sorting order can be\n controlled with the reverse parameter. The operator processes the dataset and returns a\n new dataset containing only the selected samples.", "arguments": " field_key (): Selector based on the specified value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n top_ratio (typing.Optional[typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]]): Ratio of selected top specified field value,\n samples will be selected if their specified field values are\n within this parameter. When both topk and top_ratio are set,\n the value corresponding to the smaller number of samples\n will be applied.\n topk (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): Number of selected top specified field value,\n samples will be selected if their specified field values are\n within this parameter. When both topk and top_ratio are set,\n the value corresponding to the smaller number of samples\n will be applied.\n reverse (): Determine the sorting rule, if reverse=True,\n then sort in descending order.\n"}, {"index": 156, "class_name": "random_selector", "class_desc": "Randomly selects a subset of samples from the dataset.\n\n This operator randomly selects a subset of samples based on either a specified ratio or\n a fixed number. If both `select_ratio` and `select_num` are provided, the one that\n results in fewer samples is used. The selection is skipped if the dataset has only one\n or no samples. 
The `random_sample` function is used to perform the actual sampling.\n\n - `select_ratio`: The ratio of samples to select (0 to 1).\n - `select_num`: The exact number of samples to select.\n - If neither `select_ratio` nor `select_num` is set, the dataset remains unchanged.", "arguments": " select_ratio (typing.Optional[typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]]): The ratio to select. When both\n select_ratio and select_num are set, the value corresponding\n to the smaller number of samples will be applied.\n select_num (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The number of samples to select. When both\n select_ratio and select_num are set, the value corresponding\n to the smaller number of samples will be applied.\n"}, {"index": 157, "class_name": "range_specified_field_selector", "class_desc": "Selects a range of samples based on the sorted values of a specified field.\n\n This operator selects samples whose values for a specified field fall within a given\n range. The range can be defined using percentiles or ranks, and the operator will use\n the more inclusive bounds if both are provided. The field values are first sorted in\n ascending order, and then the samples are selected based on the lower and upper bounds.\n If no bounds are provided, the original dataset is returned. The operator ensures that\n the specified field exists in the dataset and handles multi-level fields by separating\n keys with dots.", "arguments": " field_key (): Selector based on the specified value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n lower_percentile (typing.Optional[typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]]): The lower bound of the percentile to\n be sample, samples will be selected if their specified field\n values are greater than this lower bound. 
When both\n lower_percentile and lower_rank are set, the value corresponding\n to the larger number of samples will be applied.\n upper_percentile (typing.Optional[typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]]): The upper bound of the percentile to\n be sample, samples will be selected if their specified field\n values are less or equal to the upper bound. When both\n upper_percentile and upper_rank are set, the value corresponding\n to the smaller number of samples will be applied.\n lower_rank (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The lower bound of the rank to be sample,\n samples will be selected if their specified field values are\n greater than this lower bound. When both lower_percentile and\n lower_rank are set, the value corresponding to the larger number\n of samples will be applied.\n upper_rank (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): The upper bound of the rank to be sample,\n samples will be selected if their specified field values are\n less or equal to the upper bound. When both upper_percentile and\n upper_rank are set, the value corresponding to the smaller number\n of samples will be applied.\n"}, {"index": 158, "class_name": "tags_specified_field_selector", "class_desc": "Selector to filter samples based on the tags of a specified field.\n\n This operator selects samples where the value of the specified field matches one of the\n target tags. The field can be multi-level, with levels separated by dots (e.g.,\n 'level1.level2'). The operator checks if the specified field exists in the dataset and\n if the field value is a string, number, or None. If the field value matches any of the\n target tags, the sample is kept. 
The selection is case-sensitive.\n\n - The `field_key` parameter specifies the field to check.\n - The `target_tags` parameter is a list of tags to match against the field value.\n - If the dataset has fewer than two samples or if `field_key` is empty, the dataset is\n returned unchanged.", "arguments": " field_key (): Selector based on the specified value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n target_tags (typing.List[str]): Target tags to be select.\n"}, {"index": 159, "class_name": "topk_specified_field_selector", "class_desc": "Selects top samples based on the sorted values of a specified field.\n\n This operator selects the top samples from a dataset based on the values of a specified\n field. The field can be multi-level, with keys separated by dots. The selection is based\n on either a specified ratio of the dataset or a fixed number of top samples. If both\n `top_ratio` and `topk` are provided, the one resulting in fewer samples is used. The\n sorting order can be ascending or descending, controlled by the `reverse` parameter. The\n key metric is the value of the specified field, and the operator uses this to determine\n which samples to keep.", "arguments": " field_key (): Selector based on the specified value\n corresponding to the target key. The target key\n corresponding to multi-level field information need to be\n separated by '.'.\n top_ratio (typing.Optional[typing.Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])]]): Ratio of selected top samples, samples will be\n selected if their specified field values are within this\n parameter. 
When both topk and top_ratio are set, the value\n corresponding to the smaller number of samples will be\n applied.\n topk (typing.Optional[typing.Annotated[int, Gt(gt=0)]]): Number of selected top sample, samples will be\n selected if their specified field values are within this\n parameter. When both topk and top_ratio are set, the value\n corresponding to the smaller number of samples will be\n applied.\n reverse (): Determine the sorting rule, if reverse=True,\n then sort in descending order.\n"}] \ No newline at end of file diff --git a/data_juicer_agent/tools/op_manager/op_retrieval.py b/data_juicer_agent/tools/op_manager/op_retrieval.py new file mode 100644 index 0000000..c9fbb88 --- /dev/null +++ b/data_juicer_agent/tools/op_manager/op_retrieval.py @@ -0,0 +1,380 @@ +import os +import os.path as osp +import json +import logging +import pickle +import hashlib +import time +from typing import Optional + +from langchain_community.vectorstores import FAISS + +TOOLS_INFO_PATH = osp.join(osp.dirname(__file__), "dj_funcs_all.json") +CACHE_RETRIEVED_TOOLS_PATH = osp.join(osp.dirname(__file__), "cache_retrieve") +VECTOR_INDEX_CACHE_PATH = osp.join(osp.dirname(__file__), "vector_index_cache") + +# Global variable to cache the vector store +_cached_vector_store: Optional[FAISS] = None +_cached_tools_info: Optional[list] = None +_cached_file_hash: Optional[str] = None + +RETRIEVAL_PROMPT = """You are a professional tool retrieval assistant responsible for filtering the top {limit} most relevant tools from a large tool library based on user requirements. Execute the following steps: + +# Requirement Analysis + Carefully read the user's [requirement description], extract core keywords, functional objectives, usage scenarios, and technical requirements (such as real-time performance, data types, industry domains, etc.). 
+ +# Tool Matching + Perform multi-dimensional matching based on the following tool attributes: + - Tool name and functional description + - Supported input/output formats + - Applicable industry or scenario tags + - Technical implementation principles (API, local deployment, AI model types) + - Relevance ranking + +# Use weighted scoring mechanism (example weights): + - Functional match (40%) + - Scenario compatibility (30%) + - Technical compatibility (20%) + - User rating/usage rate (10%) + +# Deduplication and Optimization + Exclude the following low-quality results: + - Tools with duplicate functionality (keep only the best one) + - Tools that cannot meet basic requirements + - Tools missing critical parameter descriptions + +# Constraints + - Strictly control output to a maximum of {limit} tools + - Refuse to speculate on unknown tool attributes + - Maintain accuracy of domain expertise + +# Output Format + Return a JSON format TOP{limit} tool list containing: + [ + {{ + "rank": 1, + "tool_name": "Tool Name", + "description": "Core functionality summary", + "relevance_score": 98.7, + "key_match": ["Matching keywords/features"] + }} + ] + Output strictly in JSON array format, and only output the JSON array format tool list. 
+""" + + +def fast_text_encoder(text: str) -> str: + """Fast encoding using xxHash algorithm""" + import xxhash + + hasher = xxhash.xxh64(seed=0) + hasher.update(text.encode("utf-8")) + + # Return 16-bit hexadecimal string + return hasher.hexdigest() + + +async def retrieve_ops_lm(user_query, limit=20): + """Tool retrieval using language model - returns list of tool names""" + hash_id = fast_text_encoder(user_query + str(limit)) + + # Ensure cache directory exists + os.makedirs(CACHE_RETRIEVED_TOOLS_PATH, exist_ok=True) + + cache_tools_path = osp.join(CACHE_RETRIEVED_TOOLS_PATH, f"{hash_id}.json") + if osp.exists(cache_tools_path): + with open(cache_tools_path, "r", encoding="utf-8") as f: + return json.loads(f.read()) + + if osp.exists(TOOLS_INFO_PATH): + with open(TOOLS_INFO_PATH, "r", encoding="utf-8") as f: + dj_func_info = json.loads(f.read()) + tool_descriptions = [ + f"{t['class_name']}: {t['class_desc']}" for t in dj_func_info + ] + tools_string = "\n".join(tool_descriptions) + else: + from create_dj_func_info import dj_func_info + + project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) + + with open(os.path.join(project_root, TOOLS_INFO_PATH), "w") as f: + f.write(json.dumps(dj_func_info)) + + tool_descriptions = [ + f"{t['class_name']}: {t['class_desc']}" for t in dj_func_info + ] + tools_string = "\n".join(tool_descriptions) + + from agentscope.model import DashScopeChatModel + from agentscope.message import Msg + from agentscope.formatter import DashScopeChatFormatter + + model = DashScopeChatModel( + model_name="qwen-turbo", + api_key=os.environ.get("DASHSCOPE_API_KEY"), + stream=False, + ) + + formatter = DashScopeChatFormatter() + + # Update retrieval prompt to use the specified limit + retrieval_prompt_with_limit = RETRIEVAL_PROMPT.format(limit=limit) + + user_prompt = ( + retrieval_prompt_with_limit + + """ +User requirement description: +{user_query} + +Available tools: +{tools_string} +""".format( + user_query=user_query, 
tools_string=tools_string + ) + ) + + msgs = [ + Msg(name="user", role="user", content=user_prompt), + ] + + formatted_msgs = await formatter.format(msgs) + + response = await model(formatted_msgs) + + msg = Msg(name="assistant", role="assistant", content=response.content) + retrieved_tools_text = msg.get_text_content() + retrieved_tools = json.loads(retrieved_tools_text) + + # Extract tool names and validate they exist + tool_names = [] + for tool_info in retrieved_tools: + if not isinstance(tool_info, dict) or "tool_name" not in tool_info: + logging.warning(f"Invalid tool info format: {tool_info}") + continue + + tool_name = tool_info["tool_name"] + + # Verify tool exists in dj_func_info + tool_exists = any(t["class_name"] == tool_name for t in dj_func_info) + if not tool_exists: + logging.error(f"Tool not found: `{tool_name}`, skipping!") + continue + + tool_names.append(tool_name) + + # Cache the result + with open(cache_tools_path, "w", encoding="utf-8") as f: + json.dump(tool_names, f) + + return tool_names + + +def _get_file_hash(file_path: str) -> str: + """Get file content hash using SHA256""" + try: + with open(file_path, "rb") as f: + file_content = f.read() + return hashlib.sha256(file_content).hexdigest() + except (OSError, IOError): + return "" + + +def _load_cached_index() -> bool: + """Load cached vector index from disk""" + global _cached_vector_store, _cached_tools_info, _cached_file_hash + + try: + # Ensure cache directory exists + os.makedirs(VECTOR_INDEX_CACHE_PATH, exist_ok=True) + + index_path = osp.join(VECTOR_INDEX_CACHE_PATH, "faiss_index") + metadata_path = osp.join(VECTOR_INDEX_CACHE_PATH, "metadata.json") + + if not all( + os.path.exists(p) for p in [index_path, metadata_path] + ): + return False + + # Check if cached index matches current tools info file + with open(metadata_path, "r") as f: + metadata = json.load(f) + + cached_hash = metadata.get("tools_info_hash", "") + current_hash = _get_file_hash(TOOLS_INFO_PATH) + + if 
current_hash != cached_hash: + return False + + # Load cached data + from langchain_community.embeddings import DashScopeEmbeddings + + embeddings = DashScopeEmbeddings( + dashscope_api_key=os.environ.get("DASHSCOPE_API_KEY"), + model="text-embedding-v1", + ) + + _cached_vector_store = FAISS.load_local( + index_path, embeddings, allow_dangerous_deserialization=True + ) + + _cached_file_hash = cached_hash + + logging.info("Successfully loaded cached vector index") + return True + + except Exception as e: + logging.warning(f"Failed to load cached index: {e}") + return False + + +def _save_cached_index(): + """Save vector index to disk cache""" + global _cached_vector_store, _cached_file_hash + + try: + # Ensure cache directory exists + os.makedirs(VECTOR_INDEX_CACHE_PATH, exist_ok=True) + + index_path = osp.join(VECTOR_INDEX_CACHE_PATH, "faiss_index") + metadata_path = osp.join(VECTOR_INDEX_CACHE_PATH, "metadata.json") + + # Save vector store + if _cached_vector_store: + _cached_vector_store.save_local(index_path) + + # Save metadata + metadata = {"tools_info_hash": _cached_file_hash, "created_at": time.time()} + with open(metadata_path, "w") as f: + json.dump(metadata, f) + + logging.info("Successfully saved vector index to cache") + + except Exception as e: + logging.error(f"Failed to save cached index: {e}") + + +def _build_vector_index(): + """Build and cache vector index""" + global _cached_vector_store, _cached_file_hash + + with open(TOOLS_INFO_PATH, "r", encoding="utf-8") as f: + tools_info = json.loads(f.read()) + + tool_descriptions = [f"{t['class_name']}: {t['class_desc']}" for t in tools_info] + + from langchain_community.embeddings import DashScopeEmbeddings + + embeddings = DashScopeEmbeddings( + dashscope_api_key=os.environ.get("DASHSCOPE_API_KEY"), model="text-embedding-v1" + ) + + metadatas = [{"index": i} for i in range(len(tool_descriptions))] + vector_store = FAISS.from_texts(tool_descriptions, embeddings, metadatas=metadatas) + + # Cache the 
results + _cached_vector_store = vector_store + _cached_file_hash = _get_file_hash(TOOLS_INFO_PATH) + + # Save to disk cache + _save_cached_index() + + logging.info("Successfully built and cached vector index") + + +def retrieve_ops_vector(user_query, limit=20): + """Tool retrieval using vector search with caching - returns list of tool names""" + global _cached_vector_store + + # Try to load from cache first + if not _load_cached_index(): + logging.info("Building new vector index...") + _build_vector_index() + + # Perform similarity search + retrieved_tools = _cached_vector_store.similarity_search(user_query, k=limit) + retrieved_indices = [doc.metadata["index"] for doc in retrieved_tools] + + with open(TOOLS_INFO_PATH, "r", encoding="utf-8") as f: + tools_info = json.loads(f.read()) + + # Extract tool names from retrieved indices + tool_names = [] + for raw_idx in retrieved_indices: + tool_info = tools_info[raw_idx] + tool_names.append(tool_info["class_name"]) + + return tool_names + + +async def retrieve_ops(user_query: str, limit: int = 20, mode: str = "auto") -> list: + """ + Tool retrieval with configurable mode + + Args: + user_query: User query string + limit: Maximum number of tools to retrieve + mode: Retrieval mode - "llm", "vector", or "auto" (default: "auto") + - "llm": Use language model only + - "vector": Use vector search only + - "auto": Try LLM first, fallback to vector search on failure + + Returns: + List of tool names + """ + if mode == "llm": + try: + return await retrieve_ops_lm(user_query, limit=limit) + except Exception as e: + logging.error(f"LLM retrieval failed: {str(e)}") + return [] + + elif mode == "vector": + try: + return retrieve_ops_vector(user_query, limit=limit) + except Exception as e: + logging.error(f"Vector retrieval failed: {str(e)}") + return [] + + elif mode == "auto": + try: + return await retrieve_ops_lm(user_query, limit=limit) + except Exception as e: + import traceback + + print(traceback.format_exc()) + try: + 
if __name__ == "__main__":
    import asyncio

    user_query = (
        "Clean special characters from text and filter samples with excessive length. Mask sensitive information and filter unsafe content including adult/terror-related terms."
        + "Additionally, filter out small images, perform image tagging, and remove duplicate images."
    )

    # Exercise each retrieval mode once; "auto" falls back from LLM to vector.
    print("=== Testing LLM mode ===")
    tool_names_llm = asyncio.run(retrieve_ops(user_query, limit=10, mode="llm"))
    print("Retrieved tool names (LLM):")
    print(tool_names_llm)

    print("\n=== Testing Vector mode ===")
    tool_names_vector = asyncio.run(retrieve_ops(user_query, limit=10, mode="vector"))
    print("Retrieved tool names (Vector):")
    print(tool_names_vector)

    print("\n=== Testing Auto mode (default) ===")
    tool_names_auto = asyncio.run(retrieve_ops(user_query, limit=10, mode="auto"))
    print("Retrieved tool names (Auto):")
    print(tool_names_auto)


# --- data_juicer_agent/tools/router_tools.py ---
# -*- coding: utf-8 -*-
"""Router agent using implicit routing"""
from typing import Callable, Optional

from agentscope.agent import AgentBase
from agentscope.message import Msg
from agentscope.tool import ToolResponse


def agent_to_tool(
    agent: AgentBase,
    tool_name: Optional[str] = None,
    description: Optional[str] = None,
) -> Callable:
    """
    Convert any agent to a tool function that can be registered in toolkit.

    Args:
        agent: The agent instance to convert
        tool_name: Optional custom tool name (defaults to agent.name)
        description: Optional tool description (defaults to agent's docstring
            or a sys_prompt excerpt)

    Returns:
        A tool function that can be registered with
        toolkit.register_tool_function()
    """
    if tool_name is None:
        tool_name = getattr(agent, "name", "agent_tool")

    if description is None:
        # Prefer an explicit docstring, then a (possibly private) sys_prompt.
        sys_prompt = getattr(agent, "sys_prompt", None) or getattr(
            agent, "_sys_prompt", None
        )
        if getattr(agent, "__doc__", None):
            description = agent.__doc__.strip()
        elif sys_prompt:
            excerpt = sys_prompt[:100]
            # Only append an ellipsis when the prompt was actually truncated.
            suffix = "..." if len(sys_prompt) > 100 else ""
            description = f"Agent: {excerpt}{suffix}"
        else:
            description = f"Tool function for {tool_name}"

    async def tool_function(task: str) -> ToolResponse:
        # Route the task to the wrapped agent as a user message.
        msg = Msg("user", task, "user")
        result = await agent(msg)

        if hasattr(result, "get_content_blocks"):
            content = result.get_content_blocks("text")
            return ToolResponse(
                content=content,
                metadata={
                    "agent_name": getattr(agent, "name", "unknown"),
                    "task": task,
                },
            )
        raise ValueError(f"Not a valid Msg object: {result}")

    # Expose a meaningful name and docstring to the toolkit registry.
    tool_function.__name__ = f"call_{tool_name.lower().replace(' ', '_')}"
    tool_function.__doc__ = (
        f"{description}\n\nArgs:\n    task (str): The task for {tool_name} to handle"
    )

    return tool_function