Add examples for frozenlake and emailsearch (#94)

2026-01-19 12:25:13 +08:00
parent 3821fb04ac
commit 654c35127a
26 changed files with 3370 additions and 14 deletions
--- a/tuner/frozen_lake/_utils.py
+++ b/tuner/frozen_lake/_utils.py
@@ -0,0 +1,209 @@
+# -*- coding: utf-8 -*-
+"""
+Utils for the FrozenLake environment.
+Modified from rllm
+"""
+
+from typing import Literal, Optional, Tuple
+import numpy as np
+from pydantic import BaseModel, Field
+
+# Integer code assigned to each gym tile byte, in the order
+# player, frozen, hole, goal (codes 0-3, as used by GRID_LOOKUP).
+MAP_LOOKUP = {
+    tile: code for code, tile in enumerate((b"P", b"F", b"H", b"G"))
+}
+
+# Text cell used for each integer tile code when the board is rendered as
+# an observation; every cell is space-padded and terminated by a tab.
+GRID_LOOKUP = dict(
+    enumerate(
+        (
+            " P \t",  # 0: player
+            " _ \t",  # 1: frozen
+            " O \t",  # 2: hole
+            " G \t",  # 3: goal
+            " X \t",  # 4: player fell into a hole
+            " √ \t",  # 5: player on goal
+        ),
+    ),
+)
+
+# Display name for each integer action id. Id 0 maps to the string "None"
+# (presumably "no valid action" -- confirm against the environment code).
+ACTION_LOOKUP = dict(
+    enumerate(("None", "Left", "Down", "Right", "Up")),
+)
+
+# Prompting format inspired by the RAGEN project.
+# System prompt for the agent: it explains the board symbols, the rules and
+# the rewards, and asks for the next move as one of Up/Down/Left/Right
+# wrapped in triple backticks. The trailing backslash on the first line
+# joins the two opening sentences without a line break.
+# NOTE(review): GRID_LOOKUP can also render "X" and "√", which the
+# "Symbols" line below does not list -- confirm whether the model ever
+# sees those cells.
+SYSTEM_PROMPT = """You are Qwen, created by Alibaba Cloud. \
+You are a helpful assistant. You are walking on a frozen lake.
+
+FrozenLake Quick Guide
+Goal: Reach the goal (G). Player (P) and Goal (G) must overlap.
+
+Symbols:
+_ Frozen | O Hole | G Goal | P Player
+
+Rules:
+1. Avoid falling into holes (O).
+2. Frozen tiles are slippery, you may move perpendicular to
+   your intended direction.
+
+Valid Action (separated by | ):
+Up | Down | Left | Right
+
+Rewards:
+Fall into hole: 0
+Reach goal: +1.0
+
+You will be provided the current observation, please decide on
+the next Action.
+You should show your thought process and then input the final
+action in ``` ```.
+You should only output the NEXT ACTION at each iteration in
+the ``` ```. For example, if you want to move up, you should
+output ```Up```.
+You should plan ahead and need to achieve it in minimum number
+of steps.
+You should be aware that frozen tiles can be slippery, but the
+chance is small and you should not overthink it.
+
+Please show your thinking process and put the final action in
+``` ```. In every turn, the final action MUST be one of Up,
+Down, Left, Right.
+"""
+
+
+class FrozenLakeAction(BaseModel):
+    """Structured move for a single FrozenLake turn.
+
+    The one required field, ``action``, accepts only the four direction
+    names listed in its ``Literal`` type.
+    """
+
+    # ``...`` states explicitly that the field has no default (required).
+    action: Literal["Up", "Down", "Left", "Right"] = Field(
+        ...,
+        description=(
+            "The action to take in the FrozenLake environment, "
+            "must be one of Up, Down, Left, Right"
+        ),
+    )
+
+
+def is_valid(board: list[list[str]], max_size: int, max_steps: int) -> bool:
+    """Check that the goal can be reached from the start within ``max_steps``.
+
+    Runs a breadth-first search from the first ``"S"`` tile (row-major
+    order), moving one tile down, right, up or left per step and never
+    entering an ``"H"`` tile. Breadth-first order reaches every tile along
+    a shortest path first, so the step budget is applied to the true
+    distance rather than to whichever route happened to be explored first.
+
+    Args:
+        board: The board as a sequence of rows. Each row must be indexable
+            by column and yield single-character tiles, so a list of lists
+            and a list of strings both work.
+        max_size: Row and column indices outside ``[0, max_size)`` are
+            treated as off the board.
+        max_steps: Maximum number of moves allowed to reach ``"G"``.
+
+    Returns:
+        True if ``"G"`` is reachable in at most ``max_steps`` moves, False
+        otherwise (including when the board contains no ``"S"`` tile).
+    """
+    # Locate the start tile without numpy so string rows are supported too.
+    start = next(
+        (
+            (row_idx, col_idx)
+            for row_idx, row in enumerate(board)
+            for col_idx, tile in enumerate(row)
+            if tile == "S"
+        ),
+        None,
+    )
+    if start is None:
+        return False
+
+    moves = ((1, 0), (0, 1), (-1, 0), (0, -1))
+    visited = {start}
+    frontier = [start]
+    # Pass k expands the tiles k - 1 moves away, so it inspects exactly the
+    # tiles that are k moves from the start; stopping after ``max_steps``
+    # passes enforces the budget without an off-by-one.
+    for _ in range(max_steps):
+        next_frontier = []
+        for row_idx, col_idx in frontier:
+            for d_row, d_col in moves:
+                r_new, c_new = row_idx + d_row, col_idx + d_col
+                if not (0 <= r_new < max_size and 0 <= c_new < max_size):
+                    continue
+                if (r_new, c_new) in visited:
+                    continue
+                tile = board[r_new][c_new]
+                if tile == "G":
+                    return True
+                if tile != "H":
+                    visited.add((r_new, c_new))
+                    next_frontier.append((r_new, c_new))
+        if not next_frontier:
+            break  # nothing left to expand: the goal is walled off
+        frontier = next_frontier
+    return False
+
+
+def generate_random_map(
+    size: int = 8,
+    p: float = 0.8,
+    seed: int = 0,
+    max_steps: int = 5,
+) -> Tuple[list[str], Tuple[int, int]]:
+    """Generate a random map that passes the ``is_valid`` path check.
+
+    Boards are drawn repeatedly from a generator seeded with ``seed`` until
+    ``is_valid(board, size, max_steps)`` accepts one. Each draw fills the
+    grid with ``"F"``/``"H"`` tiles and then overwrites two distinct random
+    cells with ``"S"`` (start) and ``"G"`` (goal).
+
+    Args:
+        size: Size of each side of the grid. Must be at least 2, because
+            the start and the goal have to occupy different cells.
+        p: Probability that a tile is frozen; values above 1 are clamped
+            to 1.
+        seed: Seed to ensure the generation of reproducible maps.
+        max_steps: Step budget handed to ``is_valid``. Must be at least 1,
+            because a distinct start and goal cannot be joined in fewer
+            moves.
+
+    Returns:
+        A tuple ``(rows, (goal_row, goal_col))`` where ``rows`` holds one
+        string per board row.
+
+    Raises:
+        ValueError: If ``size`` is below 2 or ``max_steps`` is below 1;
+            rejecting these up front avoids a search that cannot finish.
+        ImportError: If gymnasium is not installed.
+    """
+    if size < 2:
+        raise ValueError(
+            f"size must be at least 2 to place a distinct start and goal, "
+            f"got {size}",
+        )
+    if max_steps < 1:
+        raise ValueError(f"max_steps must be at least 1, got {max_steps}")
+
+    try:
+        from gymnasium.utils import seeding
+    except ImportError as exc:
+        raise ImportError(
+            "Gymnasium is not installed. "
+            "Please install gymnasium first before "
+            "running the frozen_lake workflow.",
+        ) from exc
+
+    np_random, _ = seeding.np_random(seed)
+    p = min(1, p)  # loop-invariant clamp of the frozen-tile probability
+
+    while True:
+        board = np_random.choice(
+            ["F", "H"],
+            (size, size),
+            p=[p, 1 - p],
+        ).tolist()
+
+        # The order of the four draws is part of the seeded sequence; keep
+        # it fixed so a given seed keeps producing the same candidates.
+        while True:
+            start_r = int(np_random.integers(0, size))
+            start_c = int(np_random.integers(0, size))
+            goal_r = int(np_random.integers(0, size))
+            goal_c = int(np_random.integers(0, size))
+
+            # Ensure start and goal are different positions
+            if (start_r, start_c) != (goal_r, goal_c):
+                break
+
+        board[start_r][start_c] = "S"
+        board[goal_r][goal_c] = "G"
+
+        if is_valid(board, size, max_steps):
+            return ["".join(row) for row in board], (goal_r, goal_c)
+
+
+def get_goal_position(
+    random_map: np.ndarray,
+) -> Optional[Tuple[int, int]]:
+    """Get the goal position from a random map.
+
+    The map is compared element-wise against the byte string ``b"G"``, so
+    it is expected to hold byte tiles (for example dtype ``"S1"``).
+
+    Args:
+        random_map: The map as a numpy array.
+
+    Returns:
+        The indices of the first ``b"G"`` element in row-major order as
+        built-in ``int`` values -- ``(row, col)`` for a 2-D map -- or None
+        if the map contains no goal.
+    """
+    positions = np.argwhere(random_map == b"G")
+    if positions.size == 0:
+        return None  # G not found
+    # Convert numpy integers to plain ints so the result matches the
+    # annotation and stays usable where numpy scalars are not (e.g. JSON).
+    return tuple(int(index) for index in positions[0])
+
+
+# Names exported by ``from ... import *``. ``is_valid`` and the
+# ``*_LOOKUP`` tables are not listed, so a star-import does not bring them
+# in; they remain importable by explicit name.
+__all__ = [
+    "SYSTEM_PROMPT",
+    "FrozenLakeAction",
+    "generate_random_map",
+    "get_goal_position",
+]