# evotraders/tuner/frozen_lake/_utils.py

# -*- coding: utf-8 -*-
"""
Utils for the FrozenLake environment.
Modified from rllm
"""

from collections import deque
from typing import Literal, Optional, Tuple

import numpy as np
from pydantic import BaseModel, Field

# Integer code assigned to each byte tile of a gym map description.
MAP_LOOKUP = dict(
    zip(
        (b"P", b"F", b"H", b"G"),  # player, frozen, hole, goal
        range(4),
    ),
)

# Text cell used for each grid code when rendering the observation as text.
GRID_LOOKUP = {
    code: f" {symbol} \t"
    for code, symbol in enumerate(
        (
            "P",  # player
            "_",  # frozen
            "O",  # hole
            "G",  # goal
            "X",  # player fall into hole
            "√",  # player on goal
        ),
    )
}

# Name of each action index (index 0 stands for "no action").
ACTION_LOOKUP = dict(enumerate(("None", "Left", "Down", "Right", "Up")))

# System prompt text for the FrozenLake task. It describes the board symbols,
# the rules, the four valid actions and the rewards, and asks for the final
# action to be written inside a ``` ``` block.
# Prompting format inspired by the RAGEN project.
SYSTEM_PROMPT = """You are Qwen, created by Alibaba Cloud. \
You are a helpful assistant. You are walking on a frozen lake.

FrozenLake Quick Guide
Goal: Reach the goal (G). Player (P) and Goal (G) must overlap.

Symbols:
_ Frozen | O Hole | G Goal | P Player

Rules:
1. Avoid falling into holes (O).
2. Frozen tiles are slippery, you may move perpendicular to
   your intended direction.

Valid Action (separated by | ):
Up | Down | Left | Right

Rewards:
Fall into hole: 0
Reach goal: +1.0

You will be provided the current observation, please decide on
the next Action.
You should show your thought process and then input the final
action in ``` ```.
You should only output the NEXT ACTION at each iteration in
the ``` ```. For example, if you want to move up, you should
output ```Up```.
You should plan ahead and need to achieve it in minimum number
of steps.
You should be aware that frozen tiles can be slippery, but the
chance is small and you should not overthink it.

Please show your thinking process and put the final action in
``` ```. In every turn, the final action MUST be one of Up,
Down, Left, Right.
"""


class FrozenLakeAction(BaseModel):
    """Structured action for the FrozenLake environment.

    Holds a single required field, ``action``, restricted to the four
    movement directions.
    """

    # ``...`` marks the field as required (no default value).
    action: Literal["Up", "Down", "Left", "Right"] = Field(
        ...,
        description="The action to take in the FrozenLake environment, "
        "must be one of Up, Down, Left, Right",
    )


def is_valid(board: list[list[str]], max_size: int, max_steps: int) -> bool:
    """Check that the goal can be reached from the start within a move budget.

    The board is explored breadth-first over its non-hole tiles, so every
    tile is first reached along a shortest path. A depth-limited depth-first
    search that shares one visited set can reach a tile at an inflated depth
    first and then wrongly reject a board whose goal is reachable in time.

    Args:
        board: The board as a list of rows, one single-character string per
            tile: "S" is the start, "G" the goal, "H" a hole; every other
            tile can be walked on.
        max_size: Side length of the square board, used for bounds checks.
        max_steps: Maximum number of moves allowed from start to goal.

    Returns:
        True if the goal is at most ``max_steps`` moves away from the start
        without stepping on a hole, False otherwise (including when the
        board contains no start tile).
    """
    start_rows, start_cols = np.where(np.array(board) == "S")
    if start_rows.size == 0:
        return False  # no start tile, hence no path

    start = (int(start_rows[0]), int(start_cols[0]))
    discovered = {start}
    frontier = deque([(start[0], start[1], 0)])  # row, col, moves so far

    while frontier:
        r, c, steps = frontier.popleft()
        if steps >= max_steps:
            # BFS pops tiles in non-decreasing distance, so no tile left in
            # the queue can afford another move either.
            break

        for d_r, d_c in ((1, 0), (0, 1), (-1, 0), (0, -1)):
            r_new, c_new = r + d_r, c + d_c
            if not (0 <= r_new < max_size and 0 <= c_new < max_size):
                continue
            if (r_new, c_new) in discovered:
                continue

            tile = board[r_new][c_new]
            if tile == "G":
                # The goal is reached on move ``steps + 1 <= max_steps``.
                return True
            if tile != "H":
                # Mark on enqueue so each tile enters the queue only once.
                discovered.add((r_new, c_new))
                frontier.append((r_new, c_new, steps + 1))
    return False


def generate_random_map(
    size: int = 8,
    p: float = 0.8,
    seed: int = 0,
    max_steps: int = 5,
) -> Tuple[list[str], Tuple[int, int]]:
    """Generate a random map whose goal is reachable from the start.

    Boards are drawn until ``is_valid`` accepts one. Each candidate is a
    ``size`` x ``size`` grid of frozen ("F") and hole ("H") tiles with a
    start ("S") and a goal ("G") placed on two different random tiles.

    Args:
        size: Size of each side of the grid; must be at least 2 so that the
            start and the goal can occupy different tiles.
        p: Probability that a tile is frozen (values above 1 are clamped
            to 1).
        seed: Seed to ensure the generation of reproducible maps.
        max_steps: Maximum number of steps allowed; must be at least 1
            because the start and the goal are always distinct tiles.

    Returns:
        A tuple containing the map as a list of row strings and the goal
        position (row, col).

    Raises:
        ValueError: If ``size`` < 2 or ``max_steps`` < 1; no valid map can
            exist for such arguments and the search would never finish.
        ImportError: If gymnasium is not installed.
    """
    if size < 2:
        raise ValueError(
            f"size must be at least 2 to place a start and a goal, "
            f"got {size}",
        )
    if max_steps < 1:
        raise ValueError(
            f"max_steps must be at least 1, got {max_steps}",
        )

    # Keep the try body down to the import, the only line expected to fail.
    try:
        from gymnasium.utils import seeding
    except ImportError as exc:
        raise ImportError(
            "Gymnasium is not installed. "
            "Please install gymnasium first before "
            "running the frozen_lake workflow.",
        ) from exc

    np_random, _ = seeding.np_random(seed)
    p = min(1, p)  # clamp once; the value does not change between attempts

    while True:
        board: list[list[str]] = np_random.choice(
            ["F", "H"],
            (size, size),
            p=[p, 1 - p],
        ).tolist()

        # Draw start and goal together until they land on different tiles.
        while True:
            start_r = int(np_random.integers(0, size))
            start_c = int(np_random.integers(0, size))
            goal_r = int(np_random.integers(0, size))
            goal_c = int(np_random.integers(0, size))
            if (start_r, start_c) != (goal_r, goal_c):
                break

        board[start_r][start_c] = "S"
        board[goal_r][goal_c] = "G"

        if is_valid(board, size, max_steps):
            return ["".join(row) for row in board], (goal_r, goal_c)


def get_goal_position(
    random_map: np.ndarray,
) -> Optional[Tuple[int, int]]:
    """Get the goal position from a random map.

    Args:
        random_map: The map as a numpy array of byte tiles; the goal is the
            element equal to ``b"G"``.

    Returns:
        Tuple of (row, col) as plain Python ints for the first goal tile
        found, None if the map has no goal tile.
    """
    positions = np.argwhere(random_map == b"G")
    if positions.size == 0:
        return None  # G not found
    # ``tolist`` turns the numpy integer scalars into built-in ints, which
    # matches the annotation and keeps the result JSON-serialisable.
    return tuple(positions[0].tolist())


# Names exported by a star-import of this module. The lookup tables
# (MAP_LOOKUP, GRID_LOOKUP, ACTION_LOOKUP) and is_valid are not listed.
__all__ = [
    "SYSTEM_PROMPT",
    "FrozenLakeAction",
    "generate_random_map",
    "get_goal_position",
]