diff --git a/.github/workflows/deploy-hf-env.yml b/.github/workflows/deploy-hf-env.yml index d84833df..12849a3f 100644 --- a/.github/workflows/deploy-hf-env.yml +++ b/.github/workflows/deploy-hf-env.yml @@ -15,6 +15,7 @@ on: - 'chat_env' - 'atari_env' - 'openspiel_env' + - 'maze_env' custom_environment: description: 'Custom environment to deploy (leave empty for none)' required: false @@ -63,7 +64,7 @@ jobs: if [ "${{ github.event.inputs.environment }}" = "all" ]; then echo "deploy_all=true" >> $GITHUB_OUTPUT echo "use_matrix=true" >> $GITHUB_OUTPUT - echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT + echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT echo "Manual trigger - deploying all environments with matrix" else echo "deploy_all=false" >> $GITHUB_OUTPUT @@ -78,14 +79,14 @@ jobs: if git diff --name-only HEAD~1 HEAD | grep -E '^src/core/' > /dev/null; then echo "deploy_all=true" >> $GITHUB_OUTPUT echo "use_matrix=true" >> $GITHUB_OUTPUT - echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT + echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT echo "Core files changed - deploying all environments with matrix" exit 0 fi # Check which specific environments changed changed_envs=() - for env in echo_env coding_env chat_env atari_env openspiel_env; do + for env in echo_env coding_env chat_env atari_env openspiel_env maze_env; do if git diff --name-only HEAD~1 HEAD | grep -E "^src/envs/$env/" > /dev/null; then changed_envs+=("$env") fi @@ -110,7 +111,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env] + environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env, maze_env] permissions: contents: read diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 32452a1a..92062cd9 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -79,6 +79,8 @@ jobs: dockerfile: src/envs/atari_env/server/Dockerfile - name: git-env dockerfile: src/envs/git_env/server/Dockerfile + - name: maze-env + dockerfile: src/envs/maze_env/server/Dockerfile - name: my-env # Add your environment here dockerfile: src/envs/connect4_env/server/Dockerfile - name: textarena-env diff --git a/examples/maze_human.py b/examples/maze_human.py new file mode 100644 index 00000000..1efe3912 --- /dev/null +++ b/examples/maze_human.py @@ -0,0 +1,99 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Simple example of using Maze environment with OpenEnv. + +This demonstrates: +1. Connecting to the Maze environment server +2. Resetting the environment +3. Taking actions +4. Observing rewards +5. 
Inspecting environment state
+
+Usage:
+    python examples/maze_human.py
+"""
+
+import sys
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+import numpy as np
+from envs.maze_env import MazeEnv, MazeAction
+
+
+def main():
+    print("🧩 Interactive Maze Environment Example")
+    print("=" * 60)
+
+    # Connect to environment server
+    # Ensure server is running: python -m envs.maze_env.server.app
+    env = MazeEnv(base_url="http://localhost:8000")
+    maze = np.array([
+        [0, 1, 0, 0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 1, 0, 1, 0],
+        [0, 1, 0, 1, 0, 0, 0, 0],
+        [1, 0, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 0, 1, 1, 1],
+        [0, 1, 1, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 1, 0, 0]
+    ])
+    try:
+        # Reset environment
+        print("\nšŸ“ Resetting environment...")
+        result = env.reset()
+
+        print(f"   Initial position: {result.observation.position}")
+        print(f"   Legal actions: {result.observation.legal_actions}")
+        # Note: the initial total reward is 0, but it does not reset if you run this example again within the same server session
+        print(f"   Initial total reward: {result.observation.total_reward}")
+        # Run one episode
+        print("\n🚶 Navigating through maze...")
+        step = 0
+
+        while not result.done and step < 25:
+            # Show the current state and ask the user for one of the legal moves
+            print(f"   Current position: {result.observation.position}")
+            print(f"   Legal actions: {result.observation.legal_actions}")
+            env.render_ascii_maze(maze, result.observation.position, [0, 0], [maze.shape[0] - 1, maze.shape[1] - 1])
+            action_id = int(input("Enter one of the legal actions: "))
+            # Take action
+            result = env.step(MazeAction(action=action_id))
+            reward = result.observation.total_reward or 0
+
+            print(f"   Step {step + 1}: action={action_id}, pos={result.observation.position}, total_reward={reward:.2f}, done={result.done}")
+            step += 1
+            print("-----------------------------------------------------")
+
+        print("\nāœ… Episode finished!")
+        print(f"   Total steps: {step}")
+        print(f"   Total reward: {reward}")
+
+        # Get environment state
+        state = env.state()
+        print("\nšŸ“Š Environment State:")
+        print(f"   Episode ID: {state.episode_id}")
+        print(f"   Step count: {state.step_count}")
+        print(f"   Done: {state.done}")
+
+    except Exception as e:
+        print(f"\nāŒ Error: {e}")
+        print("\nMake sure the server is running:")
+        print("  python -m envs.maze_env.server.app")
+        print("\nOr start with Docker:")
+        print("  docker run -p 8000:8000 maze-env:latest")
+
+    finally:
+        env.close()
+        print("\nšŸ‘‹ Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/maze_simple.py b/examples/maze_simple.py
new file mode 100644
index 00000000..2b5f5e5f
--- /dev/null
+++ b/examples/maze_simple.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Simple example of using Maze environment with OpenEnv.
+
+This demonstrates:
+1. Connecting to the Maze environment server
+2. Resetting the environment
+3. Taking actions
+4. Observing rewards
+5. Inspecting environment state
+
+Usage:
+    python examples/maze_simple.py
+"""
+
+import sys
+from pathlib import Path
+
+# Add src to path
+sys.path.insert(0, str(Path(__file__).parent.parent / "src"))
+import numpy as np
+from envs.maze_env import MazeEnv, MazeAction
+
+
+def main():
+    print("🧩 Simple Maze Environment Example")
+    print("=" * 60)
+
+    # Connect to environment server
+    # Ensure server is running: python -m envs.maze_env.server.app
+    env = MazeEnv(base_url="http://localhost:8000")
+    maze = np.array([
+        [0, 1, 0, 0, 0, 0, 0, 0],
+        [0, 1, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 1, 0, 1, 0],
+        [0, 1, 0, 1, 0, 0, 0, 0],
+        [1, 0, 0, 1, 0, 1, 0, 0],
+        [0, 0, 0, 1, 0, 1, 1, 1],
+        [0, 1, 1, 0, 0, 0, 0, 0],
+        [0, 0, 0, 0, 0, 1, 0, 0]
+    ])
+    try:
+        # Reset environment
+        print("\nšŸ“ Resetting environment...")
+        result = env.reset()
+
+        print(f"   Initial position: {result.observation.position}")
+        print(f"   Legal actions: {result.observation.legal_actions}")
+        # Note: the initial total reward is 0, but it does not reset if you run this example again within the same server session
+        print(f"   Initial total reward: {result.observation.total_reward}")
+
+        # Run one episode
+        print("\n🚶 Navigating through maze...")
+        step = 0
+        total_reward = 0
+
+        while not result.done and step < 20:
+            # Cycle deterministically through the legal actions for this position
+            print(f"   Current position: {result.observation.position}")
+            print(f"   Legal actions: {result.observation.legal_actions}")
+            env.render_ascii_maze(maze, result.observation.position, [0, 0], [maze.shape[0] - 1, maze.shape[1] - 1])
+            action_id = result.observation.legal_actions[step % len(result.observation.legal_actions)]
+            # Take action
+            result = env.step(MazeAction(action=action_id))
+
+            reward = result.reward or 0
+            total_reward += reward
+
+            print(f"   Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}")
+            step += 1
+            print("-----------------------------------------------------")
+
+        print("\nāœ… Episode finished!")
+        print(f"   Total steps: {step}")
+        print(f"   Total reward: {total_reward}")
+
+        # Get environment state
+        state = env.state()
+        print("\nšŸ“Š Environment State:")
+        print(f"   Episode ID: {state.episode_id}")
+        print(f"   Step count: {state.step_count}")
+        print(f"   Done: {state.done}")
+
+    except Exception as e:
+        print(f"\nāŒ Error: {e}")
+        print("\nMake sure the server is running:")
+        print("  python -m envs.maze_env.server.app")
+        print("\nOr start with Docker:")
+        print("  docker run -p 8000:8000 maze-env:latest")
+
+    finally:
+        env.close()
+        print("\nšŸ‘‹ Done!")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/deploy_to_hf.sh b/scripts/deploy_to_hf.sh
index 20256c9a..ef212ffb 100755
--- a/scripts/deploy_to_hf.sh
+++ b/scripts/deploy_to_hf.sh
@@ -290,6 +290,13 @@ DOCKERFILE_EOF
         echo "OpenSpiel builds can take 10-15 minutes due to C++ compilation"
         return  # Skip the common parts since OpenSpiel has its own complete Dockerfile
         ;;
+    "maze_env")
+        cat >> "$CURRENT_STAGING_DIR/Dockerfile" << 'DOCKERFILE_EOF'
+# Install additional dependencies for MazeEnvironment
+RUN pip install --no-cache-dir numpy
+DOCKERFILE_EOF
+    # Maze env requires only numpy on top of the base image
+    ;;
 esac
 
 # Add common parts
diff --git a/scripts/prepare_hf_deployment.sh b/scripts/prepare_hf_deployment.sh
index 23fd4779..381edffd 100755
--- a/scripts/prepare_hf_deployment.sh
+++ b/scripts/prepare_hf_deployment.sh
@@ -157,6 +157,7 @@ README_EOF
        "chat_env") ENV_CLASS="ChatEnv" ;;
        "atari_env") ENV_CLASS="AtariEnv" ;;
        "openspiel_env") ENV_CLASS="OpenSpielEnv" ;;
+        "maze_env") ENV_CLASS="MazeEnv" ;;
"maze_env") ENV_CLASS="MazeEnv" ;; *) ENV_CLASS="Env" ;; esac diff --git a/src/envs/maze_env/README.md b/src/envs/maze_env/README.md new file mode 100644 index 00000000..c2b4e5cd --- /dev/null +++ b/src/envs/maze_env/README.md @@ -0,0 +1,123 @@ +# Maze Environment + +Integration of Maze game with the OpenEnv framework. + +## Architecture + +``` +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ RL Training Code (Client) │ +│ MazeEnv.step(action) │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ + │ HTTP +ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā” +│ FastAPI Server (Docker) │ +│ MazeEnvironment │ +│ ā”œā”€ Wraps Maze environment │ +│ └─ Agent controls player │ +ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜ +``` + +## Installation & Usage + +### Option 1: Local Development (without Docker) + +**Requirements:** +- Python 3.11+ +- Numpy + +```python +from envs.maze_env import MazeEnv, MazeAction + +# Start local server manually +# python -m envs.maze_env.server.app + +# Connect to local server +env = MazeEnv(base_url="http://localhost:8000") + +# Reset environment +result = env.reset() +print(f"Initial state: {result.observation.info_state}") +print(f"Legal actions: {result.observation.legal_actions}") + +# Take actions +for _ in range(10): + action_id = result.observation.legal_actions[0] # Choose first legal action + result = env.step(MazeAction(action_id=action_id)) + print(f"Reward: {result.reward}, Done: {result.done}") + if result.done: + break + +# Cleanup +env.close() +``` + +### Option 2: Docker (Recommended) + +**Build Docker image:** + +```bash +cd OpenEnv +docker build -f src/envs/maze_env/server/Dockerfile -t maze-env:latest . +``` + +**Use with from_docker_image():** + +```python +from envs.maze_env import MazeEnv, MazeAction + +# Automatically starts container +env = MazeEnv.from_docker_image("maze-env:latest") + +result = env.reset() +result = env.step(MazeAction(action_id=0)) + +env.close() # Stops container +``` + +## Configuration + +### Variables + +- `maze` : Maze as a numpy array saved in mazearray.py + +### Example + +```bash +docker run -p 8000:8000 maze-env:latest +``` + +## API Reference + +### MazeAction + +```python +@dataclass +class MazeAction(Action): + action: int # Action to be taken +``` + +### MazeObservation + +```python +@dataclass +class MazeObservation(Observation): + position: List[int] # [row, col] + total_reward: float # Total reward + legal_actions: List[int] = field(default_factory=list) # Legal action based on the current position +``` + +### MazeState + +```python +@dataclass +class MazeState(State): + episode_id: str # Episode + step_count: int # Number of steps + done: bool = False # Solve status + +``` + +## References + +- [Maze Environment](https://github.com/erikdelange/Reinforcement-Learning-Maze) diff --git a/src/envs/maze_env/__init__.py b/src/envs/maze_env/__init__.py new file mode 100644 index 00000000..0c2c79f7 --- /dev/null +++ b/src/envs/maze_env/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Maze Environment Integration. 
+ +This module provides integration between Maze game and the OpenEnv framework. +""" + +from .client import MazeEnv +from .models import MazeAction, MazeObservation, MazeState + +__all__ = ["MazeEnv", "MazeAction", "MazeObservation", "MazeState"] diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py new file mode 100644 index 00000000..dfbc1013 --- /dev/null +++ b/src/envs/maze_env/client.py @@ -0,0 +1,92 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +MazeEnv HTTP Client. + +This module provides the client for connecting to a Maze Environment server +over HTTP. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, TYPE_CHECKING + +from core.client_types import StepResult +from core.http_env_client import HTTPEnvClient + +from .models import MazeAction, MazeObservation, MazeState + +if TYPE_CHECKING: + pass + + +class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): + """HTTP client for Maze Environment.""" + + def render_ascii_maze( + self, + maze: List[List[int]], + position: List[int], + start: List[int], + goal: List[int], + ) -> None: + """ + Render the maze grid as ASCII art in the terminal. + - 0 = free cell + - 1 = wall + - S = start + - G = goal + - P = player + - E = exit + """ + print("\nCurrent Maze State:") + rows, cols = len(maze), len(maze[0]) + for r in range(rows): + line = "" + for c in range(cols): + if [r, c] == position: + line += "P " + elif [r, c] == start: + line += "S " + elif [r, c] == goal: + line += "G " + elif maze[r][c] == 1: + line += "ā–ˆ " + elif r == rows - 1 and c == cols - 1: + line += "E " + else: + line += ". " + print(line) + print() + + def _step_payload(self, action: MazeAction) -> Dict[str, Any]: + """Prepare payload to send to the environment server.""" + return {"action": action.action} + + def _parse_result(self, payload: Dict[str, Any]) -> StepResult[MazeObservation]: + """Parse the response from the server into MazeObservation + reward/done.""" + obs_data = payload.get("observation", {}) + + observation = MazeObservation( + position=obs_data.get("position", []), + total_reward=obs_data.get("total_reward", 0.0), + legal_actions=obs_data.get("legal_actions", []), + ) + + return StepResult( + observation=observation, + reward=payload.get("reward", 0.0), + done=payload.get("done", False), + ) + + def _parse_state(self, payload: Dict[str, Any]) -> MazeState: + """Parse environment state from payload.""" + return MazeState( + episode_id=payload.get("episode_id", ""), + step_count=payload.get("step_count", 0), + done=payload.get("done", False), + ) diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py new file mode 100644 index 00000000..35a00b14 --- /dev/null +++ b/src/envs/maze_env/models.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Data models for Maze Environment. + +This module defines the Action, Observation, and State types for Maze games. 
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass, field
+from typing import List
+
+from core.env_server import Action, Observation, State
+
+
+@dataclass
+class MazeAction(Action):
+    action: int
+
+
+@dataclass
+class MazeObservation(Observation):
+    position: List[int]  # [row, col]
+    total_reward: float
+    legal_actions: List[int] = field(default_factory=list)
+
+
+@dataclass
+class MazeState(State):
+    episode_id: str
+    step_count: int
+    done: bool = False
diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile
new file mode 100644
index 00000000..2d2e3d6f
--- /dev/null
+++ b/src/envs/maze_env/server/Dockerfile
@@ -0,0 +1,43 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Use the pre-built OpenEnv base image
+# Built from: docker build -t openenv-base:latest -f src/core/Dockerfile.openenv-base .
+# In CI, this can be overridden to use GHCR or other registries
+ARG OPENENV_BASE_IMAGE=openenv-base:latest
+FROM ${OPENENV_BASE_IMAGE}
+
+# Install Python dependencies needed by the Maze environment
+RUN pip install --no-cache-dir \
+    "numpy>=2.3.4" \
+    "matplotlib>=3.10.7"
+
+# Set working directory
+WORKDIR /app
+
+# Copy OpenEnv core (already in the base image; copied again to pick up local changes)
+COPY src/core/ /app/src/core/
+
+# Copy Maze environment
+COPY src/envs/maze_env/ /app/src/envs/maze_env/
+
+# Copy README for web interface documentation
+COPY src/envs/maze_env/README.md /app/README.md
+
+# Set the Python path for OpenEnv
+# (same as the base image default of PYTHONPATH=/app/src)
+ENV PYTHONPATH=/app/src
+
+
+# Health check (curl provided by openenv-base)
+HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \
+    CMD curl -f http://localhost:8000/health || exit 1
+
+# Expose default port
+EXPOSE 8000
+
+# Run the FastAPI server (uvicorn installed by openenv-base)
+CMD ["uvicorn", "envs.maze_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
diff --git a/src/envs/maze_env/server/__init__.py b/src/envs/maze_env/server/__init__.py
new file mode 100644
index 00000000..1fca47db
--- /dev/null
+++ b/src/envs/maze_env/server/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""Server-side implementation for Maze environments."""
+from .maze import Maze, Status
+from .maze_environment import MazeEnvironment
+
+__all__ = ["Maze", "MazeEnvironment", "Status"]
diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py
new file mode 100644
index 00000000..d81ed695
--- /dev/null
+++ b/src/envs/maze_env/server/app.py
@@ -0,0 +1,44 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+FastAPI application for the Maze Environment.
+
+This module creates an HTTP server that exposes the Maze game
+over HTTP endpoints, making it compatible with HTTPEnvClient.
+ +Usage: + # Development (with auto-reload): + uvicorn envs.maze_env.server.app:app --reload --host 0.0.0.0 --port 8000 + + # Production: + uvicorn envs.maze_env.server.app:app --host 0.0.0.0 --port 8000 --workers 4 + + # Or run directly: + python -m envs.maze_env.server.app + +Variables: + maze: np.array - Maze as a numpy array +""" + +from core.env_server import create_app +from ..models import MazeAction, MazeObservation +from .maze_environment import MazeEnvironment +from .mazearray import maze + +# Get game configuration from environment variables + +# Create the environment instance +env = MazeEnvironment(maze_array=maze, start_cell=(0, 0), exit_cell=(7, 7)) + +# Create the FastAPI app with web interface and README integration +app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py new file mode 100644 index 00000000..1385654f --- /dev/null +++ b/src/envs/maze_env/server/maze.py @@ -0,0 +1,395 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Derived from https://github.com/erikdelange/Reinforcement-Learning-Maze/blob/master/main.py (MIT LICENSE) +# Original Author: Erik de Lange + +import logging +from enum import Enum, IntEnum + +import numpy as np + + +class Cell(IntEnum): + EMPTY = 0 # indicates empty cell where the agent can move to + OCCUPIED = 1 # indicates cell which contains a wall and cannot be entered + CURRENT = 2 # indicates current cell of the agent + + +class Action(IntEnum): + MOVE_LEFT = 2 + MOVE_RIGHT = 3 + MOVE_UP = 0 + MOVE_DOWN = 1 + + +class Render(Enum): + NOTHING = 0 + TRAINING = 1 + MOVES = 2 + + +class Status(Enum): + WIN = 0 + LOSE = 1 + PLAYING = 2 + + +class Maze: + """A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. + + The layout of the maze and the rules how to move through it are called the environment. An agent is placed + at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every + action results in a reward or penalty which are accumulated during the game. Every move gives a small + penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into + a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The + game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means + winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is + assumed to wander around clueless and looses. + + A note on cell coordinates: + The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze. + This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze + itself is stored as a 2D numpy array so cells are accessed via [row, col]. 
To convert a (col, row) tuple + to (row, col) use (col, row)[::-1] + """ + + actions = [ + Action.MOVE_LEFT, + Action.MOVE_RIGHT, + Action.MOVE_UP, + Action.MOVE_DOWN, + ] # all possible actions + + reward_exit = 10.0 # reward for reaching the exit cell + penalty_move = ( + -0.05 + ) # penalty for a move which did not result in finding the exit cell + penalty_visited = -0.25 # penalty for returning to a cell which was visited earlier + penalty_impossible_move = ( + -0.75 + ) # penalty for trying to enter an occupied cell or moving out of the maze + + def __init__(self, maze, start_cell=(0, 0), exit_cell=None): + """Create a new maze game. + + :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) + :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) + :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) + """ + self.maze = maze + + self.__minimum_reward = ( + -0.5 * self.maze.size + ) # stop game if accumulated reward is below this threshold + + nrows, ncols = self.maze.shape + self.cells = [(col, row) for col in range(ncols) for row in range(nrows)] + self.empty = [ + (col, row) + for col in range(ncols) + for row in range(nrows) + if self.maze[row, col] == Cell.EMPTY + ] + self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell + self.empty.remove(self.__exit_cell) + + # Check for impossible maze layout + if self.__exit_cell not in self.cells: + raise Exception( + "Error: exit cell at {} is not inside maze".format(self.__exit_cell) + ) + if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED: + raise Exception( + "Error: exit cell at {} is not free".format(self.__exit_cell) + ) + + # Variables for rendering using Matplotlib + self.__render = Render.NOTHING # what to render + self.__ax1 = None # axes for rendering the moves + self.__ax2 = None # axes for rendering the best action per cell + + self.reset(start_cell) + + def reset(self, start_cell=(0, 0)): + """Reset the maze to its initial state and place the agent at start_cell. 
+ + :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) + :return: new state after reset + """ + if start_cell not in self.cells: + raise Exception( + "Error: start cell at {} is not inside maze".format(start_cell) + ) + if self.maze[start_cell[::-1]] == Cell.OCCUPIED: + raise Exception("Error: start cell at {} is not free".format(start_cell)) + if start_cell == self.__exit_cell: + raise Exception( + "Error: start- and exit cell cannot be the same {}".format(start_cell) + ) + + self.__previous_cell = self.__current_cell = start_cell + self.__total_reward = 0.0 # accumulated reward + self.__visited = set() # a set() only stores unique values + + if self.__render in (Render.TRAINING, Render.MOVES): + # render the maze + nrows, ncols = self.maze.shape + self.__ax1.clear() + self.__ax1.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax1.set_xticklabels([]) + self.__ax1.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax1.set_yticklabels([]) + self.__ax1.grid(True) + self.__ax1.plot( + *self.__current_cell, "rs", markersize=30 + ) # start is a big red square + self.__ax1.text( + *self.__current_cell, "Start", ha="center", va="center", color="white" + ) + self.__ax1.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax1.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) + self.__ax1.imshow(self.maze, cmap="binary") + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + return self.__observe() + + def __draw(self): + """Draw a line from the agents previous cell to its current cell.""" + self.__ax1.plot( + *zip(*[self.__previous_cell, self.__current_cell]), "bo-" + ) # previous cells are blue dots + self.__ax1.plot(*self.__current_cell, "ro") # current cell is a red dot + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + def step(self, action): + """Move the agent according to 'action' and return the new state, reward and game status. + + :param Action action: the agent will move in this direction + :return: state, reward, status + """ + reward = self.__execute(action) + self.__total_reward += reward + status = self.__status() + state = self.__observe() + logging.debug( + "action: {:10s} | reward: {: .2f} | status: {}".format( + Action(action).name, reward, status + ) + ) + return state, reward, status + + def __execute(self, action): + """Execute action and collect the reward or penalty. 
+ + :param Action action: direction in which the agent will move + :return float: reward or penalty which results from the action + """ + possible_actions = self.__possible_actions(self.__current_cell) + + if not possible_actions: + reward = ( + self.__minimum_reward - 1 + ) # cannot move anywhere, force end of game + elif action in possible_actions: + col, row = self.__current_cell + if action == Action.MOVE_LEFT: + col -= 1 + elif action == Action.MOVE_UP: + row -= 1 + if action == Action.MOVE_RIGHT: + col += 1 + elif action == Action.MOVE_DOWN: + row += 1 + + self.__previous_cell = self.__current_cell + self.__current_cell = (col, row) + + if self.__render != Render.NOTHING: + self.__draw() + + if self.__current_cell == self.__exit_cell: + reward = Maze.reward_exit # maximum reward when reaching the exit cell + elif self.__current_cell in self.__visited: + reward = ( + Maze.penalty_visited + ) # penalty when returning to a cell which was visited earlier + else: + reward = ( + Maze.penalty_move + ) # penalty for a move which did not result in finding the exit cell + + self.__visited.add(self.__current_cell) + else: + reward = ( + Maze.penalty_impossible_move + ) # penalty for trying to enter an occupied cell or move out of the maze + + return reward + + def __possible_actions(self, cell=None): + """Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. + + :param tuple cell: location of the agent (optional, else use current cell) + :return list: all possible actions + """ + if cell is None: + col, row = self.__current_cell + else: + col, row = cell + + possible_actions = Maze.actions.copy() # initially allow all + + # now restrict the initial list by removing impossible actions + nrows, ncols = self.maze.shape + if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_UP) + if row == nrows - 1 or ( + row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED + ): + possible_actions.remove(Action.MOVE_DOWN) + + if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_LEFT) + if col == ncols - 1 or ( + col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED + ): + possible_actions.remove(Action.MOVE_RIGHT) + + return possible_actions + + def __status(self): + """Return the game status. + + :return Status: current game status (WIN, LOSE, PLAYING) + """ + if self.__current_cell == self.__exit_cell: + return Status.WIN + + if ( + self.__total_reward < self.__minimum_reward + ): # force end of game after too much loss + return Status.LOSE + + return Status.PLAYING + + def __observe(self): + """Return the state of the maze - in this game the agents current location. + + :return numpy.array [1][2]: agents current location + """ + return np.array([[*self.__current_cell]]) + + def play(self, model, start_cell=(0, 0)): + """Play a single game, choosing the next move based a prediction from 'model'. 
+ + :param class AbstractModel model: the prediction model to use + :param tuple start_cell: agents initial cell (optional, else upper left) + :return Status: WIN, LOSE + """ + self.reset(start_cell) + + state = self.__observe() + + while True: + action = model.predict(state=state) + state, reward, status = self.step(action) + if status in (Status.WIN, Status.LOSE): + return status + + def check_win_all(self, model): + """Check if the model wins from all possible starting cells.""" + previous = self.__render + self.__render = ( + Render.NOTHING + ) # avoid rendering anything during execution of the check games + + win = 0 + lose = 0 + + for cell in self.empty: + if self.play(model, cell) == Status.WIN: + win += 1 + else: + lose += 1 + + self.__render = previous # restore previous rendering setting + + logging.info( + "won: {} | lost: {} | win rate: {:.5f}".format( + win, lose, win / (win + lose) + ) + ) + + result = True if lose == 0 else False + + return result, win / (win + lose) + + def render_q(self, model): + """Render the recommended action(s) for each cell as provided by 'model'. + + :param class AbstractModel model: the prediction model to use + """ + + def clip(n): + return max(min(1, n), 0) + + if self.__render == Render.TRAINING: + nrows, ncols = self.maze.shape + + self.__ax2.clear() + self.__ax2.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax2.set_xticklabels([]) + self.__ax2.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax2.set_yticklabels([]) + self.__ax2.grid(True) + self.__ax2.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax2.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) + + for cell in self.empty: + q = model.q(cell) if model is not None else [0, 0, 0, 0] + a = np.nonzero(q == np.max(q))[0] + + for action in a: + dx = 0 + dy = 0 + if action == Action.MOVE_LEFT: + dx = -0.2 + if action == Action.MOVE_RIGHT: + dx = +0.2 + if action == Action.MOVE_UP: + dy = -0.2 + if action == Action.MOVE_DOWN: + dy = 0.2 + + # color (from red to green) represents the certainty of the preferred action(s) + maxv = 1 + minv = -1 + color = clip( + (q[action] - minv) / (maxv - minv) + ) # normalize in [-1, 1] + + self.__ax2.arrow( + *cell, + dx, + dy, + color=(1 - color, color, 0), + head_width=0.2, + head_length=0.1, + ) + + self.__ax2.imshow(self.maze, cmap="binary") + self.__ax2.get_figure().canvas.draw() diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py new file mode 100644 index 00000000..b9675bcf --- /dev/null +++ b/src/envs/maze_env/server/maze_environment.py @@ -0,0 +1,198 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Maze Environment Server Implementation. + +This module wraps Maze's environment and exposes it +via the OpenEnv Environment interface. +""" + +from typing import List, Tuple, Optional +from core.env_server import Environment +from .maze import Maze +from ..models import MazeAction, MazeObservation, MazeState + +try: + import numpy as np +except ImportError as e: + raise ImportError( + "Numpy is not installed. " + "Please install it following instructions at: " + "pip install numpy" + ) from e + + +class MazeEnvironment(Environment): + """ + Maze Environment wrapper for OpenEnv. + + This environment wraps Maze game and provides a single-agent interface. 
+ + Args: + maze_array: Maze array as numpy array + start cell: Start of the maze + exit_cell: Exit for the maze + """ + + def __init__( + self, + maze_array: np.ndarray, + start_cell: Tuple[int, int] = (0, 0), + exit_cell: Optional[Tuple[int, int]] = (7, 7), + ): + # Create underlying Maze instance (matches your working code) + self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) + self.total_reward = 0 + self.start_cell = start_cell + self.exit_cell = exit_cell + # env.reset() will be called in reset(); state initialized to None until then + self.state: Optional[MazeState] = None + + def reset(self) -> MazeObservation: + """Reset environment and return initial observation (MazeObservation).""" + observation = ( + self.env.reset() + ) # typically returns np.array([row, col]) or similar + # initialize episode state + self.state = MazeState(episode_id="episode_1", step_count=0, done=False) + + # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields + pos_list = ( + observation.tolist() + if hasattr(observation, "tolist") + else list(observation) + ) + self.total_reward = 0 + legal_actions = self._compute_legal_actions(pos_list[0]) + + return MazeObservation( + position=pos_list, + total_reward=self.total_reward, + legal_actions=legal_actions, + ) + + def step(self, action: MazeAction) -> MazeObservation: + """ + Step function that manipulates the maze position grid + and applies rewards/penalties for movement outcomes. + """ + + # --- Get current position --- + if hasattr(self.env, "agent_position"): + row, col = self.env.agent_position + elif hasattr(self.env, "_Maze__current_cell"): + row, col = self.env._Maze__current_cell + else: + row, col = self.env._Maze__start_cell + + maze = np.array(self.env.maze) + + # --- Define movement directions --- + # 0 = UP, 1 = DOWN, 2 = LEFT, 3 = RIGHT + move_map = { + 0: (-1, 0), + 1: (1, 0), + 2: (0, -1), + 3: (0, 1), + } + + # --- Reward settings --- + reward_exit = 10.0 # reward for reaching the exit cell + reward_move = 0.05 # reward for a move that didn't find the exit but is valid + penalty_visited = -0.25 # penalty for revisiting a cell + penalty_impossible = -0.75 # penalty for invalid move (wall/outside) + + dr, dc = move_map.get(action.action, (0, 0)) + new_r, new_c = row + dr, col + dc + + # Keep track of visited cells + if not hasattr(self, "_visited"): + self._visited = set() + self._visited.add((row, col)) + + # --- Check if move is valid --- + valid_move = ( + 0 <= new_r < maze.shape[0] + and 0 <= new_c < maze.shape[1] + and maze[new_r, new_c] != 1 + ) + + reward = 0.0 + done = False + + if valid_move: + # Update position + row, col = new_r, new_c + + if self.exit_cell and (row, col) == self.exit_cell: + reward += reward_exit + done = True + self._visited = set() + elif (row, col) in self._visited: + reward += penalty_visited + else: + reward += reward_move + else: + # Invalid move + reward += penalty_impossible + + # --- Update environment position --- + if hasattr(self.env, "agent_position"): + self.env.agent_position = (row, col) + elif hasattr(self.env, "_Maze__current_cell"): + self.env._Maze__current_cell = (row, col) + + # --- Total reward update --- + self.total_reward += reward + + # --- Update state --- + if self.state is None: + self.state = MazeState(episode_id="episode_1", step_count=0, done=done) + self.state.step_count += 1 + self.state.done = done + + # --- Observation --- + pos_list = [row, col] + legal_actions = self._compute_legal_actions(pos_list) + # --- Return 
observation --- + return MazeObservation( + position=pos_list, + total_reward=self.total_reward, + legal_actions=legal_actions, + done=done, + ) + + def state(self) -> Optional[MazeState]: + """Return the current MazeState object.""" + return self.state + + def _compute_legal_actions(self, pos: List[int]) -> List[int]: + """ + Compute which actions are legal given the current normalized position [row, col]. + (0=UP, 1=DOWN, 2=LEFT, 3=RIGHT) + """ + actions: List[int] = [] + if not pos or len(pos) < 2: + return actions + + row, col = int(pos[0]), int(pos[1]) + nrows, ncols = self.env.maze.shape + + # UP + if row > 0 and self.env.maze[row - 1, col] == 0: + actions.append(0) + # DOWN + if row < nrows - 1 and self.env.maze[row + 1, col] == 0: + actions.append(1) + # LEFT + if col > 0 and self.env.maze[row, col - 1] == 0: + actions.append(2) + # RIGHT + if col < ncols - 1 and self.env.maze[row, col + 1] == 0: + actions.append(3) + + return actions diff --git a/src/envs/maze_env/server/mazearray.py b/src/envs/maze_env/server/mazearray.py new file mode 100644 index 00000000..3cd7dbd6 --- /dev/null +++ b/src/envs/maze_env/server/mazearray.py @@ -0,0 +1,15 @@ +import numpy as np + +# Maze +maze = np.array( + [ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + ] +)
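As a quick end-to-end check of the environment added above, a random-policy rollout against the running server can look roughly like the sketch below. This is a minimal sketch, not part of the patch: it assumes the server from `src/envs/maze_env/server/app.py` is reachable at `http://localhost:8000`, and the file location under `examples/` is hypothetical.

```python
# Random-policy rollout against the maze server (sketch; assumes the server
# started via `python -m envs.maze_env.server.app` is listening on port 8000).
import random
import sys
from pathlib import Path

# Hypothetical location: examples/maze_random.py, so add src/ to the path
# the same way the other examples do.
sys.path.insert(0, str(Path(__file__).parent.parent / "src"))

from envs.maze_env import MazeEnv, MazeAction

env = MazeEnv(base_url="http://localhost:8000")
try:
    result = env.reset()
    steps = 0
    while not result.done and steps < 50:
        if not result.observation.legal_actions:
            break  # no legal move from the current cell
        # Pick uniformly among the legal actions reported by the server.
        action_id = random.choice(result.observation.legal_actions)
        result = env.step(MazeAction(action=action_id))
        steps += 1
    print(f"Finished after {steps} steps, total reward: {result.observation.total_reward}")
finally:
    env.close()
```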