From 2bc12df23847d55ed82377ddbe6bae16c26d5a45 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:00:28 +0530 Subject: [PATCH 01/14] Add maze environment and example --- examples/maze_simple.py | 101 ++++++ src/envs/maze_env/README.md | 123 +++++++ src/envs/maze_env/__init__.py | 16 + src/envs/maze_env/client.py | 85 +++++ src/envs/maze_env/models.py | 37 ++ src/envs/maze_env/server/Dockerfile | 38 ++ src/envs/maze_env/server/__init__.py | 11 + src/envs/maze_env/server/app.py | 44 +++ src/envs/maze_env/server/maze.py | 351 +++++++++++++++++++ src/envs/maze_env/server/maze_environment.py | 160 +++++++++ src/envs/maze_env/server/mazearray.py | 13 + 11 files changed, 979 insertions(+) create mode 100644 examples/maze_simple.py create mode 100644 src/envs/maze_env/README.md create mode 100644 src/envs/maze_env/__init__.py create mode 100644 src/envs/maze_env/client.py create mode 100644 src/envs/maze_env/models.py create mode 100644 src/envs/maze_env/server/Dockerfile create mode 100644 src/envs/maze_env/server/__init__.py create mode 100644 src/envs/maze_env/server/app.py create mode 100644 src/envs/maze_env/server/maze.py create mode 100644 src/envs/maze_env/server/maze_environment.py create mode 100644 src/envs/maze_env/server/mazearray.py diff --git a/examples/maze_simple.py b/examples/maze_simple.py new file mode 100644 index 00000000..c3f27d91 --- /dev/null +++ b/examples/maze_simple.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Simple example of using Maze environment with OpenEnv. + +This demonstrates: +1. Connecting to the Maze environment server +2. Resetting the environment +3. Taking actions +4. Observing rewards +5. 
Inspecting environment state + +Usage: + python examples/maze_simple.py +""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +import numpy as np +from envs.maze_env import MazeEnv, MazeAction + + +def main(): + print("🧩 Simple Maze Environment Example") + print("=" * 60) + + # Connect to environment server + # Ensure server is running: python -m envs.maze_env.server.app + env = MazeEnv(base_url="http://localhost:8000") + maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] + ]) + try: + # Reset environment + print("\nšŸ“ Resetting environment...") + result = env.reset() + + print(f" Initial position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + + # Run one episode + print("\n🚶 Navigating through maze...") + step = 0 + total_reward = 0 + + while not result.done and step < 20: + # Choose random legal action + print(f" Current position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + env.render_ascii_maze(maze,result.observation.position,[0,0],[maze.shape[0],maze.shape[1]]) + action_id = result.observation.legal_actions[step % len(result.observation.legal_actions)] + # Take action + result = env.step(MazeAction(action=action_id)) + + reward = result.reward or 0 + total_reward += reward + + print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") + step += 1 + print("-----------------------------------------------------") + + print(f"\nāœ… Episode finished!") + print(f" Total steps: {step}") + print(f" Total reward: {total_reward}") + + # Get environment state + state = env.state() + print(f"\nšŸ“Š Environment State:") + print(f" Episode ID: {state.episode_id}") + print(f" Step count: {state.step_count}") + print(f" Done: {state.done}") + + except Exception as e: + print(f"\nāŒ Error: {e}") + print("\nMake sure the server is running:") + print(" python -m envs.maze_env.server.app") + print("\nOr start with Docker:") + print(" docker run -p 8000:8000 maze-env:latest") + + finally: + env.close() + print("\nšŸ‘‹ Done!") + + +if __name__ == "__main__": + main() diff --git a/src/envs/maze_env/README.md b/src/envs/maze_env/README.md new file mode 100644 index 00000000..c2b4e5cd --- /dev/null +++ b/src/envs/maze_env/README.md @@ -0,0 +1,123 @@ +# Maze Environment + +Integration of Maze game with the OpenEnv framework. 
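+
+The maze is a 2D grid in which `0` marks a free cell and `1` marks a wall; by default the agent starts in
+the upper-left corner and has to reach the exit in the lower-right corner. Actions are plain integers
+(`0` = up, `1` = down, `2` = left, `3` = right), and every observation reports the agent's position, the
+reward accumulated so far and the actions that are legal from the current cell. The snippet below is a
+minimal sketch of that round trip; it assumes a server is already listening on `localhost:8000`.
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+env = MazeEnv(base_url="http://localhost:8000")
+result = env.reset()
+
+# Observation fields returned by the server
+print(result.observation.position)       # the agent's current cell
+print(result.observation.legal_actions)  # e.g. [1]: only "down" is open from the default start cell
+
+# Step once using the first legal action (0 = up, 1 = down, 2 = left, 3 = right)
+result = env.step(MazeAction(action=result.observation.legal_actions[0]))
+print(result.reward, result.done)
+
+env.close()
+```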
+
+## Architecture
+
+```
+ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”
+│   RL Training Code (Client)          │
+│   MazeEnv.step(action)               │
+ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜
+                  │ HTTP
+ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”
+│   FastAPI Server (Docker)            │
+│   MazeEnvironment                    │
+│   ā”œā”€ Wraps Maze environment          │
+│   └─ Agent controls player           │
+ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜
+```
+
+## Installation & Usage
+
+### Option 1: Local Development (without Docker)
+
+**Requirements:**
+- Python 3.11+
+- NumPy
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+# Start local server manually
+# python -m envs.maze_env.server.app
+
+# Connect to local server
+env = MazeEnv(base_url="http://localhost:8000")
+
+# Reset environment
+result = env.reset()
+print(f"Initial position: {result.observation.position}")
+print(f"Legal actions: {result.observation.legal_actions}")
+
+# Take actions
+for _ in range(10):
+    action_id = result.observation.legal_actions[0]  # Choose first legal action
+    result = env.step(MazeAction(action=action_id))
+    print(f"Reward: {result.reward}, Done: {result.done}")
+    if result.done:
+        break
+
+# Cleanup
+env.close()
+```
+
+### Option 2: Docker (Recommended)
+
+**Build Docker image:**
+
+```bash
+cd OpenEnv
+docker build -f src/envs/maze_env/server/Dockerfile -t maze-env:latest .
+```
+
+**Use with from_docker_image():**
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+# Automatically starts container
+env = MazeEnv.from_docker_image("maze-env:latest")
+
+result = env.reset()
+result = env.step(MazeAction(action=0))
+
+env.close()  # Stops container
+```
+
+## Configuration
+
+### Variables
+
+- `maze`: the maze layout, defined as a NumPy array in `mazearray.py` (0 = free cell, 1 = wall)
+
+### Example
+
+```bash
+docker run -p 8000:8000 maze-env:latest
+```
+
+## API Reference
+
+### MazeAction
+
+```python
+@dataclass
+class MazeAction(Action):
+    action: int  # Action to take (0 = up, 1 = down, 2 = left, 3 = right)
+```
+
+### MazeObservation
+
+```python
+@dataclass
+class MazeObservation(Observation):
+    position: List[int]  # [row, col]
+    total_reward: float  # Total reward accumulated during the episode
+    legal_actions: List[int] = field(default_factory=list)  # Legal actions from the current position
+```
+
+### MazeState
+
+```python
+@dataclass
+class MazeState(State):
+    episode_id: str  # Episode identifier
+    step_count: int  # Number of steps taken
+    done: bool = False  # Whether the maze has been solved
+```
+
+## References
+
+- [Maze Environment](https://github.com/erikdelange/Reinforcement-Learning-Maze)
diff --git a/src/envs/maze_env/__init__.py b/src/envs/maze_env/__init__.py
new file mode 100644
index 00000000..0c2c79f7
--- /dev/null
+++ b/src/envs/maze_env/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Maze Environment Integration.
+
+This module provides integration between the Maze game and the OpenEnv framework.
+""" + +from .client import MazeEnv +from .models import MazeAction, MazeObservation, MazeState + +__all__ = ["MazeEnv", "MazeAction", "MazeObservation", "MazeState"] diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py new file mode 100644 index 00000000..81188562 --- /dev/null +++ b/src/envs/maze_env/client.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +MazeEnv HTTP Client. + +This module provides the client for connecting to a Maze Environment server +over HTTP. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, TYPE_CHECKING + +from core.client_types import StepResult +from core.http_env_client import HTTPEnvClient + +from .models import MazeAction, MazeObservation, MazeState + +if TYPE_CHECKING: + from core.containers.runtime import ContainerProvider + +class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): + """HTTP client for Maze Environment.""" + + def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: List[int], goal: List[int]) -> None: + """ + Render the maze grid as ASCII art in the terminal. + - 0 = free cell + - 1 = wall + - S = start + - G = goal + - P = player + - E = exit + """ + print("\nCurrent Maze State:") + rows, cols = len(maze), len(maze[0]) + for r in range(rows): + line = "" + for c in range(cols): + if [r, c] == position: + line += "P " + elif [r, c] == start: + line += "S " + elif [r, c] == goal: + line += "G " + elif maze[r][c] == 1: + line += "ā–ˆ " + elif r == rows-1 and c == cols-1: + line+= "E " + else: + line += ". " + print(line) + print() + + def _step_payload(self, action: MazeAction) -> Dict[str, Any]: + """Prepare payload to send to the environment server.""" + return {"action": action.action} + + def _parse_result(self, payload: Dict[str, Any]) -> StepResult[MazeObservation]: + """Parse the response from the server into MazeObservation + reward/done.""" + obs_data = payload.get("observation", {}) + + observation = MazeObservation( + position=obs_data.get("position", []), + total_reward=obs_data.get("total_reward", 0.0), + legal_actions=obs_data.get("legal_actions", []), + ) + + return StepResult( + observation=observation, + reward=payload.get("reward", 0.0), + done=payload.get("done", False), + ) + + def _parse_state(self, payload: Dict[str, Any]) -> MazeState: + """Parse environment state from payload.""" + return MazeState( + episode_id=payload.get("episode_id", ""), + step_count=payload.get("step_count", 0), + done=payload.get("done", False), + ) \ No newline at end of file diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py new file mode 100644 index 00000000..d642d305 --- /dev/null +++ b/src/envs/maze_env/models.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Data models for Maze Environment. + +This module defines the Action, Observation, and State types for Maze games. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pydantic import Field +from typing import Any, Dict, List, Optional, Tuple, Literal + +from core.env_server import Action, Observation, State + + +@dataclass +class MazeAction(Action): + action: int + + +@dataclass +class MazeObservation(Observation): + position: List[int] # [row, col] + total_reward: float + legal_actions: List[int] = field(default_factory=list) + +@dataclass +class MazeState(State): + episode_id: str + step_count: int + done: bool = False diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile new file mode 100644 index 00000000..3000b570 --- /dev/null +++ b/src/envs/maze_env/server/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Use the pre-built OpenEnv base image +# Built from: docker build -t openenv-base:latest -f src/core/Dockerfile.openenv-base . +# In CI, this can be overridden to use GHCR or other registries +ARG OPENENV_BASE_IMAGE=openenv-base:latest +FROM ${OPENENV_BASE_IMAGE} + +# Set working directory +WORKDIR /app + +# Copy OpenEnv core (already expected in base image but ensure updated) +COPY src/core/ /app/src/core/ + +# Copy Maze environment +COPY src/envs/maze_env/ /app/src/envs/maze_env/ + +# Copy README for web interface documentation +COPY src/envs/maze_env/README.md /app/README.md + +# Extend Python path for OpenEnv (base image sets PYTHONPATH=/app/src) +# We prepend Maze paths +ENV PYTHONPATH=/repo:/repo/build/python:/app/src + + +# Health check (curl provided by openenv-base) +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Expose default port +EXPOSE 8000 + +# Run the FastAPI server (uvicorn installed by openenv-base) +CMD ["uvicorn", "envs.maze_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/src/envs/maze_env/server/__init__.py b/src/envs/maze_env/server/__init__.py new file mode 100644 index 00000000..f3cfcf4a --- /dev/null +++ b/src/envs/maze_env/server/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Server-side implementation for Maze environments.""" +from .maze import Maze, Status +from .maze_environment import MazeEnvironment + +__all__ = ["Maze","MazeEnvironment","Status"] \ No newline at end of file diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py new file mode 100644 index 00000000..0282cd6e --- /dev/null +++ b/src/envs/maze_env/server/app.py @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +FastAPI application for the Maze Environment. + +This module creates an HTTP server that exposes Maze game +over HTTP endpoints, making them compatible with HTTPEnvClient. 
+ +Usage: + # Development (with auto-reload): + uvicorn envs.maze_env.server.app:app --reload --host 0.0.0.0 --port 8000 + + # Production: + uvicorn envs.maze_env.server.app:app --host 0.0.0.0 --port 8000 --workers 4 + + # Or run directly: + python -m envs.maze_env.server.app + +Variables: + maze: np.array - Maze as a numpy array +""" + +from core.env_server import create_app +import numpy as np +from ..models import MazeAction, MazeObservation +from .maze_environment import MazeEnvironment +from .mazearray import maze +# Get game configuration from environment variables + +# Create the environment instance +env = MazeEnvironment(maze_array=maze) + +# Create the FastAPI app with web interface and README integration +app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py new file mode 100644 index 00000000..f9dddaaa --- /dev/null +++ b/src/envs/maze_env/server/maze.py @@ -0,0 +1,351 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Derived from https://github.com/erikdelange/Reinforcement-Learning-Maze/blob/master/main.py (MIT LICENSE) +# Original Author: Erik de Lange + +import logging +from enum import Enum, IntEnum + +import matplotlib.pyplot as plt +import numpy as np + + +class Cell(IntEnum): + EMPTY = 0 # indicates empty cell where the agent can move to + OCCUPIED = 1 # indicates cell which contains a wall and cannot be entered + CURRENT = 2 # indicates current cell of the agent + + +class Action(IntEnum): + MOVE_LEFT = 2 + MOVE_RIGHT = 3 + MOVE_UP = 0 + MOVE_DOWN = 1 + + +class Render(Enum): + NOTHING = 0 + TRAINING = 1 + MOVES = 2 + + +class Status(Enum): + WIN = 0 + LOSE = 1 + PLAYING = 2 + + +class Maze: + """ A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. + + The layout of the maze and the rules how to move through it are called the environment. An agent is placed + at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every + action results in a reward or penalty which are accumulated during the game. Every move gives a small + penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into + a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The + game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means + winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is + assumed to wander around clueless and looses. + + A note on cell coordinates: + The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze. + This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze + itself is stored as a 2D numpy array so cells are accessed via [row, col]. 
To convert a (col, row) tuple + to (row, col) use (col, row)[::-1] + """ + actions = [Action.MOVE_LEFT, Action.MOVE_RIGHT, Action.MOVE_UP, Action.MOVE_DOWN] # all possible actions + + reward_exit = 10.0 # reward for reaching the exit cell + penalty_move = -0.05 # penalty for a move which did not result in finding the exit cell + penalty_visited = -0.25 # penalty for returning to a cell which was visited earlier + penalty_impossible_move = -0.75 # penalty for trying to enter an occupied cell or moving out of the maze + + def __init__(self, maze, start_cell=(0, 0), exit_cell=None): + """ Create a new maze game. + + :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) + :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) + :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) + """ + self.maze = maze + + self.__minimum_reward = -0.5 * self.maze.size # stop game if accumulated reward is below this threshold + + nrows, ncols = self.maze.shape + self.cells = [(col, row) for col in range(ncols) for row in range(nrows)] + self.empty = [(col, row) for col in range(ncols) for row in range(nrows) if self.maze[row, col] == Cell.EMPTY] + self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell + self.empty.remove(self.__exit_cell) + + # Check for impossible maze layout + if self.__exit_cell not in self.cells: + raise Exception("Error: exit cell at {} is not inside maze".format(self.__exit_cell)) + if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED: + raise Exception("Error: exit cell at {} is not free".format(self.__exit_cell)) + + # Variables for rendering using Matplotlib + self.__render = Render.NOTHING # what to render + self.__ax1 = None # axes for rendering the moves + self.__ax2 = None # axes for rendering the best action per cell + + self.reset(start_cell) + + def reset(self, start_cell=(0, 0)): + """ Reset the maze to its initial state and place the agent at start_cell. 
+ + :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) + :return: new state after reset + """ + if start_cell not in self.cells: + raise Exception("Error: start cell at {} is not inside maze".format(start_cell)) + if self.maze[start_cell[::-1]] == Cell.OCCUPIED: + raise Exception("Error: start cell at {} is not free".format(start_cell)) + if start_cell == self.__exit_cell: + raise Exception("Error: start- and exit cell cannot be the same {}".format(start_cell)) + + self.__previous_cell = self.__current_cell = start_cell + self.__total_reward = 0.0 # accumulated reward + self.__visited = set() # a set() only stores unique values + + if self.__render in (Render.TRAINING, Render.MOVES): + # render the maze + nrows, ncols = self.maze.shape + self.__ax1.clear() + self.__ax1.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax1.set_xticklabels([]) + self.__ax1.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax1.set_yticklabels([]) + self.__ax1.grid(True) + self.__ax1.plot(*self.__current_cell, "rs", markersize=30) # start is a big red square + self.__ax1.text(*self.__current_cell, "Start", ha="center", va="center", color="white") + self.__ax1.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square + self.__ax1.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax1.imshow(self.maze, cmap="binary") + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + return self.__observe() + + def __draw(self): + """ Draw a line from the agents previous cell to its current cell. """ + self.__ax1.plot(*zip(*[self.__previous_cell, self.__current_cell]), "bo-") # previous cells are blue dots + self.__ax1.plot(*self.__current_cell, "ro") # current cell is a red dot + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + def render(self, content=Render.NOTHING): + """ Record what will be rendered during play and/or training. + + :param Render content: NOTHING, TRAINING, MOVES + """ + self.__render = content + + if self.__render == Render.NOTHING: + if self.__ax1: + self.__ax1.get_figure().close() + self.__ax1 = None + if self.__ax2: + self.__ax2.get_figure().close() + self.__ax2 = None + if self.__render == Render.TRAINING: + if self.__ax2 is None: + fig, self.__ax2 = plt.subplots(1, 1, tight_layout=True) + fig.canvas.set_window_title("Best move") + self.__ax2.set_axis_off() + self.render_q(None) + if self.__render in (Render.MOVES, Render.TRAINING): + if self.__ax1 is None: + fig, self.__ax1 = plt.subplots(1, 1, tight_layout=True) + fig.canvas.set_window_title("Maze") + + plt.show(block=False) + + def step(self, action): + """ Move the agent according to 'action' and return the new state, reward and game status. + + :param Action action: the agent will move in this direction + :return: state, reward, status + """ + reward = self.__execute(action) + self.__total_reward += reward + status = self.__status() + state = self.__observe() + logging.debug("action: {:10s} | reward: {: .2f} | status: {}".format(Action(action).name, reward, status)) + return state, reward, status + + def __execute(self, action): + """ Execute action and collect the reward or penalty. 
+ + :param Action action: direction in which the agent will move + :return float: reward or penalty which results from the action + """ + possible_actions = self.__possible_actions(self.__current_cell) + + if not possible_actions: + reward = self.__minimum_reward - 1 # cannot move anywhere, force end of game + elif action in possible_actions: + col, row = self.__current_cell + if action == Action.MOVE_LEFT: + col -= 1 + elif action == Action.MOVE_UP: + row -= 1 + if action == Action.MOVE_RIGHT: + col += 1 + elif action == Action.MOVE_DOWN: + row += 1 + + self.__previous_cell = self.__current_cell + self.__current_cell = (col, row) + + if self.__render != Render.NOTHING: + self.__draw() + + if self.__current_cell == self.__exit_cell: + reward = Maze.reward_exit # maximum reward when reaching the exit cell + elif self.__current_cell in self.__visited: + reward = Maze.penalty_visited # penalty when returning to a cell which was visited earlier + else: + reward = Maze.penalty_move # penalty for a move which did not result in finding the exit cell + + self.__visited.add(self.__current_cell) + else: + reward = Maze.penalty_impossible_move # penalty for trying to enter an occupied cell or move out of the maze + + return reward + + def __possible_actions(self, cell=None): + """ Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. + + :param tuple cell: location of the agent (optional, else use current cell) + :return list: all possible actions + """ + if cell is None: + col, row = self.__current_cell + else: + col, row = cell + + possible_actions = Maze.actions.copy() # initially allow all + + # now restrict the initial list by removing impossible actions + nrows, ncols = self.maze.shape + if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_UP) + if row == nrows - 1 or (row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_DOWN) + + if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_LEFT) + if col == ncols - 1 or (col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_RIGHT) + + return possible_actions + + def __status(self): + """ Return the game status. + + :return Status: current game status (WIN, LOSE, PLAYING) + """ + if self.__current_cell == self.__exit_cell: + return Status.WIN + + if self.__total_reward < self.__minimum_reward: # force end of game after too much loss + return Status.LOSE + + return Status.PLAYING + + def __observe(self): + """ Return the state of the maze - in this game the agents current location. + + :return numpy.array [1][2]: agents current location + """ + return np.array([[*self.__current_cell]]) + + def play(self, model, start_cell=(0, 0)): + """ Play a single game, choosing the next move based a prediction from 'model'. + + :param class AbstractModel model: the prediction model to use + :param tuple start_cell: agents initial cell (optional, else upper left) + :return Status: WIN, LOSE + """ + self.reset(start_cell) + + state = self.__observe() + + while True: + action = model.predict(state=state) + state, reward, status = self.step(action) + if status in (Status.WIN, Status.LOSE): + return status + + def check_win_all(self, model): + """ Check if the model wins from all possible starting cells. 
""" + previous = self.__render + self.__render = Render.NOTHING # avoid rendering anything during execution of the check games + + win = 0 + lose = 0 + + for cell in self.empty: + if self.play(model, cell) == Status.WIN: + win += 1 + else: + lose += 1 + + self.__render = previous # restore previous rendering setting + + logging.info("won: {} | lost: {} | win rate: {:.5f}".format(win, lose, win / (win + lose))) + + result = True if lose == 0 else False + + return result, win / (win + lose) + + def render_q(self, model): + """ Render the recommended action(s) for each cell as provided by 'model'. + + :param class AbstractModel model: the prediction model to use + """ + + def clip(n): + return max(min(1, n), 0) + + if self.__render == Render.TRAINING: + nrows, ncols = self.maze.shape + + self.__ax2.clear() + self.__ax2.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax2.set_xticklabels([]) + self.__ax2.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax2.set_yticklabels([]) + self.__ax2.grid(True) + self.__ax2.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square + self.__ax2.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + + for cell in self.empty: + q = model.q(cell) if model is not None else [0, 0, 0, 0] + a = np.nonzero(q == np.max(q))[0] + + for action in a: + dx = 0 + dy = 0 + if action == Action.MOVE_LEFT: + dx = -0.2 + if action == Action.MOVE_RIGHT: + dx = +0.2 + if action == Action.MOVE_UP: + dy = -0.2 + if action == Action.MOVE_DOWN: + dy = 0.2 + + # color (from red to green) represents the certainty of the preferred action(s) + maxv = 1 + minv = -1 + color = clip((q[action] - minv) / (maxv - minv)) # normalize in [-1, 1] + + self.__ax2.arrow(*cell, dx, dy, color=(1 - color, color, 0), head_width=0.2, head_length=0.1) + + self.__ax2.imshow(self.maze, cmap="binary") + self.__ax2.get_figure().canvas.draw() \ No newline at end of file diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py new file mode 100644 index 00000000..e9560bb1 --- /dev/null +++ b/src/envs/maze_env/server/maze_environment.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Maze Environment Server Implementation. + +This module wraps Maze's environment and exposes it +via the OpenEnv Environment interface. +""" + +from typing import Any, Dict, List, Tuple, Optional +from core.env_server import Action, Environment, Observation +from .maze import Maze, Status +from ..models import MazeAction, MazeObservation, MazeState + +try: + import numpy as np +except ImportError as e: + raise ImportError( + "Numpy is not installed. " + "Please install it following instructions at: " + "pip install numpy" + ) from e + + +class MazeEnvironment(Environment): + """ + Maze Environment wrapper for OpenEnv. + + This environment wraps Maze game and provides a single-agent interface. 
+ + Args: + maze_array: Maze array as numpy array + start cell: Start of the maze + exit_cell: Exit for the maze + """ + + def __init__( + self, + maze_array: np.ndarray, + start_cell: Tuple[int, int] = (0, 0), + exit_cell: Optional[Tuple[int, int]] = None, + ): + # Create underlying Maze instance (matches your working code) + self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) + + # env.reset() will be called in reset(); state initialized to None until then + self.state: Optional[MazeState] = None + + def reset(self) -> MazeObservation: + """Reset environment and return initial observation (MazeObservation).""" + observation = self.env.reset() # typically returns np.array([row, col]) or similar + # initialize episode state + self.state = MazeState(episode_id="episode_1", step_count=0, done=False) + + # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields + pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) + total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + legal_actions = self._compute_legal_actions(pos_list[0]) + + return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) + + def step(self, action: MazeAction) -> MazeObservation: + """ + Step function that directly manipulates the maze position grid + to ensure visible player movement. + """ + + # --- Get current position --- + if hasattr(self.env, "agent_position"): + row, col = self.env.agent_position + elif hasattr(self.env, "_Maze__current_cell"): + row, col = self.env._Maze__current_cell + else: + row, col = self.env._Maze__start_cell + + maze = np.array(self.env.maze) + + # --- Define movement directions --- + # 0 = UP, 1 = DOWN, 2 = LEFT, 3 = RIGHT + move_map = { + 0: (-1, 0), + 1: (1, 0), + 2: (0, -1), + 3: (0, 1), + } + + dr, dc = move_map.get(action.action, (0, 0)) + new_r, new_c = row + dr, col + dc + + # --- Check if move is within bounds and not a wall --- + if ( + 0 <= new_r < maze.shape[0] + and 0 <= new_c < maze.shape[1] + and maze[new_r, new_c] != 1 # assuming 1 = wall, 0 = free space + ): + row, col = new_r, new_c + + # --- Update environment position --- + if hasattr(self.env, "agent_position"): + self.env.agent_position = (row, col) + elif hasattr(self.env, "_Maze__current_cell"): + self.env._Maze__current_cell = (row, col) + + # --- Reward and done --- + total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + if hasattr(self.env, "_Maze__total_reward"): + self.env._Maze__total_reward = total_reward + 0.0 # change as needed + + exit_cell = getattr(self.env, "exit_cell", None) + done = exit_cell is not None and (row, col) == exit_cell + + # --- Update state --- + if self.state is None: + self.state = MazeState(episode_id="episode_1", step_count=0, done=done) + self.state.step_count += 1 + self.state.done = done + + pos_list = [row, col] + legal_actions = self._compute_legal_actions(pos_list) + + return MazeObservation( + position=pos_list, + total_reward=total_reward, + legal_actions=legal_actions, + ) + + def state(self) -> Optional[MazeState]: + """Return the current MazeState object.""" + return self.state + + def _compute_legal_actions(self, pos: List[int]) -> List[int]: + """ + Compute which actions are legal given the current normalized position [row, col]. 
+ (0=UP, 1=DOWN, 2=LEFT, 3=RIGHT) + """ + actions: List[int] = [] + if not pos or len(pos) < 2: + return actions + + row, col = int(pos[0]), int(pos[1]) + nrows, ncols = self.env.maze.shape + + # UP + if row > 0 and self.env.maze[row - 1, col] == 0: + actions.append(0) + # DOWN + if row < nrows - 1 and self.env.maze[row + 1, col] == 0: + actions.append(1) + # LEFT + if col > 0 and self.env.maze[row, col - 1] == 0: + actions.append(2) + # RIGHT + if col < ncols - 1 and self.env.maze[row, col + 1] == 0: + actions.append(3) + + return actions \ No newline at end of file diff --git a/src/envs/maze_env/server/mazearray.py b/src/envs/maze_env/server/mazearray.py new file mode 100644 index 00000000..b87935e2 --- /dev/null +++ b/src/envs/maze_env/server/mazearray.py @@ -0,0 +1,13 @@ +import numpy as np + +# Maze +maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] +]) \ No newline at end of file From e3eafcf12575a29cfd79c31a498cbf35430a0eaf Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:17:56 +0530 Subject: [PATCH 02/14] Add maze environment to workflow matrix --- .github/workflows/docker-build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 8cba8b47..b82da2a9 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -79,7 +79,8 @@ jobs: dockerfile: src/envs/atari_env/server/Dockerfile - name: git-env dockerfile: src/envs/git_env/server/Dockerfile - + - name: maze-env + dockerfile: src/envs/maze_env/server/Dockerfile steps: - name: Checkout code uses: actions/checkout@v4 From d45843465052682620144142b8e88b5883774b60 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:23:54 +0530 Subject: [PATCH 03/14] Add dependencies to Dockerfile --- src/envs/maze_env/server/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile index 3000b570..544d3e44 100644 --- a/src/envs/maze_env/server/Dockerfile +++ b/src/envs/maze_env/server/Dockerfile @@ -10,6 +10,11 @@ ARG OPENENV_BASE_IMAGE=openenv-base:latest FROM ${OPENENV_BASE_IMAGE} +# Install Python dependencies that all environments need +RUN pip install --no-cache-dir \ + numpy>=2.3.4 \ + matplotlib>=3.10.7 + # Set working directory WORKDIR /app From e9e1c0f2089f6794edf1abb8e333e06be45df50b Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 03:58:52 +0530 Subject: [PATCH 04/14] Add reward function --- src/envs/maze_env/server/maze_environment.py | 62 +++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index e9560bb1..7d86dc06 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -46,7 +46,7 @@ def __init__( ): # Create underlying Maze instance (matches your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) - + self.total_reward = 0 # env.reset() will be called in reset(); state initialized to None until then self.state: Optional[MazeState] = None @@ -65,8 +65,8 @@ def reset(self) -> MazeObservation: def step(self, action: MazeAction) -> MazeObservation: """ - Step function 
that directly manipulates the maze position grid - to ensure visible player movement. + Step function that manipulates the maze position grid + and applies rewards/penalties for movement outcomes. """ # --- Get current position --- @@ -88,30 +88,59 @@ def step(self, action: MazeAction) -> MazeObservation: 3: (0, 1), } + # --- Reward settings --- + reward_exit = 10.0 # reward for reaching the exit cell + penalty_move = 0.05 # penalty for a move that didn't find the exit + penalty_visited = -0.25 # penalty for revisiting a cell + penalty_impossible = -0.75 # penalty for invalid move (wall/outside) + dr, dc = move_map.get(action.action, (0, 0)) new_r, new_c = row + dr, col + dc - # --- Check if move is within bounds and not a wall --- - if ( + # Keep track of visited cells + if not hasattr(self, "_visited"): + self._visited = set() + self._visited.add((row, col)) + + # --- Check if move is valid --- + valid_move = ( 0 <= new_r < maze.shape[0] and 0 <= new_c < maze.shape[1] - and maze[new_r, new_c] != 1 # assuming 1 = wall, 0 = free space - ): + and maze[new_r, new_c] != 1 + ) + + reward = 0.0 + done = False + + if valid_move: + # Update position row, col = new_r, new_c + exit_cell = getattr(self.env, "exit_cell", None) + if exit_cell and (row, col) == exit_cell: + reward += reward_exit + done = True + self._visited = set() + elif (row, col) in self._visited: + reward += penalty_visited + else: + reward += penalty_move + else: + # Invalid move + reward += penalty_impossible + # --- Update environment position --- if hasattr(self.env, "agent_position"): self.env.agent_position = (row, col) elif hasattr(self.env, "_Maze__current_cell"): self.env._Maze__current_cell = (row, col) - # --- Reward and done --- - total_reward = getattr(self.env, "_Maze__total_reward", 0.0) - if hasattr(self.env, "_Maze__total_reward"): - self.env._Maze__total_reward = total_reward + 0.0 # change as needed - - exit_cell = getattr(self.env, "exit_cell", None) - done = exit_cell is not None and (row, col) == exit_cell + # --- Total reward update --- + self.total_reward += reward + print("Total reward:",self.total_reward) + print("Reward:",reward) + # if hasattr(self.env, "_Maze__total_reward"): + # self.env._Maze__total_reward = total_reward # --- Update state --- if self.state is None: @@ -119,15 +148,18 @@ def step(self, action: MazeAction) -> MazeObservation: self.state.step_count += 1 self.state.done = done + # --- Observation --- pos_list = [row, col] legal_actions = self._compute_legal_actions(pos_list) + # --- Return observation --- return MazeObservation( position=pos_list, - total_reward=total_reward, + total_reward=self.total_reward, legal_actions=legal_actions, ) + def state(self) -> Optional[MazeState]: """Return the current MazeState object.""" return self.state From f47d107e3dbc964cb28138eb5a33b8e7ef6877c9 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 03:59:43 +0530 Subject: [PATCH 05/14] Update maze example for reward --- examples/maze_human.py | 101 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 examples/maze_human.py diff --git a/examples/maze_human.py b/examples/maze_human.py new file mode 100644 index 00000000..1f81449a --- /dev/null +++ b/examples/maze_human.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" +Simple example of using Maze environment with OpenEnv. + +This demonstrates: +1. Connecting to the Maze environment server +2. Resetting the environment +3. Taking actions +4. Observing rewards +5. Inspecting environment state + +Usage: + python examples/maze_simple.py +""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +import numpy as np +from envs.maze_env import MazeEnv, MazeAction + + +def main(): + print("🧩 Simple Maze Environment Example") + print("=" * 60) + + # Connect to environment server + # Ensure server is running: python -m envs.maze_env.server.app + env = MazeEnv(base_url="http://localhost:8000") + maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] + ]) + try: + # Reset environment + print("\nšŸ“ Resetting environment...") + result = env.reset() + + print(f" Initial position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + + # Run one episode + print("\n🚶 Navigating through maze...") + step = 0 + total_reward = 0 + + while not result.done and step < 25: + # Choose random legal action + print(f" Current position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + env.render_ascii_maze(maze,result.observation.position,[0,0],[maze.shape[0],maze.shape[1]]) + action_id = int(input("Make any move from the legal actions")) + # Take action + result = env.step(MazeAction(action=action_id)) + print(result) + reward = result.observation.total_reward or 0 + total_reward += reward + + print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") + step += 1 + print("-----------------------------------------------------") + + print(f"\nāœ… Episode finished!") + print(f" Total steps: {step}") + print(f" Total reward: {total_reward}") + + # Get environment state + state = env.state() + print(f"\nšŸ“Š Environment State:") + print(f" Episode ID: {state.episode_id}") + print(f" Step count: {state.step_count}") + print(f" Done: {state.done}") + + except Exception as e: + print(f"\nāŒ Error: {e}") + print("\nMake sure the server is running:") + print(" python -m envs.maze_env.server.app") + print("\nOr start with Docker:") + print(" docker run -p 8000:8000 maze-env:latest") + + finally: + env.close() + print("\nšŸ‘‹ Done!") + + +if __name__ == "__main__": + main() From 6c0c7a8fbe2a35518c418a29d7115703fa3b8a57 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:45:37 +0530 Subject: [PATCH 06/14] Implement done flag for win --- examples/maze_human.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/maze_human.py b/examples/maze_human.py index 1f81449a..97095e5b 100644 --- a/examples/maze_human.py +++ b/examples/maze_human.py @@ -52,11 +52,11 @@ def main(): print(f" Initial position: {result.observation.position}") print(f" Legal actions: {result.observation.legal_actions}") - + # Note: Initial total reward is 0 however it is observed it doesn't resets if you run this example again during the same server app session + print(f" Initial Total reward: {result.observation.total_reward}") # Run one episode print("\n🚶 Navigating through maze...") step = 0 - total_reward = 0 while not result.done and step < 25: # Choose 
random legal action @@ -66,9 +66,7 @@ def main(): action_id = int(input("Make any move from the legal actions")) # Take action result = env.step(MazeAction(action=action_id)) - print(result) reward = result.observation.total_reward or 0 - total_reward += reward print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") step += 1 @@ -76,7 +74,7 @@ def main(): print(f"\nāœ… Episode finished!") print(f" Total steps: {step}") - print(f" Total reward: {total_reward}") + print(f" Total reward: {reward}") # Get environment state state = env.state() From 373ff5bc1a9b7996a23694fbe6e0be270cd4d095 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:49:06 +0530 Subject: [PATCH 07/14] Minor fix --- src/envs/maze_env/server/app.py | 2 +- src/envs/maze_env/server/maze_environment.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 0282cd6e..6076e516 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -32,7 +32,7 @@ # Get game configuration from environment variables # Create the environment instance -env = MazeEnvironment(maze_array=maze) +env = MazeEnvironment(maze_array=maze,start_cell=(0,0),exit_cell=(7,7)) # Create the FastAPI app with web interface and README integration app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index 7d86dc06..a79e754f 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -42,11 +42,13 @@ def __init__( self, maze_array: np.ndarray, start_cell: Tuple[int, int] = (0, 0), - exit_cell: Optional[Tuple[int, int]] = None, + exit_cell: Optional[Tuple[int, int]] = (7,7), ): # Create underlying Maze instance (matches your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) self.total_reward = 0 + self.start_cell = start_cell + self.exit_cell = exit_cell # env.reset() will be called in reset(); state initialized to None until then self.state: Optional[MazeState] = None @@ -58,7 +60,7 @@ def reset(self) -> MazeObservation: # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) - total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) @@ -116,8 +118,7 @@ def step(self, action: MazeAction) -> MazeObservation: # Update position row, col = new_r, new_c - exit_cell = getattr(self.env, "exit_cell", None) - if exit_cell and (row, col) == exit_cell: + if self.exit_cell and (row, col) == self.exit_cell: reward += reward_exit done = True self._visited = set() @@ -137,10 +138,6 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Total reward update --- self.total_reward += reward - print("Total reward:",self.total_reward) - print("Reward:",reward) - # if hasattr(self.env, "_Maze__total_reward"): - # self.env._Maze__total_reward = total_reward # --- Update state --- if self.state is None: @@ -151,12 +148,12 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Observation --- pos_list = [row, col] legal_actions = 
self._compute_legal_actions(pos_list) - # --- Return observation --- return MazeObservation( position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions, + done=done ) From cc336cf3117031606397310f2529914e9d12c251 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:55:41 +0530 Subject: [PATCH 08/14] Typo fix --- examples/maze_simple.py | 2 ++ src/envs/maze_env/server/maze_environment.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/maze_simple.py b/examples/maze_simple.py index c3f27d91..ddacdbe3 100644 --- a/examples/maze_simple.py +++ b/examples/maze_simple.py @@ -52,6 +52,8 @@ def main(): print(f" Initial position: {result.observation.position}") print(f" Legal actions: {result.observation.legal_actions}") + # Note: Initial total reward is 0 however it is observed it doesn't resets if you run this example again during the same server app session + print(f" Initial Total reward: {result.observation.total_reward}") # Run one episode print("\n🚶 Navigating through maze...") diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index a79e754f..860ea936 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -92,7 +92,7 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Reward settings --- reward_exit = 10.0 # reward for reaching the exit cell - penalty_move = 0.05 # penalty for a move that didn't find the exit + reward_move = 0.05 # reward for a move that didn't find the exit but is valid penalty_visited = -0.25 # penalty for revisiting a cell penalty_impossible = -0.75 # penalty for invalid move (wall/outside) @@ -125,7 +125,7 @@ def step(self, action: MazeAction) -> MazeObservation: elif (row, col) in self._visited: reward += penalty_visited else: - reward += penalty_move + reward += reward_move else: # Invalid move reward += penalty_impossible From 697967949f847a4b9315f311061ca55acc6b66ff Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:08:05 +0530 Subject: [PATCH 09/14] Remove unused imports and minor fix --- examples/maze_human.py | 4 ++-- examples/maze_simple.py | 4 ++-- src/envs/maze_env/client.py | 2 +- src/envs/maze_env/models.py | 3 +-- src/envs/maze_env/server/app.py | 1 - src/envs/maze_env/server/maze_environment.py | 10 +++++----- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/maze_human.py b/examples/maze_human.py index 97095e5b..1efe3912 100644 --- a/examples/maze_human.py +++ b/examples/maze_human.py @@ -72,13 +72,13 @@ def main(): step += 1 print("-----------------------------------------------------") - print(f"\nāœ… Episode finished!") + print("\nāœ… Episode finished!") print(f" Total steps: {step}") print(f" Total reward: {reward}") # Get environment state state = env.state() - print(f"\nšŸ“Š Environment State:") + print("\nšŸ“Š Environment State:") print(f" Episode ID: {state.episode_id}") print(f" Step count: {state.step_count}") print(f" Done: {state.done}") diff --git a/examples/maze_simple.py b/examples/maze_simple.py index ddacdbe3..2b5f5e5f 100644 --- a/examples/maze_simple.py +++ b/examples/maze_simple.py @@ -76,13 +76,13 @@ def main(): step += 1 print("-----------------------------------------------------") - print(f"\nāœ… Episode finished!") + print("\nāœ… Episode finished!") print(f" Total steps: {step}") print(f" Total reward: {total_reward}") # Get environment state state = env.state() - print(f"\nšŸ“Š Environment 
State:") + print("\nšŸ“Š Environment State:") print(f" Episode ID: {state.episode_id}") print(f" Step count: {state.step_count}") print(f" Done: {state.done}") diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py index 81188562..a00887fd 100644 --- a/src/envs/maze_env/client.py +++ b/src/envs/maze_env/client.py @@ -21,7 +21,7 @@ from .models import MazeAction, MazeObservation, MazeState if TYPE_CHECKING: - from core.containers.runtime import ContainerProvider + pass class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): """HTTP client for Maze Environment.""" diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py index d642d305..2461299e 100644 --- a/src/envs/maze_env/models.py +++ b/src/envs/maze_env/models.py @@ -13,8 +13,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from pydantic import Field -from typing import Any, Dict, List, Optional, Tuple, Literal +from typing import List from core.env_server import Action, Observation, State diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 6076e516..3a9ba099 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -25,7 +25,6 @@ """ from core.env_server import create_app -import numpy as np from ..models import MazeAction, MazeObservation from .maze_environment import MazeEnvironment from .mazearray import maze diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index 860ea936..eaab821c 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -11,9 +11,9 @@ via the OpenEnv Environment interface. """ -from typing import Any, Dict, List, Tuple, Optional -from core.env_server import Action, Environment, Observation -from .maze import Maze, Status +from typing import List, Tuple, Optional +from core.env_server import Environment +from .maze import Maze from ..models import MazeAction, MazeObservation, MazeState try: @@ -60,10 +60,10 @@ def reset(self) -> MazeObservation: # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) - total_reward = 0 + self.total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) - return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) + return MazeObservation(position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions) def step(self, action: MazeAction) -> MazeObservation: """ From 15fe3672c51cbace87d940c9186c9c92eaa270ba Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:21:20 +0530 Subject: [PATCH 10/14] Remove unused import and function --- src/envs/maze_env/server/maze.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py index f9dddaaa..9ff33989 100644 --- a/src/envs/maze_env/server/maze.py +++ b/src/envs/maze_env/server/maze.py @@ -10,7 +10,6 @@ import logging from enum import Enum, IntEnum -import matplotlib.pyplot as plt import numpy as np @@ -137,33 +136,6 @@ def __draw(self): self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() - def render(self, content=Render.NOTHING): - """ Record what will be rendered during play and/or training. 
- - :param Render content: NOTHING, TRAINING, MOVES - """ - self.__render = content - - if self.__render == Render.NOTHING: - if self.__ax1: - self.__ax1.get_figure().close() - self.__ax1 = None - if self.__ax2: - self.__ax2.get_figure().close() - self.__ax2 = None - if self.__render == Render.TRAINING: - if self.__ax2 is None: - fig, self.__ax2 = plt.subplots(1, 1, tight_layout=True) - fig.canvas.set_window_title("Best move") - self.__ax2.set_axis_off() - self.render_q(None) - if self.__render in (Render.MOVES, Render.TRAINING): - if self.__ax1 is None: - fig, self.__ax1 = plt.subplots(1, 1, tight_layout=True) - fig.canvas.set_window_title("Maze") - - plt.show(block=False) - def step(self, action): """ Move the agent according to 'action' and return the new state, reward and game status. From 11347ef774cddae7640209c15187d6a07438c382 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:29:35 +0530 Subject: [PATCH 11/14] Add maze env to hf build --- .github/workflows/deploy-hf-env.yml | 3 +- src/envs/maze_env/client.py | 15 +- src/envs/maze_env/models.py | 1 + src/envs/maze_env/server/__init__.py | 2 +- src/envs/maze_env/server/app.py | 3 +- src/envs/maze_env/server/maze.py | 218 ++++++++++++------- src/envs/maze_env/server/maze_environment.py | 29 ++- src/envs/maze_env/server/mazearray.py | 22 +- 8 files changed, 193 insertions(+), 100 deletions(-) diff --git a/.github/workflows/deploy-hf-env.yml b/.github/workflows/deploy-hf-env.yml index d84833df..2f188ddf 100644 --- a/.github/workflows/deploy-hf-env.yml +++ b/.github/workflows/deploy-hf-env.yml @@ -15,6 +15,7 @@ on: - 'chat_env' - 'atari_env' - 'openspiel_env' + - 'maze_env' custom_environment: description: 'Custom environment to deploy (leave empty for none)' required: false @@ -110,7 +111,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env] + environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env, maze_env] permissions: contents: read diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py index a00887fd..dfbc1013 100644 --- a/src/envs/maze_env/client.py +++ b/src/envs/maze_env/client.py @@ -23,10 +23,17 @@ if TYPE_CHECKING: pass + class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): """HTTP client for Maze Environment.""" - def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: List[int], goal: List[int]) -> None: + def render_ascii_maze( + self, + maze: List[List[int]], + position: List[int], + start: List[int], + goal: List[int], + ) -> None: """ Render the maze grid as ASCII art in the terminal. - 0 = free cell @@ -49,8 +56,8 @@ def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: L line += "G " elif maze[r][c] == 1: line += "ā–ˆ " - elif r == rows-1 and c == cols-1: - line+= "E " + elif r == rows - 1 and c == cols - 1: + line += "E " else: line += ". 
" print(line) @@ -82,4 +89,4 @@ def _parse_state(self, payload: Dict[str, Any]) -> MazeState: episode_id=payload.get("episode_id", ""), step_count=payload.get("step_count", 0), done=payload.get("done", False), - ) \ No newline at end of file + ) diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py index 2461299e..35a00b14 100644 --- a/src/envs/maze_env/models.py +++ b/src/envs/maze_env/models.py @@ -29,6 +29,7 @@ class MazeObservation(Observation): total_reward: float legal_actions: List[int] = field(default_factory=list) + @dataclass class MazeState(State): episode_id: str diff --git a/src/envs/maze_env/server/__init__.py b/src/envs/maze_env/server/__init__.py index f3cfcf4a..1fca47db 100644 --- a/src/envs/maze_env/server/__init__.py +++ b/src/envs/maze_env/server/__init__.py @@ -8,4 +8,4 @@ from .maze import Maze, Status from .maze_environment import MazeEnvironment -__all__ = ["Maze","MazeEnvironment","Status"] \ No newline at end of file +__all__ = ["Maze", "MazeEnvironment", "Status"] diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 3a9ba099..d81ed695 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -28,10 +28,11 @@ from ..models import MazeAction, MazeObservation from .maze_environment import MazeEnvironment from .mazearray import maze + # Get game configuration from environment variables # Create the environment instance -env = MazeEnvironment(maze_array=maze,start_cell=(0,0),exit_cell=(7,7)) +env = MazeEnvironment(maze_array=maze, start_cell=(0, 0), exit_cell=(7, 7)) # Create the FastAPI app with web interface and README integration app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py index 9ff33989..1385654f 100644 --- a/src/envs/maze_env/server/maze.py +++ b/src/envs/maze_env/server/maze.py @@ -39,52 +39,73 @@ class Status(Enum): class Maze: - """ A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. - - The layout of the maze and the rules how to move through it are called the environment. An agent is placed - at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every - action results in a reward or penalty which are accumulated during the game. Every move gives a small - penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into - a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The - game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means - winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is - assumed to wander around clueless and looses. - - A note on cell coordinates: - The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze. - This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze - itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple - to (row, col) use (col, row)[::-1] + """A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. + + The layout of the maze and the rules how to move through it are called the environment. An agent is placed + at start_cell. 
The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every
+    action results in a reward or penalty, which is accumulated during the game. Every move gives a small
+    penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into
+    a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The
+    game always reaches a terminal state; the agent either wins or loses. Obviously reaching the exit means
+    winning, but if the penalties the agent collects during play exceed a certain threshold the agent is
+    assumed to wander around clueless and loses.
+
+    A note on cell coordinates:
+    The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze.
+    This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze
+    itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple
+    to (row, col) use (col, row)[::-1]
     """
-    actions = [Action.MOVE_LEFT, Action.MOVE_RIGHT, Action.MOVE_UP, Action.MOVE_DOWN]  # all possible actions
+
+    actions = [
+        Action.MOVE_LEFT,
+        Action.MOVE_RIGHT,
+        Action.MOVE_UP,
+        Action.MOVE_DOWN,
+    ]  # all possible actions
 
     reward_exit = 10.0  # reward for reaching the exit cell
-    penalty_move = -0.05  # penalty for a move which did not result in finding the exit cell
+    penalty_move = (
+        -0.05
+    )  # penalty for a move which did not result in finding the exit cell
     penalty_visited = -0.25  # penalty for returning to a cell which was visited earlier
-    penalty_impossible_move = -0.75  # penalty for trying to enter an occupied cell or moving out of the maze
+    penalty_impossible_move = (
+        -0.75
+    )  # penalty for trying to enter an occupied cell or moving out of the maze
 
     def __init__(self, maze, start_cell=(0, 0), exit_cell=None):
-        """ Create a new maze game.
+        """Create a new maze game.
- :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) - :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) - :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) + :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) + :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) + :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) """ self.maze = maze - self.__minimum_reward = -0.5 * self.maze.size # stop game if accumulated reward is below this threshold + self.__minimum_reward = ( + -0.5 * self.maze.size + ) # stop game if accumulated reward is below this threshold nrows, ncols = self.maze.shape self.cells = [(col, row) for col in range(ncols) for row in range(nrows)] - self.empty = [(col, row) for col in range(ncols) for row in range(nrows) if self.maze[row, col] == Cell.EMPTY] + self.empty = [ + (col, row) + for col in range(ncols) + for row in range(nrows) + if self.maze[row, col] == Cell.EMPTY + ] self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell self.empty.remove(self.__exit_cell) # Check for impossible maze layout if self.__exit_cell not in self.cells: - raise Exception("Error: exit cell at {} is not inside maze".format(self.__exit_cell)) + raise Exception( + "Error: exit cell at {} is not inside maze".format(self.__exit_cell) + ) if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED: - raise Exception("Error: exit cell at {} is not free".format(self.__exit_cell)) + raise Exception( + "Error: exit cell at {} is not free".format(self.__exit_cell) + ) # Variables for rendering using Matplotlib self.__render = Render.NOTHING # what to render @@ -94,17 +115,21 @@ def __init__(self, maze, start_cell=(0, 0), exit_cell=None): self.reset(start_cell) def reset(self, start_cell=(0, 0)): - """ Reset the maze to its initial state and place the agent at start_cell. + """Reset the maze to its initial state and place the agent at start_cell. 
- :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) - :return: new state after reset + :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) + :return: new state after reset """ if start_cell not in self.cells: - raise Exception("Error: start cell at {} is not inside maze".format(start_cell)) + raise Exception( + "Error: start cell at {} is not inside maze".format(start_cell) + ) if self.maze[start_cell[::-1]] == Cell.OCCUPIED: raise Exception("Error: start cell at {} is not free".format(start_cell)) if start_cell == self.__exit_cell: - raise Exception("Error: start- and exit cell cannot be the same {}".format(start_cell)) + raise Exception( + "Error: start- and exit cell cannot be the same {}".format(start_cell) + ) self.__previous_cell = self.__current_cell = start_cell self.__total_reward = 0.0 # accumulated reward @@ -119,10 +144,18 @@ def reset(self, start_cell=(0, 0)): self.__ax1.set_yticks(np.arange(0.5, ncols, step=1)) self.__ax1.set_yticklabels([]) self.__ax1.grid(True) - self.__ax1.plot(*self.__current_cell, "rs", markersize=30) # start is a big red square - self.__ax1.text(*self.__current_cell, "Start", ha="center", va="center", color="white") - self.__ax1.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square - self.__ax1.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax1.plot( + *self.__current_cell, "rs", markersize=30 + ) # start is a big red square + self.__ax1.text( + *self.__current_cell, "Start", ha="center", va="center", color="white" + ) + self.__ax1.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax1.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) self.__ax1.imshow(self.maze, cmap="binary") self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() @@ -130,35 +163,43 @@ def reset(self, start_cell=(0, 0)): return self.__observe() def __draw(self): - """ Draw a line from the agents previous cell to its current cell. """ - self.__ax1.plot(*zip(*[self.__previous_cell, self.__current_cell]), "bo-") # previous cells are blue dots + """Draw a line from the agents previous cell to its current cell.""" + self.__ax1.plot( + *zip(*[self.__previous_cell, self.__current_cell]), "bo-" + ) # previous cells are blue dots self.__ax1.plot(*self.__current_cell, "ro") # current cell is a red dot self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() def step(self, action): - """ Move the agent according to 'action' and return the new state, reward and game status. + """Move the agent according to 'action' and return the new state, reward and game status. - :param Action action: the agent will move in this direction - :return: state, reward, status + :param Action action: the agent will move in this direction + :return: state, reward, status """ reward = self.__execute(action) self.__total_reward += reward status = self.__status() state = self.__observe() - logging.debug("action: {:10s} | reward: {: .2f} | status: {}".format(Action(action).name, reward, status)) + logging.debug( + "action: {:10s} | reward: {: .2f} | status: {}".format( + Action(action).name, reward, status + ) + ) return state, reward, status def __execute(self, action): - """ Execute action and collect the reward or penalty. + """Execute action and collect the reward or penalty. 
- :param Action action: direction in which the agent will move - :return float: reward or penalty which results from the action + :param Action action: direction in which the agent will move + :return float: reward or penalty which results from the action """ possible_actions = self.__possible_actions(self.__current_cell) if not possible_actions: - reward = self.__minimum_reward - 1 # cannot move anywhere, force end of game + reward = ( + self.__minimum_reward - 1 + ) # cannot move anywhere, force end of game elif action in possible_actions: col, row = self.__current_cell if action == Action.MOVE_LEFT: @@ -179,21 +220,27 @@ def __execute(self, action): if self.__current_cell == self.__exit_cell: reward = Maze.reward_exit # maximum reward when reaching the exit cell elif self.__current_cell in self.__visited: - reward = Maze.penalty_visited # penalty when returning to a cell which was visited earlier + reward = ( + Maze.penalty_visited + ) # penalty when returning to a cell which was visited earlier else: - reward = Maze.penalty_move # penalty for a move which did not result in finding the exit cell + reward = ( + Maze.penalty_move + ) # penalty for a move which did not result in finding the exit cell self.__visited.add(self.__current_cell) else: - reward = Maze.penalty_impossible_move # penalty for trying to enter an occupied cell or move out of the maze + reward = ( + Maze.penalty_impossible_move + ) # penalty for trying to enter an occupied cell or move out of the maze return reward def __possible_actions(self, cell=None): - """ Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. + """Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. - :param tuple cell: location of the agent (optional, else use current cell) - :return list: all possible actions + :param tuple cell: location of the agent (optional, else use current cell) + :return list: all possible actions """ if cell is None: col, row = self.__current_cell @@ -206,42 +253,48 @@ def __possible_actions(self, cell=None): nrows, ncols = self.maze.shape if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED): possible_actions.remove(Action.MOVE_UP) - if row == nrows - 1 or (row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED): + if row == nrows - 1 or ( + row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED + ): possible_actions.remove(Action.MOVE_DOWN) if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED): possible_actions.remove(Action.MOVE_LEFT) - if col == ncols - 1 or (col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED): + if col == ncols - 1 or ( + col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED + ): possible_actions.remove(Action.MOVE_RIGHT) return possible_actions def __status(self): - """ Return the game status. + """Return the game status. - :return Status: current game status (WIN, LOSE, PLAYING) + :return Status: current game status (WIN, LOSE, PLAYING) """ if self.__current_cell == self.__exit_cell: return Status.WIN - if self.__total_reward < self.__minimum_reward: # force end of game after too much loss + if ( + self.__total_reward < self.__minimum_reward + ): # force end of game after too much loss return Status.LOSE return Status.PLAYING def __observe(self): - """ Return the state of the maze - in this game the agents current location. + """Return the state of the maze - in this game the agents current location. 
- :return numpy.array [1][2]: agents current location + :return numpy.array [1][2]: agents current location """ return np.array([[*self.__current_cell]]) def play(self, model, start_cell=(0, 0)): - """ Play a single game, choosing the next move based a prediction from 'model'. + """Play a single game, choosing the next move based a prediction from 'model'. - :param class AbstractModel model: the prediction model to use - :param tuple start_cell: agents initial cell (optional, else upper left) - :return Status: WIN, LOSE + :param class AbstractModel model: the prediction model to use + :param tuple start_cell: agents initial cell (optional, else upper left) + :return Status: WIN, LOSE """ self.reset(start_cell) @@ -254,9 +307,11 @@ def play(self, model, start_cell=(0, 0)): return status def check_win_all(self, model): - """ Check if the model wins from all possible starting cells. """ + """Check if the model wins from all possible starting cells.""" previous = self.__render - self.__render = Render.NOTHING # avoid rendering anything during execution of the check games + self.__render = ( + Render.NOTHING + ) # avoid rendering anything during execution of the check games win = 0 lose = 0 @@ -269,14 +324,18 @@ def check_win_all(self, model): self.__render = previous # restore previous rendering setting - logging.info("won: {} | lost: {} | win rate: {:.5f}".format(win, lose, win / (win + lose))) + logging.info( + "won: {} | lost: {} | win rate: {:.5f}".format( + win, lose, win / (win + lose) + ) + ) result = True if lose == 0 else False return result, win / (win + lose) def render_q(self, model): - """ Render the recommended action(s) for each cell as provided by 'model'. + """Render the recommended action(s) for each cell as provided by 'model'. :param class AbstractModel model: the prediction model to use """ @@ -293,8 +352,12 @@ def clip(n): self.__ax2.set_yticks(np.arange(0.5, ncols, step=1)) self.__ax2.set_yticklabels([]) self.__ax2.grid(True) - self.__ax2.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square - self.__ax2.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax2.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax2.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) for cell in self.empty: q = model.q(cell) if model is not None else [0, 0, 0, 0] @@ -315,9 +378,18 @@ def clip(n): # color (from red to green) represents the certainty of the preferred action(s) maxv = 1 minv = -1 - color = clip((q[action] - minv) / (maxv - minv)) # normalize in [-1, 1] - - self.__ax2.arrow(*cell, dx, dy, color=(1 - color, color, 0), head_width=0.2, head_length=0.1) + color = clip( + (q[action] - minv) / (maxv - minv) + ) # normalize in [-1, 1] + + self.__ax2.arrow( + *cell, + dx, + dy, + color=(1 - color, color, 0), + head_width=0.2, + head_length=0.1, + ) self.__ax2.imshow(self.maze, cmap="binary") - self.__ax2.get_figure().canvas.draw() \ No newline at end of file + self.__ax2.get_figure().canvas.draw() diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index eaab821c..b9675bcf 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -42,7 +42,7 @@ def __init__( self, maze_array: np.ndarray, start_cell: Tuple[int, int] = (0, 0), - exit_cell: Optional[Tuple[int, int]] = (7,7), + exit_cell: Optional[Tuple[int, int]] = (7, 7), ): # Create underlying Maze instance (matches 
your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) @@ -54,16 +54,26 @@ def __init__( def reset(self) -> MazeObservation: """Reset environment and return initial observation (MazeObservation).""" - observation = self.env.reset() # typically returns np.array([row, col]) or similar + observation = ( + self.env.reset() + ) # typically returns np.array([row, col]) or similar # initialize episode state self.state = MazeState(episode_id="episode_1", step_count=0, done=False) # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields - pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) + pos_list = ( + observation.tolist() + if hasattr(observation, "tolist") + else list(observation) + ) self.total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) - return MazeObservation(position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions) + return MazeObservation( + position=pos_list, + total_reward=self.total_reward, + legal_actions=legal_actions, + ) def step(self, action: MazeAction) -> MazeObservation: """ @@ -91,9 +101,9 @@ def step(self, action: MazeAction) -> MazeObservation: } # --- Reward settings --- - reward_exit = 10.0 # reward for reaching the exit cell - reward_move = 0.05 # reward for a move that didn't find the exit but is valid - penalty_visited = -0.25 # penalty for revisiting a cell + reward_exit = 10.0 # reward for reaching the exit cell + reward_move = 0.05 # reward for a move that didn't find the exit but is valid + penalty_visited = -0.25 # penalty for revisiting a cell penalty_impossible = -0.75 # penalty for invalid move (wall/outside) dr, dc = move_map.get(action.action, (0, 0)) @@ -153,10 +163,9 @@ def step(self, action: MazeAction) -> MazeObservation: position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions, - done=done + done=done, ) - def state(self) -> Optional[MazeState]: """Return the current MazeState object.""" return self.state @@ -186,4 +195,4 @@ def _compute_legal_actions(self, pos: List[int]) -> List[int]: if col < ncols - 1 and self.env.maze[row, col + 1] == 0: actions.append(3) - return actions \ No newline at end of file + return actions diff --git a/src/envs/maze_env/server/mazearray.py b/src/envs/maze_env/server/mazearray.py index b87935e2..3cd7dbd6 100644 --- a/src/envs/maze_env/server/mazearray.py +++ b/src/envs/maze_env/server/mazearray.py @@ -1,13 +1,15 @@ import numpy as np # Maze -maze = np.array([ - [0, 1, 0, 0, 0, 0, 0, 0], - [0, 1, 0, 1, 0, 1, 0, 0], - [0, 0, 0, 1, 1, 0, 1, 0], - [0, 1, 0, 1, 0, 0, 0, 0], - [1, 0, 0, 1, 0, 1, 0, 0], - [0, 0, 0, 1, 0, 1, 1, 1], - [0, 1, 1, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 1, 0, 0] -]) \ No newline at end of file +maze = np.array( + [ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + ] +) From 71e6f97d985e2d0b34cc4f204b2740beebecc43b Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:37:26 +0530 Subject: [PATCH 12/14] Fix hf deployment for maze env --- scripts/prepare_hf_deployment.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/prepare_hf_deployment.sh b/scripts/prepare_hf_deployment.sh index 23fd4779..381edffd 100755 --- a/scripts/prepare_hf_deployment.sh +++ b/scripts/prepare_hf_deployment.sh @@ -157,6 +157,7 @@ README_EOF "chat_env") 
ENV_CLASS="ChatEnv" ;;
         "atari_env") ENV_CLASS="AtariEnv" ;;
         "openspiel_env") ENV_CLASS="OpenSpielEnv" ;;
+        "maze_env") ENV_CLASS="MazeEnv" ;;
         *) ENV_CLASS="Env" ;;
     esac
 

From b05505065f454ae5136ea33246e2a85462b4c1a4 Mon Sep 17 00:00:00 2001
From: Vivek Silimkhan
Date: Sun, 2 Nov 2025 19:06:31 +0530
Subject: [PATCH 13/14] Fix hf deployment

---
 .github/workflows/deploy-hf-env.yml | 6 +++---
 src/envs/maze_env/server/Dockerfile | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/deploy-hf-env.yml b/.github/workflows/deploy-hf-env.yml
index 2f188ddf..12849a3f 100644
--- a/.github/workflows/deploy-hf-env.yml
+++ b/.github/workflows/deploy-hf-env.yml
@@ -64,7 +64,7 @@ jobs:
           if [ "${{ github.event.inputs.environment }}" = "all" ]; then
             echo "deploy_all=true" >> $GITHUB_OUTPUT
             echo "use_matrix=true" >> $GITHUB_OUTPUT
-            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT
+            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT
             echo "Manual trigger - deploying all environments with matrix"
           else
             echo "deploy_all=false" >> $GITHUB_OUTPUT
@@ -79,14 +79,14 @@
           if git diff --name-only HEAD~1 HEAD | grep -E '^src/core/' > /dev/null; then
             echo "deploy_all=true" >> $GITHUB_OUTPUT
             echo "use_matrix=true" >> $GITHUB_OUTPUT
-            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT
+            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT
             echo "Core files changed - deploying all environments with matrix"
             exit 0
           fi

           # Check which specific environments changed
           changed_envs=()
-          for env in echo_env coding_env chat_env atari_env openspiel_env; do
+          for env in echo_env coding_env chat_env atari_env openspiel_env maze_env; do
             if git diff --name-only HEAD~1 HEAD | grep -E "^src/envs/$env/" > /dev/null; then
               changed_envs+=("$env")
             fi
diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile
index 544d3e44..2d2e3d6f 100644
--- a/src/envs/maze_env/server/Dockerfile
+++ b/src/envs/maze_env/server/Dockerfile
@@ -29,7 +29,7 @@ COPY src/envs/maze_env/README.md /app/README.md

 # Extend Python path for OpenEnv (base image sets PYTHONPATH=/app/src)
 # We prepend Maze paths
-ENV PYTHONPATH=/repo:/repo/build/python:/app/src
+ENV PYTHONPATH=/app/src


 # Health check (curl provided by openenv-base)

From 3656fc69cb7ca648310f746695875db73662660d Mon Sep 17 00:00:00 2001
From: Vivek Silimkhan
Date: Sun, 2 Nov 2025 21:47:11 +0530
Subject: [PATCH 14/14] Update deploy to hf script

---
 scripts/deploy_to_hf.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/deploy_to_hf.sh b/scripts/deploy_to_hf.sh
index 20256c9a..ef212ffb 100755
--- a/scripts/deploy_to_hf.sh
+++ b/scripts/deploy_to_hf.sh
@@ -290,6 +290,13 @@ DOCKERFILE_EOF
            echo "OpenSpiel builds can take 10-15 minutes due to C++ compilation"
            return  # Skip the common parts since OpenSpiel has its own complete Dockerfile
            ;;
+        "maze_env")
+            cat >> "$CURRENT_STAGING_DIR/Dockerfile" << 'DOCKERFILE_EOF'
+# Install additional dependencies for MazeEnvironment
+RUN pip install --no-cache-dir numpy
+DOCKERFILE_EOF
+            # Maze env requires numpy
+            ;;
     esac

     # Add common parts
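
The Maze class added in this series ends an episode as lost once the accumulated reward drops below `-0.5 * maze.size`. A minimal sketch of that threshold arithmetic for the 8x8 grid in mazearray.py, assuming only the reward constants shown in the patch (the variable names below are just for illustration):

```python
# Constants mirrored from Maze in src/envs/maze_env/server/maze.py (this patch series).
reward_exit = 10.0
penalty_move = -0.05
penalty_visited = -0.25
penalty_impossible_move = -0.75

maze_size = 8 * 8                   # the 8x8 grid defined in mazearray.py
minimum_reward = -0.5 * maze_size   # episode is lost once total reward drops below this

# Worst case: the agent bumps into a wall on every step.
steps_to_lose = int(minimum_reward // penalty_impossible_move) + 1

print(minimum_reward)  # -32.0
print(steps_to_lose)   # 43 impossible moves are enough to force the LOSE status
```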