From 2bc12df23847d55ed82377ddbe6bae16c26d5a45 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:00:28 +0530 Subject: [PATCH 01/14] Add maze environment and example --- examples/maze_simple.py | 101 ++++++ src/envs/maze_env/README.md | 123 +++++++ src/envs/maze_env/__init__.py | 16 + src/envs/maze_env/client.py | 85 +++++ src/envs/maze_env/models.py | 37 ++ src/envs/maze_env/server/Dockerfile | 38 ++ src/envs/maze_env/server/__init__.py | 11 + src/envs/maze_env/server/app.py | 44 +++ src/envs/maze_env/server/maze.py | 351 +++++++++++++++++++ src/envs/maze_env/server/maze_environment.py | 160 +++++++++ src/envs/maze_env/server/mazearray.py | 13 + 11 files changed, 979 insertions(+) create mode 100644 examples/maze_simple.py create mode 100644 src/envs/maze_env/README.md create mode 100644 src/envs/maze_env/__init__.py create mode 100644 src/envs/maze_env/client.py create mode 100644 src/envs/maze_env/models.py create mode 100644 src/envs/maze_env/server/Dockerfile create mode 100644 src/envs/maze_env/server/__init__.py create mode 100644 src/envs/maze_env/server/app.py create mode 100644 src/envs/maze_env/server/maze.py create mode 100644 src/envs/maze_env/server/maze_environment.py create mode 100644 src/envs/maze_env/server/mazearray.py diff --git a/examples/maze_simple.py b/examples/maze_simple.py new file mode 100644 index 00000000..c3f27d91 --- /dev/null +++ b/examples/maze_simple.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Simple example of using Maze environment with OpenEnv. + +This demonstrates: +1. Connecting to the Maze environment server +2. Resetting the environment +3. Taking actions +4. Observing rewards +5. 
Inspecting environment state + +Usage: + python examples/maze_simple.py +""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +import numpy as np +from envs.maze_env import MazeEnv, MazeAction + + +def main(): + print("🧩 Simple Maze Environment Example") + print("=" * 60) + + # Connect to environment server + # Ensure server is running: python -m envs.maze_env.server.app + env = MazeEnv(base_url="http://localhost:8000") + maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] + ]) + try: + # Reset environment + print("\nšŸ“ Resetting environment...") + result = env.reset() + + print(f" Initial position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + + # Run one episode + print("\n🚶 Navigating through maze...") + step = 0 + total_reward = 0 + + while not result.done and step < 20: + # Choose random legal action + print(f" Current position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + env.render_ascii_maze(maze,result.observation.position,[0,0],[maze.shape[0],maze.shape[1]]) + action_id = result.observation.legal_actions[step % len(result.observation.legal_actions)] + # Take action + result = env.step(MazeAction(action=action_id)) + + reward = result.reward or 0 + total_reward += reward + + print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") + step += 1 + print("-----------------------------------------------------") + + print(f"\nāœ… Episode finished!") + print(f" Total steps: {step}") + print(f" Total reward: {total_reward}") + + # Get environment state + state = env.state() + print(f"\nšŸ“Š Environment State:") + print(f" Episode ID: {state.episode_id}") + print(f" Step count: {state.step_count}") + print(f" Done: {state.done}") + + except Exception as e: + print(f"\nāŒ Error: {e}") + print("\nMake sure the server is running:") + print(" python -m envs.maze_env.server.app") + print("\nOr start with Docker:") + print(" docker run -p 8000:8000 maze-env:latest") + + finally: + env.close() + print("\nšŸ‘‹ Done!") + + +if __name__ == "__main__": + main() diff --git a/src/envs/maze_env/README.md b/src/envs/maze_env/README.md new file mode 100644 index 00000000..c2b4e5cd --- /dev/null +++ b/src/envs/maze_env/README.md @@ -0,0 +1,123 @@ +# Maze Environment + +Integration of Maze game with the OpenEnv framework. 
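+
+The maze is a 2D grid in which `0` marks a free cell and `1` marks a wall; by default the agent starts in
+the upper-left corner and has to reach the exit in the lower-right corner. Actions are plain integers
+(`0` = up, `1` = down, `2` = left, `3` = right), and every observation reports the agent's position, the
+reward accumulated so far and the actions that are legal from the current cell. The snippet below is a
+minimal sketch of that round trip; it assumes a server is already listening on `localhost:8000`.
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+env = MazeEnv(base_url="http://localhost:8000")
+result = env.reset()
+
+# Observation fields returned by the server
+print(result.observation.position)       # the agent's current cell
+print(result.observation.legal_actions)  # e.g. [1]: only "down" is open from the default start cell
+
+# Step once using the first legal action (0 = up, 1 = down, 2 = left, 3 = right)
+result = env.step(MazeAction(action=result.observation.legal_actions[0]))
+print(result.reward, result.done)
+
+env.close()
+```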
+
+## Architecture
+
+```
+ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”
+│   RL Training Code (Client)          │
+│   MazeEnv.step(action)               │
+ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”¬ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜
+                  │ HTTP
+ā”Œā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā–¼ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”
+│   FastAPI Server (Docker)            │
+│   MazeEnvironment                    │
+│   ā”œā”€ Wraps Maze environment          │
+│   └─ Agent controls player           │
+ā””ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”€ā”˜
+```
+
+## Installation & Usage
+
+### Option 1: Local Development (without Docker)
+
+**Requirements:**
+- Python 3.11+
+- NumPy
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+# Start local server manually
+# python -m envs.maze_env.server.app
+
+# Connect to local server
+env = MazeEnv(base_url="http://localhost:8000")
+
+# Reset environment
+result = env.reset()
+print(f"Initial position: {result.observation.position}")
+print(f"Legal actions: {result.observation.legal_actions}")
+
+# Take actions
+for _ in range(10):
+    action_id = result.observation.legal_actions[0]  # Choose first legal action
+    result = env.step(MazeAction(action=action_id))
+    print(f"Reward: {result.reward}, Done: {result.done}")
+    if result.done:
+        break
+
+# Cleanup
+env.close()
+```
+
+### Option 2: Docker (Recommended)
+
+**Build Docker image:**
+
+```bash
+cd OpenEnv
+docker build -f src/envs/maze_env/server/Dockerfile -t maze-env:latest .
+```
+
+**Use with from_docker_image():**
+
+```python
+from envs.maze_env import MazeEnv, MazeAction
+
+# Automatically starts container
+env = MazeEnv.from_docker_image("maze-env:latest")
+
+result = env.reset()
+result = env.step(MazeAction(action=0))
+
+env.close()  # Stops container
+```
+
+## Configuration
+
+### Variables
+
+- `maze`: the maze layout, defined as a NumPy array in `mazearray.py` (0 = free cell, 1 = wall)
+
+### Example
+
+```bash
+docker run -p 8000:8000 maze-env:latest
+```
+
+## API Reference
+
+### MazeAction
+
+```python
+@dataclass
+class MazeAction(Action):
+    action: int  # Action to take (0 = up, 1 = down, 2 = left, 3 = right)
+```
+
+### MazeObservation
+
+```python
+@dataclass
+class MazeObservation(Observation):
+    position: List[int]  # [row, col]
+    total_reward: float  # Total reward accumulated during the episode
+    legal_actions: List[int] = field(default_factory=list)  # Legal actions from the current position
+```
+
+### MazeState
+
+```python
+@dataclass
+class MazeState(State):
+    episode_id: str  # Episode identifier
+    step_count: int  # Number of steps taken
+    done: bool = False  # Whether the maze has been solved
+```
+
+## References
+
+- [Maze Environment](https://github.com/erikdelange/Reinforcement-Learning-Maze)
diff --git a/src/envs/maze_env/__init__.py b/src/envs/maze_env/__init__.py
new file mode 100644
index 00000000..0c2c79f7
--- /dev/null
+++ b/src/envs/maze_env/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Maze Environment Integration.
+
+This module provides integration between the Maze game and the OpenEnv framework.
+""" + +from .client import MazeEnv +from .models import MazeAction, MazeObservation, MazeState + +__all__ = ["MazeEnv", "MazeAction", "MazeObservation", "MazeState"] diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py new file mode 100644 index 00000000..81188562 --- /dev/null +++ b/src/envs/maze_env/client.py @@ -0,0 +1,85 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +MazeEnv HTTP Client. + +This module provides the client for connecting to a Maze Environment server +over HTTP. +""" + +from __future__ import annotations + +from typing import Any, Dict, List, TYPE_CHECKING + +from core.client_types import StepResult +from core.http_env_client import HTTPEnvClient + +from .models import MazeAction, MazeObservation, MazeState + +if TYPE_CHECKING: + from core.containers.runtime import ContainerProvider + +class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): + """HTTP client for Maze Environment.""" + + def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: List[int], goal: List[int]) -> None: + """ + Render the maze grid as ASCII art in the terminal. + - 0 = free cell + - 1 = wall + - S = start + - G = goal + - P = player + - E = exit + """ + print("\nCurrent Maze State:") + rows, cols = len(maze), len(maze[0]) + for r in range(rows): + line = "" + for c in range(cols): + if [r, c] == position: + line += "P " + elif [r, c] == start: + line += "S " + elif [r, c] == goal: + line += "G " + elif maze[r][c] == 1: + line += "ā–ˆ " + elif r == rows-1 and c == cols-1: + line+= "E " + else: + line += ". " + print(line) + print() + + def _step_payload(self, action: MazeAction) -> Dict[str, Any]: + """Prepare payload to send to the environment server.""" + return {"action": action.action} + + def _parse_result(self, payload: Dict[str, Any]) -> StepResult[MazeObservation]: + """Parse the response from the server into MazeObservation + reward/done.""" + obs_data = payload.get("observation", {}) + + observation = MazeObservation( + position=obs_data.get("position", []), + total_reward=obs_data.get("total_reward", 0.0), + legal_actions=obs_data.get("legal_actions", []), + ) + + return StepResult( + observation=observation, + reward=payload.get("reward", 0.0), + done=payload.get("done", False), + ) + + def _parse_state(self, payload: Dict[str, Any]) -> MazeState: + """Parse environment state from payload.""" + return MazeState( + episode_id=payload.get("episode_id", ""), + step_count=payload.get("step_count", 0), + done=payload.get("done", False), + ) \ No newline at end of file diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py new file mode 100644 index 00000000..d642d305 --- /dev/null +++ b/src/envs/maze_env/models.py @@ -0,0 +1,37 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Data models for Maze Environment. + +This module defines the Action, Observation, and State types for Maze games. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from pydantic import Field +from typing import Any, Dict, List, Optional, Tuple, Literal + +from core.env_server import Action, Observation, State + + +@dataclass +class MazeAction(Action): + action: int + + +@dataclass +class MazeObservation(Observation): + position: List[int] # [row, col] + total_reward: float + legal_actions: List[int] = field(default_factory=list) + +@dataclass +class MazeState(State): + episode_id: str + step_count: int + done: bool = False diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile new file mode 100644 index 00000000..3000b570 --- /dev/null +++ b/src/envs/maze_env/server/Dockerfile @@ -0,0 +1,38 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Use the pre-built OpenEnv base image +# Built from: docker build -t openenv-base:latest -f src/core/Dockerfile.openenv-base . +# In CI, this can be overridden to use GHCR or other registries +ARG OPENENV_BASE_IMAGE=openenv-base:latest +FROM ${OPENENV_BASE_IMAGE} + +# Set working directory +WORKDIR /app + +# Copy OpenEnv core (already expected in base image but ensure updated) +COPY src/core/ /app/src/core/ + +# Copy Maze environment +COPY src/envs/maze_env/ /app/src/envs/maze_env/ + +# Copy README for web interface documentation +COPY src/envs/maze_env/README.md /app/README.md + +# Extend Python path for OpenEnv (base image sets PYTHONPATH=/app/src) +# We prepend Maze paths +ENV PYTHONPATH=/repo:/repo/build/python:/app/src + + +# Health check (curl provided by openenv-base) +HEALTHCHECK --interval=30s --timeout=3s --start-period=5s --retries=3 \ + CMD curl -f http://localhost:8000/health || exit 1 + +# Expose default port +EXPOSE 8000 + +# Run the FastAPI server (uvicorn installed by openenv-base) +CMD ["uvicorn", "envs.maze_env.server.app:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/src/envs/maze_env/server/__init__.py b/src/envs/maze_env/server/__init__.py new file mode 100644 index 00000000..f3cfcf4a --- /dev/null +++ b/src/envs/maze_env/server/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +"""Server-side implementation for Maze environments.""" +from .maze import Maze, Status +from .maze_environment import MazeEnvironment + +__all__ = ["Maze","MazeEnvironment","Status"] \ No newline at end of file diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py new file mode 100644 index 00000000..0282cd6e --- /dev/null +++ b/src/envs/maze_env/server/app.py @@ -0,0 +1,44 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +FastAPI application for the Maze Environment. + +This module creates an HTTP server that exposes Maze game +over HTTP endpoints, making them compatible with HTTPEnvClient. 
+ +Usage: + # Development (with auto-reload): + uvicorn envs.maze_env.server.app:app --reload --host 0.0.0.0 --port 8000 + + # Production: + uvicorn envs.maze_env.server.app:app --host 0.0.0.0 --port 8000 --workers 4 + + # Or run directly: + python -m envs.maze_env.server.app + +Variables: + maze: np.array - Maze as a numpy array +""" + +from core.env_server import create_app +import numpy as np +from ..models import MazeAction, MazeObservation +from .maze_environment import MazeEnvironment +from .mazearray import maze +# Get game configuration from environment variables + +# Create the environment instance +env = MazeEnvironment(maze_array=maze) + +# Create the FastAPI app with web interface and README integration +app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") + + +if __name__ == "__main__": + import uvicorn + + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py new file mode 100644 index 00000000..f9dddaaa --- /dev/null +++ b/src/envs/maze_env/server/maze.py @@ -0,0 +1,351 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Derived from https://github.com/erikdelange/Reinforcement-Learning-Maze/blob/master/main.py (MIT LICENSE) +# Original Author: Erik de Lange + +import logging +from enum import Enum, IntEnum + +import matplotlib.pyplot as plt +import numpy as np + + +class Cell(IntEnum): + EMPTY = 0 # indicates empty cell where the agent can move to + OCCUPIED = 1 # indicates cell which contains a wall and cannot be entered + CURRENT = 2 # indicates current cell of the agent + + +class Action(IntEnum): + MOVE_LEFT = 2 + MOVE_RIGHT = 3 + MOVE_UP = 0 + MOVE_DOWN = 1 + + +class Render(Enum): + NOTHING = 0 + TRAINING = 1 + MOVES = 2 + + +class Status(Enum): + WIN = 0 + LOSE = 1 + PLAYING = 2 + + +class Maze: + """ A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. + + The layout of the maze and the rules how to move through it are called the environment. An agent is placed + at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every + action results in a reward or penalty which are accumulated during the game. Every move gives a small + penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into + a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The + game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means + winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is + assumed to wander around clueless and looses. + + A note on cell coordinates: + The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze. + This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze + itself is stored as a 2D numpy array so cells are accessed via [row, col]. 
To convert a (col, row) tuple + to (row, col) use (col, row)[::-1] + """ + actions = [Action.MOVE_LEFT, Action.MOVE_RIGHT, Action.MOVE_UP, Action.MOVE_DOWN] # all possible actions + + reward_exit = 10.0 # reward for reaching the exit cell + penalty_move = -0.05 # penalty for a move which did not result in finding the exit cell + penalty_visited = -0.25 # penalty for returning to a cell which was visited earlier + penalty_impossible_move = -0.75 # penalty for trying to enter an occupied cell or moving out of the maze + + def __init__(self, maze, start_cell=(0, 0), exit_cell=None): + """ Create a new maze game. + + :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) + :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) + :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) + """ + self.maze = maze + + self.__minimum_reward = -0.5 * self.maze.size # stop game if accumulated reward is below this threshold + + nrows, ncols = self.maze.shape + self.cells = [(col, row) for col in range(ncols) for row in range(nrows)] + self.empty = [(col, row) for col in range(ncols) for row in range(nrows) if self.maze[row, col] == Cell.EMPTY] + self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell + self.empty.remove(self.__exit_cell) + + # Check for impossible maze layout + if self.__exit_cell not in self.cells: + raise Exception("Error: exit cell at {} is not inside maze".format(self.__exit_cell)) + if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED: + raise Exception("Error: exit cell at {} is not free".format(self.__exit_cell)) + + # Variables for rendering using Matplotlib + self.__render = Render.NOTHING # what to render + self.__ax1 = None # axes for rendering the moves + self.__ax2 = None # axes for rendering the best action per cell + + self.reset(start_cell) + + def reset(self, start_cell=(0, 0)): + """ Reset the maze to its initial state and place the agent at start_cell. 
+ + :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) + :return: new state after reset + """ + if start_cell not in self.cells: + raise Exception("Error: start cell at {} is not inside maze".format(start_cell)) + if self.maze[start_cell[::-1]] == Cell.OCCUPIED: + raise Exception("Error: start cell at {} is not free".format(start_cell)) + if start_cell == self.__exit_cell: + raise Exception("Error: start- and exit cell cannot be the same {}".format(start_cell)) + + self.__previous_cell = self.__current_cell = start_cell + self.__total_reward = 0.0 # accumulated reward + self.__visited = set() # a set() only stores unique values + + if self.__render in (Render.TRAINING, Render.MOVES): + # render the maze + nrows, ncols = self.maze.shape + self.__ax1.clear() + self.__ax1.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax1.set_xticklabels([]) + self.__ax1.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax1.set_yticklabels([]) + self.__ax1.grid(True) + self.__ax1.plot(*self.__current_cell, "rs", markersize=30) # start is a big red square + self.__ax1.text(*self.__current_cell, "Start", ha="center", va="center", color="white") + self.__ax1.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square + self.__ax1.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax1.imshow(self.maze, cmap="binary") + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + return self.__observe() + + def __draw(self): + """ Draw a line from the agents previous cell to its current cell. """ + self.__ax1.plot(*zip(*[self.__previous_cell, self.__current_cell]), "bo-") # previous cells are blue dots + self.__ax1.plot(*self.__current_cell, "ro") # current cell is a red dot + self.__ax1.get_figure().canvas.draw() + self.__ax1.get_figure().canvas.flush_events() + + def render(self, content=Render.NOTHING): + """ Record what will be rendered during play and/or training. + + :param Render content: NOTHING, TRAINING, MOVES + """ + self.__render = content + + if self.__render == Render.NOTHING: + if self.__ax1: + self.__ax1.get_figure().close() + self.__ax1 = None + if self.__ax2: + self.__ax2.get_figure().close() + self.__ax2 = None + if self.__render == Render.TRAINING: + if self.__ax2 is None: + fig, self.__ax2 = plt.subplots(1, 1, tight_layout=True) + fig.canvas.set_window_title("Best move") + self.__ax2.set_axis_off() + self.render_q(None) + if self.__render in (Render.MOVES, Render.TRAINING): + if self.__ax1 is None: + fig, self.__ax1 = plt.subplots(1, 1, tight_layout=True) + fig.canvas.set_window_title("Maze") + + plt.show(block=False) + + def step(self, action): + """ Move the agent according to 'action' and return the new state, reward and game status. + + :param Action action: the agent will move in this direction + :return: state, reward, status + """ + reward = self.__execute(action) + self.__total_reward += reward + status = self.__status() + state = self.__observe() + logging.debug("action: {:10s} | reward: {: .2f} | status: {}".format(Action(action).name, reward, status)) + return state, reward, status + + def __execute(self, action): + """ Execute action and collect the reward or penalty. 
+ + :param Action action: direction in which the agent will move + :return float: reward or penalty which results from the action + """ + possible_actions = self.__possible_actions(self.__current_cell) + + if not possible_actions: + reward = self.__minimum_reward - 1 # cannot move anywhere, force end of game + elif action in possible_actions: + col, row = self.__current_cell + if action == Action.MOVE_LEFT: + col -= 1 + elif action == Action.MOVE_UP: + row -= 1 + if action == Action.MOVE_RIGHT: + col += 1 + elif action == Action.MOVE_DOWN: + row += 1 + + self.__previous_cell = self.__current_cell + self.__current_cell = (col, row) + + if self.__render != Render.NOTHING: + self.__draw() + + if self.__current_cell == self.__exit_cell: + reward = Maze.reward_exit # maximum reward when reaching the exit cell + elif self.__current_cell in self.__visited: + reward = Maze.penalty_visited # penalty when returning to a cell which was visited earlier + else: + reward = Maze.penalty_move # penalty for a move which did not result in finding the exit cell + + self.__visited.add(self.__current_cell) + else: + reward = Maze.penalty_impossible_move # penalty for trying to enter an occupied cell or move out of the maze + + return reward + + def __possible_actions(self, cell=None): + """ Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. + + :param tuple cell: location of the agent (optional, else use current cell) + :return list: all possible actions + """ + if cell is None: + col, row = self.__current_cell + else: + col, row = cell + + possible_actions = Maze.actions.copy() # initially allow all + + # now restrict the initial list by removing impossible actions + nrows, ncols = self.maze.shape + if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_UP) + if row == nrows - 1 or (row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_DOWN) + + if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_LEFT) + if col == ncols - 1 or (col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED): + possible_actions.remove(Action.MOVE_RIGHT) + + return possible_actions + + def __status(self): + """ Return the game status. + + :return Status: current game status (WIN, LOSE, PLAYING) + """ + if self.__current_cell == self.__exit_cell: + return Status.WIN + + if self.__total_reward < self.__minimum_reward: # force end of game after too much loss + return Status.LOSE + + return Status.PLAYING + + def __observe(self): + """ Return the state of the maze - in this game the agents current location. + + :return numpy.array [1][2]: agents current location + """ + return np.array([[*self.__current_cell]]) + + def play(self, model, start_cell=(0, 0)): + """ Play a single game, choosing the next move based a prediction from 'model'. + + :param class AbstractModel model: the prediction model to use + :param tuple start_cell: agents initial cell (optional, else upper left) + :return Status: WIN, LOSE + """ + self.reset(start_cell) + + state = self.__observe() + + while True: + action = model.predict(state=state) + state, reward, status = self.step(action) + if status in (Status.WIN, Status.LOSE): + return status + + def check_win_all(self, model): + """ Check if the model wins from all possible starting cells. 
""" + previous = self.__render + self.__render = Render.NOTHING # avoid rendering anything during execution of the check games + + win = 0 + lose = 0 + + for cell in self.empty: + if self.play(model, cell) == Status.WIN: + win += 1 + else: + lose += 1 + + self.__render = previous # restore previous rendering setting + + logging.info("won: {} | lost: {} | win rate: {:.5f}".format(win, lose, win / (win + lose))) + + result = True if lose == 0 else False + + return result, win / (win + lose) + + def render_q(self, model): + """ Render the recommended action(s) for each cell as provided by 'model'. + + :param class AbstractModel model: the prediction model to use + """ + + def clip(n): + return max(min(1, n), 0) + + if self.__render == Render.TRAINING: + nrows, ncols = self.maze.shape + + self.__ax2.clear() + self.__ax2.set_xticks(np.arange(0.5, nrows, step=1)) + self.__ax2.set_xticklabels([]) + self.__ax2.set_yticks(np.arange(0.5, ncols, step=1)) + self.__ax2.set_yticklabels([]) + self.__ax2.grid(True) + self.__ax2.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square + self.__ax2.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + + for cell in self.empty: + q = model.q(cell) if model is not None else [0, 0, 0, 0] + a = np.nonzero(q == np.max(q))[0] + + for action in a: + dx = 0 + dy = 0 + if action == Action.MOVE_LEFT: + dx = -0.2 + if action == Action.MOVE_RIGHT: + dx = +0.2 + if action == Action.MOVE_UP: + dy = -0.2 + if action == Action.MOVE_DOWN: + dy = 0.2 + + # color (from red to green) represents the certainty of the preferred action(s) + maxv = 1 + minv = -1 + color = clip((q[action] - minv) / (maxv - minv)) # normalize in [-1, 1] + + self.__ax2.arrow(*cell, dx, dy, color=(1 - color, color, 0), head_width=0.2, head_length=0.1) + + self.__ax2.imshow(self.maze, cmap="binary") + self.__ax2.get_figure().canvas.draw() \ No newline at end of file diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py new file mode 100644 index 00000000..e9560bb1 --- /dev/null +++ b/src/envs/maze_env/server/maze_environment.py @@ -0,0 +1,160 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Maze Environment Server Implementation. + +This module wraps Maze's environment and exposes it +via the OpenEnv Environment interface. +""" + +from typing import Any, Dict, List, Tuple, Optional +from core.env_server import Action, Environment, Observation +from .maze import Maze, Status +from ..models import MazeAction, MazeObservation, MazeState + +try: + import numpy as np +except ImportError as e: + raise ImportError( + "Numpy is not installed. " + "Please install it following instructions at: " + "pip install numpy" + ) from e + + +class MazeEnvironment(Environment): + """ + Maze Environment wrapper for OpenEnv. + + This environment wraps Maze game and provides a single-agent interface. 
+ + Args: + maze_array: Maze array as numpy array + start cell: Start of the maze + exit_cell: Exit for the maze + """ + + def __init__( + self, + maze_array: np.ndarray, + start_cell: Tuple[int, int] = (0, 0), + exit_cell: Optional[Tuple[int, int]] = None, + ): + # Create underlying Maze instance (matches your working code) + self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) + + # env.reset() will be called in reset(); state initialized to None until then + self.state: Optional[MazeState] = None + + def reset(self) -> MazeObservation: + """Reset environment and return initial observation (MazeObservation).""" + observation = self.env.reset() # typically returns np.array([row, col]) or similar + # initialize episode state + self.state = MazeState(episode_id="episode_1", step_count=0, done=False) + + # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields + pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) + total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + legal_actions = self._compute_legal_actions(pos_list[0]) + + return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) + + def step(self, action: MazeAction) -> MazeObservation: + """ + Step function that directly manipulates the maze position grid + to ensure visible player movement. + """ + + # --- Get current position --- + if hasattr(self.env, "agent_position"): + row, col = self.env.agent_position + elif hasattr(self.env, "_Maze__current_cell"): + row, col = self.env._Maze__current_cell + else: + row, col = self.env._Maze__start_cell + + maze = np.array(self.env.maze) + + # --- Define movement directions --- + # 0 = UP, 1 = DOWN, 2 = LEFT, 3 = RIGHT + move_map = { + 0: (-1, 0), + 1: (1, 0), + 2: (0, -1), + 3: (0, 1), + } + + dr, dc = move_map.get(action.action, (0, 0)) + new_r, new_c = row + dr, col + dc + + # --- Check if move is within bounds and not a wall --- + if ( + 0 <= new_r < maze.shape[0] + and 0 <= new_c < maze.shape[1] + and maze[new_r, new_c] != 1 # assuming 1 = wall, 0 = free space + ): + row, col = new_r, new_c + + # --- Update environment position --- + if hasattr(self.env, "agent_position"): + self.env.agent_position = (row, col) + elif hasattr(self.env, "_Maze__current_cell"): + self.env._Maze__current_cell = (row, col) + + # --- Reward and done --- + total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + if hasattr(self.env, "_Maze__total_reward"): + self.env._Maze__total_reward = total_reward + 0.0 # change as needed + + exit_cell = getattr(self.env, "exit_cell", None) + done = exit_cell is not None and (row, col) == exit_cell + + # --- Update state --- + if self.state is None: + self.state = MazeState(episode_id="episode_1", step_count=0, done=done) + self.state.step_count += 1 + self.state.done = done + + pos_list = [row, col] + legal_actions = self._compute_legal_actions(pos_list) + + return MazeObservation( + position=pos_list, + total_reward=total_reward, + legal_actions=legal_actions, + ) + + def state(self) -> Optional[MazeState]: + """Return the current MazeState object.""" + return self.state + + def _compute_legal_actions(self, pos: List[int]) -> List[int]: + """ + Compute which actions are legal given the current normalized position [row, col]. 
+ (0=UP, 1=DOWN, 2=LEFT, 3=RIGHT) + """ + actions: List[int] = [] + if not pos or len(pos) < 2: + return actions + + row, col = int(pos[0]), int(pos[1]) + nrows, ncols = self.env.maze.shape + + # UP + if row > 0 and self.env.maze[row - 1, col] == 0: + actions.append(0) + # DOWN + if row < nrows - 1 and self.env.maze[row + 1, col] == 0: + actions.append(1) + # LEFT + if col > 0 and self.env.maze[row, col - 1] == 0: + actions.append(2) + # RIGHT + if col < ncols - 1 and self.env.maze[row, col + 1] == 0: + actions.append(3) + + return actions \ No newline at end of file diff --git a/src/envs/maze_env/server/mazearray.py b/src/envs/maze_env/server/mazearray.py new file mode 100644 index 00000000..b87935e2 --- /dev/null +++ b/src/envs/maze_env/server/mazearray.py @@ -0,0 +1,13 @@ +import numpy as np + +# Maze +maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] +]) \ No newline at end of file From e3eafcf12575a29cfd79c31a498cbf35430a0eaf Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:17:56 +0530 Subject: [PATCH 02/14] Add maze environment to workflow matrix --- .github/workflows/docker-build.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml index 8cba8b47..b82da2a9 100644 --- a/.github/workflows/docker-build.yml +++ b/.github/workflows/docker-build.yml @@ -79,7 +79,8 @@ jobs: dockerfile: src/envs/atari_env/server/Dockerfile - name: git-env dockerfile: src/envs/git_env/server/Dockerfile - + - name: maze-env + dockerfile: src/envs/maze_env/server/Dockerfile steps: - name: Checkout code uses: actions/checkout@v4 From d45843465052682620144142b8e88b5883774b60 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 00:23:54 +0530 Subject: [PATCH 03/14] Add dependencies to Dockerfile --- src/envs/maze_env/server/Dockerfile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile index 3000b570..544d3e44 100644 --- a/src/envs/maze_env/server/Dockerfile +++ b/src/envs/maze_env/server/Dockerfile @@ -10,6 +10,11 @@ ARG OPENENV_BASE_IMAGE=openenv-base:latest FROM ${OPENENV_BASE_IMAGE} +# Install Python dependencies that all environments need +RUN pip install --no-cache-dir \ + numpy>=2.3.4 \ + matplotlib>=3.10.7 + # Set working directory WORKDIR /app From e9e1c0f2089f6794edf1abb8e333e06be45df50b Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 03:58:52 +0530 Subject: [PATCH 04/14] Add reward function --- src/envs/maze_env/server/maze_environment.py | 62 +++++++++++++++----- 1 file changed, 47 insertions(+), 15 deletions(-) diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index e9560bb1..7d86dc06 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -46,7 +46,7 @@ def __init__( ): # Create underlying Maze instance (matches your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) - + self.total_reward = 0 # env.reset() will be called in reset(); state initialized to None until then self.state: Optional[MazeState] = None @@ -65,8 +65,8 @@ def reset(self) -> MazeObservation: def step(self, action: MazeAction) -> MazeObservation: """ - Step function 
that directly manipulates the maze position grid - to ensure visible player movement. + Step function that manipulates the maze position grid + and applies rewards/penalties for movement outcomes. """ # --- Get current position --- @@ -88,30 +88,59 @@ def step(self, action: MazeAction) -> MazeObservation: 3: (0, 1), } + # --- Reward settings --- + reward_exit = 10.0 # reward for reaching the exit cell + penalty_move = 0.05 # penalty for a move that didn't find the exit + penalty_visited = -0.25 # penalty for revisiting a cell + penalty_impossible = -0.75 # penalty for invalid move (wall/outside) + dr, dc = move_map.get(action.action, (0, 0)) new_r, new_c = row + dr, col + dc - # --- Check if move is within bounds and not a wall --- - if ( + # Keep track of visited cells + if not hasattr(self, "_visited"): + self._visited = set() + self._visited.add((row, col)) + + # --- Check if move is valid --- + valid_move = ( 0 <= new_r < maze.shape[0] and 0 <= new_c < maze.shape[1] - and maze[new_r, new_c] != 1 # assuming 1 = wall, 0 = free space - ): + and maze[new_r, new_c] != 1 + ) + + reward = 0.0 + done = False + + if valid_move: + # Update position row, col = new_r, new_c + exit_cell = getattr(self.env, "exit_cell", None) + if exit_cell and (row, col) == exit_cell: + reward += reward_exit + done = True + self._visited = set() + elif (row, col) in self._visited: + reward += penalty_visited + else: + reward += penalty_move + else: + # Invalid move + reward += penalty_impossible + # --- Update environment position --- if hasattr(self.env, "agent_position"): self.env.agent_position = (row, col) elif hasattr(self.env, "_Maze__current_cell"): self.env._Maze__current_cell = (row, col) - # --- Reward and done --- - total_reward = getattr(self.env, "_Maze__total_reward", 0.0) - if hasattr(self.env, "_Maze__total_reward"): - self.env._Maze__total_reward = total_reward + 0.0 # change as needed - - exit_cell = getattr(self.env, "exit_cell", None) - done = exit_cell is not None and (row, col) == exit_cell + # --- Total reward update --- + self.total_reward += reward + print("Total reward:",self.total_reward) + print("Reward:",reward) + # if hasattr(self.env, "_Maze__total_reward"): + # self.env._Maze__total_reward = total_reward # --- Update state --- if self.state is None: @@ -119,15 +148,18 @@ def step(self, action: MazeAction) -> MazeObservation: self.state.step_count += 1 self.state.done = done + # --- Observation --- pos_list = [row, col] legal_actions = self._compute_legal_actions(pos_list) + # --- Return observation --- return MazeObservation( position=pos_list, - total_reward=total_reward, + total_reward=self.total_reward, legal_actions=legal_actions, ) + def state(self) -> Optional[MazeState]: """Return the current MazeState object.""" return self.state From f47d107e3dbc964cb28138eb5a33b8e7ef6877c9 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 03:59:43 +0530 Subject: [PATCH 05/14] Update maze example for reward --- examples/maze_human.py | 101 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 examples/maze_human.py diff --git a/examples/maze_human.py b/examples/maze_human.py new file mode 100644 index 00000000..1f81449a --- /dev/null +++ b/examples/maze_human.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +""" +Simple example of using Maze environment with OpenEnv. + +This demonstrates: +1. Connecting to the Maze environment server +2. Resetting the environment +3. Taking actions +4. Observing rewards +5. Inspecting environment state + +Usage: + python examples/maze_simple.py +""" + +import sys +from pathlib import Path + +# Add src to path +sys.path.insert(0, str(Path(__file__).parent.parent / "src")) +import numpy as np +from envs.maze_env import MazeEnv, MazeAction + + +def main(): + print("🧩 Simple Maze Environment Example") + print("=" * 60) + + # Connect to environment server + # Ensure server is running: python -m envs.maze_env.server.app + env = MazeEnv(base_url="http://localhost:8000") + maze = np.array([ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0] + ]) + try: + # Reset environment + print("\nšŸ“ Resetting environment...") + result = env.reset() + + print(f" Initial position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + + # Run one episode + print("\n🚶 Navigating through maze...") + step = 0 + total_reward = 0 + + while not result.done and step < 25: + # Choose random legal action + print(f" Current position: {result.observation.position}") + print(f" Legal actions: {result.observation.legal_actions}") + env.render_ascii_maze(maze,result.observation.position,[0,0],[maze.shape[0],maze.shape[1]]) + action_id = int(input("Make any move from the legal actions")) + # Take action + result = env.step(MazeAction(action=action_id)) + print(result) + reward = result.observation.total_reward or 0 + total_reward += reward + + print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") + step += 1 + print("-----------------------------------------------------") + + print(f"\nāœ… Episode finished!") + print(f" Total steps: {step}") + print(f" Total reward: {total_reward}") + + # Get environment state + state = env.state() + print(f"\nšŸ“Š Environment State:") + print(f" Episode ID: {state.episode_id}") + print(f" Step count: {state.step_count}") + print(f" Done: {state.done}") + + except Exception as e: + print(f"\nāŒ Error: {e}") + print("\nMake sure the server is running:") + print(" python -m envs.maze_env.server.app") + print("\nOr start with Docker:") + print(" docker run -p 8000:8000 maze-env:latest") + + finally: + env.close() + print("\nšŸ‘‹ Done!") + + +if __name__ == "__main__": + main() From 6c0c7a8fbe2a35518c418a29d7115703fa3b8a57 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:45:37 +0530 Subject: [PATCH 06/14] Implement done flag for win --- examples/maze_human.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/examples/maze_human.py b/examples/maze_human.py index 1f81449a..97095e5b 100644 --- a/examples/maze_human.py +++ b/examples/maze_human.py @@ -52,11 +52,11 @@ def main(): print(f" Initial position: {result.observation.position}") print(f" Legal actions: {result.observation.legal_actions}") - + # Note: Initial total reward is 0 however it is observed it doesn't resets if you run this example again during the same server app session + print(f" Initial Total reward: {result.observation.total_reward}") # Run one episode print("\n🚶 Navigating through maze...") step = 0 - total_reward = 0 while not result.done and step < 25: # Choose 
random legal action @@ -66,9 +66,7 @@ def main(): action_id = int(input("Make any move from the legal actions")) # Take action result = env.step(MazeAction(action=action_id)) - print(result) reward = result.observation.total_reward or 0 - total_reward += reward print(f" Step {step + 1}: action={action_id}, pos={result.observation.position}, reward={reward:.2f}, done={result.done}") step += 1 @@ -76,7 +74,7 @@ def main(): print(f"\nāœ… Episode finished!") print(f" Total steps: {step}") - print(f" Total reward: {total_reward}") + print(f" Total reward: {reward}") # Get environment state state = env.state() From 373ff5bc1a9b7996a23694fbe6e0be270cd4d095 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:49:06 +0530 Subject: [PATCH 07/14] Minor fix --- src/envs/maze_env/server/app.py | 2 +- src/envs/maze_env/server/maze_environment.py | 15 ++++++--------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 0282cd6e..6076e516 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -32,7 +32,7 @@ # Get game configuration from environment variables # Create the environment instance -env = MazeEnvironment(maze_array=maze) +env = MazeEnvironment(maze_array=maze,start_cell=(0,0),exit_cell=(7,7)) # Create the FastAPI app with web interface and README integration app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index 7d86dc06..a79e754f 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -42,11 +42,13 @@ def __init__( self, maze_array: np.ndarray, start_cell: Tuple[int, int] = (0, 0), - exit_cell: Optional[Tuple[int, int]] = None, + exit_cell: Optional[Tuple[int, int]] = (7,7), ): # Create underlying Maze instance (matches your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) self.total_reward = 0 + self.start_cell = start_cell + self.exit_cell = exit_cell # env.reset() will be called in reset(); state initialized to None until then self.state: Optional[MazeState] = None @@ -58,7 +60,7 @@ def reset(self) -> MazeObservation: # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) - total_reward = getattr(self.env, "_Maze__total_reward", 0.0) + total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) @@ -116,8 +118,7 @@ def step(self, action: MazeAction) -> MazeObservation: # Update position row, col = new_r, new_c - exit_cell = getattr(self.env, "exit_cell", None) - if exit_cell and (row, col) == exit_cell: + if self.exit_cell and (row, col) == self.exit_cell: reward += reward_exit done = True self._visited = set() @@ -137,10 +138,6 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Total reward update --- self.total_reward += reward - print("Total reward:",self.total_reward) - print("Reward:",reward) - # if hasattr(self.env, "_Maze__total_reward"): - # self.env._Maze__total_reward = total_reward # --- Update state --- if self.state is None: @@ -151,12 +148,12 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Observation --- pos_list = [row, col] legal_actions = 
self._compute_legal_actions(pos_list) - # --- Return observation --- return MazeObservation( position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions, + done=done ) From cc336cf3117031606397310f2529914e9d12c251 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Tue, 28 Oct 2025 17:55:41 +0530 Subject: [PATCH 08/14] Typo fix --- examples/maze_simple.py | 2 ++ src/envs/maze_env/server/maze_environment.py | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/maze_simple.py b/examples/maze_simple.py index c3f27d91..ddacdbe3 100644 --- a/examples/maze_simple.py +++ b/examples/maze_simple.py @@ -52,6 +52,8 @@ def main(): print(f" Initial position: {result.observation.position}") print(f" Legal actions: {result.observation.legal_actions}") + # Note: Initial total reward is 0 however it is observed it doesn't resets if you run this example again during the same server app session + print(f" Initial Total reward: {result.observation.total_reward}") # Run one episode print("\n🚶 Navigating through maze...") diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index a79e754f..860ea936 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -92,7 +92,7 @@ def step(self, action: MazeAction) -> MazeObservation: # --- Reward settings --- reward_exit = 10.0 # reward for reaching the exit cell - penalty_move = 0.05 # penalty for a move that didn't find the exit + reward_move = 0.05 # reward for a move that didn't find the exit but is valid penalty_visited = -0.25 # penalty for revisiting a cell penalty_impossible = -0.75 # penalty for invalid move (wall/outside) @@ -125,7 +125,7 @@ def step(self, action: MazeAction) -> MazeObservation: elif (row, col) in self._visited: reward += penalty_visited else: - reward += penalty_move + reward += reward_move else: # Invalid move reward += penalty_impossible From 697967949f847a4b9315f311061ca55acc6b66ff Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:08:05 +0530 Subject: [PATCH 09/14] Remove unused imports and minor fix --- examples/maze_human.py | 4 ++-- examples/maze_simple.py | 4 ++-- src/envs/maze_env/client.py | 2 +- src/envs/maze_env/models.py | 3 +-- src/envs/maze_env/server/app.py | 1 - src/envs/maze_env/server/maze_environment.py | 10 +++++----- 6 files changed, 11 insertions(+), 13 deletions(-) diff --git a/examples/maze_human.py b/examples/maze_human.py index 97095e5b..1efe3912 100644 --- a/examples/maze_human.py +++ b/examples/maze_human.py @@ -72,13 +72,13 @@ def main(): step += 1 print("-----------------------------------------------------") - print(f"\nāœ… Episode finished!") + print("\nāœ… Episode finished!") print(f" Total steps: {step}") print(f" Total reward: {reward}") # Get environment state state = env.state() - print(f"\nšŸ“Š Environment State:") + print("\nšŸ“Š Environment State:") print(f" Episode ID: {state.episode_id}") print(f" Step count: {state.step_count}") print(f" Done: {state.done}") diff --git a/examples/maze_simple.py b/examples/maze_simple.py index ddacdbe3..2b5f5e5f 100644 --- a/examples/maze_simple.py +++ b/examples/maze_simple.py @@ -76,13 +76,13 @@ def main(): step += 1 print("-----------------------------------------------------") - print(f"\nāœ… Episode finished!") + print("\nāœ… Episode finished!") print(f" Total steps: {step}") print(f" Total reward: {total_reward}") # Get environment state state = env.state() - print(f"\nšŸ“Š Environment 
State:") + print("\nšŸ“Š Environment State:") print(f" Episode ID: {state.episode_id}") print(f" Step count: {state.step_count}") print(f" Done: {state.done}") diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py index 81188562..a00887fd 100644 --- a/src/envs/maze_env/client.py +++ b/src/envs/maze_env/client.py @@ -21,7 +21,7 @@ from .models import MazeAction, MazeObservation, MazeState if TYPE_CHECKING: - from core.containers.runtime import ContainerProvider + pass class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): """HTTP client for Maze Environment.""" diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py index d642d305..2461299e 100644 --- a/src/envs/maze_env/models.py +++ b/src/envs/maze_env/models.py @@ -13,8 +13,7 @@ from __future__ import annotations from dataclasses import dataclass, field -from pydantic import Field -from typing import Any, Dict, List, Optional, Tuple, Literal +from typing import List from core.env_server import Action, Observation, State diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 6076e516..3a9ba099 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -25,7 +25,6 @@ """ from core.env_server import create_app -import numpy as np from ..models import MazeAction, MazeObservation from .maze_environment import MazeEnvironment from .mazearray import maze diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index 860ea936..eaab821c 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -11,9 +11,9 @@ via the OpenEnv Environment interface. """ -from typing import Any, Dict, List, Tuple, Optional -from core.env_server import Action, Environment, Observation -from .maze import Maze, Status +from typing import List, Tuple, Optional +from core.env_server import Environment +from .maze import Maze from ..models import MazeAction, MazeObservation, MazeState try: @@ -60,10 +60,10 @@ def reset(self) -> MazeObservation: # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) - total_reward = 0 + self.total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) - return MazeObservation(position=pos_list, total_reward=total_reward, legal_actions=legal_actions) + return MazeObservation(position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions) def step(self, action: MazeAction) -> MazeObservation: """ From 15fe3672c51cbace87d940c9186c9c92eaa270ba Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:21:20 +0530 Subject: [PATCH 10/14] Remove unused import and function --- src/envs/maze_env/server/maze.py | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py index f9dddaaa..9ff33989 100644 --- a/src/envs/maze_env/server/maze.py +++ b/src/envs/maze_env/server/maze.py @@ -10,7 +10,6 @@ import logging from enum import Enum, IntEnum -import matplotlib.pyplot as plt import numpy as np @@ -137,33 +136,6 @@ def __draw(self): self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() - def render(self, content=Render.NOTHING): - """ Record what will be rendered during play and/or training. 
- - :param Render content: NOTHING, TRAINING, MOVES - """ - self.__render = content - - if self.__render == Render.NOTHING: - if self.__ax1: - self.__ax1.get_figure().close() - self.__ax1 = None - if self.__ax2: - self.__ax2.get_figure().close() - self.__ax2 = None - if self.__render == Render.TRAINING: - if self.__ax2 is None: - fig, self.__ax2 = plt.subplots(1, 1, tight_layout=True) - fig.canvas.set_window_title("Best move") - self.__ax2.set_axis_off() - self.render_q(None) - if self.__render in (Render.MOVES, Render.TRAINING): - if self.__ax1 is None: - fig, self.__ax1 = plt.subplots(1, 1, tight_layout=True) - fig.canvas.set_window_title("Maze") - - plt.show(block=False) - def step(self, action): """ Move the agent according to 'action' and return the new state, reward and game status. From 11347ef774cddae7640209c15187d6a07438c382 Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:29:35 +0530 Subject: [PATCH 11/14] Add maze env to hf build --- .github/workflows/deploy-hf-env.yml | 3 +- src/envs/maze_env/client.py | 15 +- src/envs/maze_env/models.py | 1 + src/envs/maze_env/server/__init__.py | 2 +- src/envs/maze_env/server/app.py | 3 +- src/envs/maze_env/server/maze.py | 218 ++++++++++++------- src/envs/maze_env/server/maze_environment.py | 29 ++- src/envs/maze_env/server/mazearray.py | 22 +- 8 files changed, 193 insertions(+), 100 deletions(-) diff --git a/.github/workflows/deploy-hf-env.yml b/.github/workflows/deploy-hf-env.yml index d84833df..2f188ddf 100644 --- a/.github/workflows/deploy-hf-env.yml +++ b/.github/workflows/deploy-hf-env.yml @@ -15,6 +15,7 @@ on: - 'chat_env' - 'atari_env' - 'openspiel_env' + - 'maze_env' custom_environment: description: 'Custom environment to deploy (leave empty for none)' required: false @@ -110,7 +111,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env] + environment: [echo_env, coding_env, chat_env, atari_env, openspiel_env, maze_env] permissions: contents: read diff --git a/src/envs/maze_env/client.py b/src/envs/maze_env/client.py index a00887fd..dfbc1013 100644 --- a/src/envs/maze_env/client.py +++ b/src/envs/maze_env/client.py @@ -23,10 +23,17 @@ if TYPE_CHECKING: pass + class MazeEnv(HTTPEnvClient[MazeAction, MazeObservation]): """HTTP client for Maze Environment.""" - def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: List[int], goal: List[int]) -> None: + def render_ascii_maze( + self, + maze: List[List[int]], + position: List[int], + start: List[int], + goal: List[int], + ) -> None: """ Render the maze grid as ASCII art in the terminal. - 0 = free cell @@ -49,8 +56,8 @@ def render_ascii_maze(self, maze: List[List[int]], position: List[int], start: L line += "G " elif maze[r][c] == 1: line += "ā–ˆ " - elif r == rows-1 and c == cols-1: - line+= "E " + elif r == rows - 1 and c == cols - 1: + line += "E " else: line += ". 
" print(line) @@ -82,4 +89,4 @@ def _parse_state(self, payload: Dict[str, Any]) -> MazeState: episode_id=payload.get("episode_id", ""), step_count=payload.get("step_count", 0), done=payload.get("done", False), - ) \ No newline at end of file + ) diff --git a/src/envs/maze_env/models.py b/src/envs/maze_env/models.py index 2461299e..35a00b14 100644 --- a/src/envs/maze_env/models.py +++ b/src/envs/maze_env/models.py @@ -29,6 +29,7 @@ class MazeObservation(Observation): total_reward: float legal_actions: List[int] = field(default_factory=list) + @dataclass class MazeState(State): episode_id: str diff --git a/src/envs/maze_env/server/__init__.py b/src/envs/maze_env/server/__init__.py index f3cfcf4a..1fca47db 100644 --- a/src/envs/maze_env/server/__init__.py +++ b/src/envs/maze_env/server/__init__.py @@ -8,4 +8,4 @@ from .maze import Maze, Status from .maze_environment import MazeEnvironment -__all__ = ["Maze","MazeEnvironment","Status"] \ No newline at end of file +__all__ = ["Maze", "MazeEnvironment", "Status"] diff --git a/src/envs/maze_env/server/app.py b/src/envs/maze_env/server/app.py index 3a9ba099..d81ed695 100644 --- a/src/envs/maze_env/server/app.py +++ b/src/envs/maze_env/server/app.py @@ -28,10 +28,11 @@ from ..models import MazeAction, MazeObservation from .maze_environment import MazeEnvironment from .mazearray import maze + # Get game configuration from environment variables # Create the environment instance -env = MazeEnvironment(maze_array=maze,start_cell=(0,0),exit_cell=(7,7)) +env = MazeEnvironment(maze_array=maze, start_cell=(0, 0), exit_cell=(7, 7)) # Create the FastAPI app with web interface and README integration app = create_app(env, MazeAction, MazeObservation, env_name="maze_env") diff --git a/src/envs/maze_env/server/maze.py b/src/envs/maze_env/server/maze.py index 9ff33989..1385654f 100644 --- a/src/envs/maze_env/server/maze.py +++ b/src/envs/maze_env/server/maze.py @@ -39,52 +39,73 @@ class Status(Enum): class Maze: - """ A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. - - The layout of the maze and the rules how to move through it are called the environment. An agent is placed - at start_cell. The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every - action results in a reward or penalty which are accumulated during the game. Every move gives a small - penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into - a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The - game always reaches a terminal state; the agent either wins or looses. Obviously reaching the exit means - winning, but if the penalties the agent is collecting during play exceed a certain threshold the agent is - assumed to wander around clueless and looses. - - A note on cell coordinates: - The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze. - This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze - itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple - to (row, col) use (col, row)[::-1] + """A maze with walls. An agent is placed at the start cell and must find the exit cell by moving through the maze. + + The layout of the maze and the rules how to move through it are called the environment. An agent is placed + at start_cell. 
The agent chooses actions (move left/right/up/down) in order to reach the exit_cell. Every
+    action results in a reward or penalty, which is accumulated during the game. Every move gives a small
+    penalty (-0.05), returning to a cell the agent visited earlier a bigger penalty (-0.25) and running into
+    a wall a large penalty (-0.75). The reward (+10.0) is collected when the agent reaches the exit. The
+    game always reaches a terminal state; the agent either wins or loses. Obviously reaching the exit means
+    winning, but if the penalties the agent collects during play exceed a certain threshold the agent is
+    assumed to wander around clueless and loses.
+
+    A note on cell coordinates:
+    The cells in the maze are stored as (col, row) or (x, y) tuples. (0, 0) is the upper left corner of the maze.
+    This way of storing coordinates is in line with what matplotlib's plot() function expects as inputs. The maze
+    itself is stored as a 2D numpy array so cells are accessed via [row, col]. To convert a (col, row) tuple
+    to (row, col) use (col, row)[::-1]
     """
-    actions = [Action.MOVE_LEFT, Action.MOVE_RIGHT, Action.MOVE_UP, Action.MOVE_DOWN]  # all possible actions
+
+    actions = [
+        Action.MOVE_LEFT,
+        Action.MOVE_RIGHT,
+        Action.MOVE_UP,
+        Action.MOVE_DOWN,
+    ]  # all possible actions
 
     reward_exit = 10.0  # reward for reaching the exit cell
-    penalty_move = -0.05  # penalty for a move which did not result in finding the exit cell
+    penalty_move = (
+        -0.05
+    )  # penalty for a move which did not result in finding the exit cell
     penalty_visited = -0.25  # penalty for returning to a cell which was visited earlier
-    penalty_impossible_move = -0.75  # penalty for trying to enter an occupied cell or moving out of the maze
+    penalty_impossible_move = (
+        -0.75
+    )  # penalty for trying to enter an occupied cell or moving out of the maze
 
     def __init__(self, maze, start_cell=(0, 0), exit_cell=None):
-        """ Create a new maze game.
+        """Create a new maze game.
- :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) - :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) - :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) + :param numpy.array maze: 2D array containing empty cells (= 0) and cells occupied with walls (= 1) + :param tuple start_cell: starting cell for the agent in the maze (optional, else upper left) + :param tuple exit_cell: exit cell which the agent has to reach (optional, else lower right) """ self.maze = maze - self.__minimum_reward = -0.5 * self.maze.size # stop game if accumulated reward is below this threshold + self.__minimum_reward = ( + -0.5 * self.maze.size + ) # stop game if accumulated reward is below this threshold nrows, ncols = self.maze.shape self.cells = [(col, row) for col in range(ncols) for row in range(nrows)] - self.empty = [(col, row) for col in range(ncols) for row in range(nrows) if self.maze[row, col] == Cell.EMPTY] + self.empty = [ + (col, row) + for col in range(ncols) + for row in range(nrows) + if self.maze[row, col] == Cell.EMPTY + ] self.__exit_cell = (ncols - 1, nrows - 1) if exit_cell is None else exit_cell self.empty.remove(self.__exit_cell) # Check for impossible maze layout if self.__exit_cell not in self.cells: - raise Exception("Error: exit cell at {} is not inside maze".format(self.__exit_cell)) + raise Exception( + "Error: exit cell at {} is not inside maze".format(self.__exit_cell) + ) if self.maze[self.__exit_cell[::-1]] == Cell.OCCUPIED: - raise Exception("Error: exit cell at {} is not free".format(self.__exit_cell)) + raise Exception( + "Error: exit cell at {} is not free".format(self.__exit_cell) + ) # Variables for rendering using Matplotlib self.__render = Render.NOTHING # what to render @@ -94,17 +115,21 @@ def __init__(self, maze, start_cell=(0, 0), exit_cell=None): self.reset(start_cell) def reset(self, start_cell=(0, 0)): - """ Reset the maze to its initial state and place the agent at start_cell. + """Reset the maze to its initial state and place the agent at start_cell. 
- :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) - :return: new state after reset + :param tuple start_cell: here the agent starts its journey through the maze (optional, else upper left) + :return: new state after reset """ if start_cell not in self.cells: - raise Exception("Error: start cell at {} is not inside maze".format(start_cell)) + raise Exception( + "Error: start cell at {} is not inside maze".format(start_cell) + ) if self.maze[start_cell[::-1]] == Cell.OCCUPIED: raise Exception("Error: start cell at {} is not free".format(start_cell)) if start_cell == self.__exit_cell: - raise Exception("Error: start- and exit cell cannot be the same {}".format(start_cell)) + raise Exception( + "Error: start- and exit cell cannot be the same {}".format(start_cell) + ) self.__previous_cell = self.__current_cell = start_cell self.__total_reward = 0.0 # accumulated reward @@ -119,10 +144,18 @@ def reset(self, start_cell=(0, 0)): self.__ax1.set_yticks(np.arange(0.5, ncols, step=1)) self.__ax1.set_yticklabels([]) self.__ax1.grid(True) - self.__ax1.plot(*self.__current_cell, "rs", markersize=30) # start is a big red square - self.__ax1.text(*self.__current_cell, "Start", ha="center", va="center", color="white") - self.__ax1.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square - self.__ax1.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax1.plot( + *self.__current_cell, "rs", markersize=30 + ) # start is a big red square + self.__ax1.text( + *self.__current_cell, "Start", ha="center", va="center", color="white" + ) + self.__ax1.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax1.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) self.__ax1.imshow(self.maze, cmap="binary") self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() @@ -130,35 +163,43 @@ def reset(self, start_cell=(0, 0)): return self.__observe() def __draw(self): - """ Draw a line from the agents previous cell to its current cell. """ - self.__ax1.plot(*zip(*[self.__previous_cell, self.__current_cell]), "bo-") # previous cells are blue dots + """Draw a line from the agents previous cell to its current cell.""" + self.__ax1.plot( + *zip(*[self.__previous_cell, self.__current_cell]), "bo-" + ) # previous cells are blue dots self.__ax1.plot(*self.__current_cell, "ro") # current cell is a red dot self.__ax1.get_figure().canvas.draw() self.__ax1.get_figure().canvas.flush_events() def step(self, action): - """ Move the agent according to 'action' and return the new state, reward and game status. + """Move the agent according to 'action' and return the new state, reward and game status. - :param Action action: the agent will move in this direction - :return: state, reward, status + :param Action action: the agent will move in this direction + :return: state, reward, status """ reward = self.__execute(action) self.__total_reward += reward status = self.__status() state = self.__observe() - logging.debug("action: {:10s} | reward: {: .2f} | status: {}".format(Action(action).name, reward, status)) + logging.debug( + "action: {:10s} | reward: {: .2f} | status: {}".format( + Action(action).name, reward, status + ) + ) return state, reward, status def __execute(self, action): - """ Execute action and collect the reward or penalty. + """Execute action and collect the reward or penalty. 
- :param Action action: direction in which the agent will move - :return float: reward or penalty which results from the action + :param Action action: direction in which the agent will move + :return float: reward or penalty which results from the action """ possible_actions = self.__possible_actions(self.__current_cell) if not possible_actions: - reward = self.__minimum_reward - 1 # cannot move anywhere, force end of game + reward = ( + self.__minimum_reward - 1 + ) # cannot move anywhere, force end of game elif action in possible_actions: col, row = self.__current_cell if action == Action.MOVE_LEFT: @@ -179,21 +220,27 @@ def __execute(self, action): if self.__current_cell == self.__exit_cell: reward = Maze.reward_exit # maximum reward when reaching the exit cell elif self.__current_cell in self.__visited: - reward = Maze.penalty_visited # penalty when returning to a cell which was visited earlier + reward = ( + Maze.penalty_visited + ) # penalty when returning to a cell which was visited earlier else: - reward = Maze.penalty_move # penalty for a move which did not result in finding the exit cell + reward = ( + Maze.penalty_move + ) # penalty for a move which did not result in finding the exit cell self.__visited.add(self.__current_cell) else: - reward = Maze.penalty_impossible_move # penalty for trying to enter an occupied cell or move out of the maze + reward = ( + Maze.penalty_impossible_move + ) # penalty for trying to enter an occupied cell or move out of the maze return reward def __possible_actions(self, cell=None): - """ Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. + """Create a list with all possible actions from 'cell', avoiding the maze's edges and walls. - :param tuple cell: location of the agent (optional, else use current cell) - :return list: all possible actions + :param tuple cell: location of the agent (optional, else use current cell) + :return list: all possible actions """ if cell is None: col, row = self.__current_cell @@ -206,42 +253,48 @@ def __possible_actions(self, cell=None): nrows, ncols = self.maze.shape if row == 0 or (row > 0 and self.maze[row - 1, col] == Cell.OCCUPIED): possible_actions.remove(Action.MOVE_UP) - if row == nrows - 1 or (row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED): + if row == nrows - 1 or ( + row < nrows - 1 and self.maze[row + 1, col] == Cell.OCCUPIED + ): possible_actions.remove(Action.MOVE_DOWN) if col == 0 or (col > 0 and self.maze[row, col - 1] == Cell.OCCUPIED): possible_actions.remove(Action.MOVE_LEFT) - if col == ncols - 1 or (col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED): + if col == ncols - 1 or ( + col < ncols - 1 and self.maze[row, col + 1] == Cell.OCCUPIED + ): possible_actions.remove(Action.MOVE_RIGHT) return possible_actions def __status(self): - """ Return the game status. + """Return the game status. - :return Status: current game status (WIN, LOSE, PLAYING) + :return Status: current game status (WIN, LOSE, PLAYING) """ if self.__current_cell == self.__exit_cell: return Status.WIN - if self.__total_reward < self.__minimum_reward: # force end of game after too much loss + if ( + self.__total_reward < self.__minimum_reward + ): # force end of game after too much loss return Status.LOSE return Status.PLAYING def __observe(self): - """ Return the state of the maze - in this game the agents current location. + """Return the state of the maze - in this game the agents current location. 
- :return numpy.array [1][2]: agents current location + :return numpy.array [1][2]: agents current location """ return np.array([[*self.__current_cell]]) def play(self, model, start_cell=(0, 0)): - """ Play a single game, choosing the next move based a prediction from 'model'. + """Play a single game, choosing the next move based a prediction from 'model'. - :param class AbstractModel model: the prediction model to use - :param tuple start_cell: agents initial cell (optional, else upper left) - :return Status: WIN, LOSE + :param class AbstractModel model: the prediction model to use + :param tuple start_cell: agents initial cell (optional, else upper left) + :return Status: WIN, LOSE """ self.reset(start_cell) @@ -254,9 +307,11 @@ def play(self, model, start_cell=(0, 0)): return status def check_win_all(self, model): - """ Check if the model wins from all possible starting cells. """ + """Check if the model wins from all possible starting cells.""" previous = self.__render - self.__render = Render.NOTHING # avoid rendering anything during execution of the check games + self.__render = ( + Render.NOTHING + ) # avoid rendering anything during execution of the check games win = 0 lose = 0 @@ -269,14 +324,18 @@ def check_win_all(self, model): self.__render = previous # restore previous rendering setting - logging.info("won: {} | lost: {} | win rate: {:.5f}".format(win, lose, win / (win + lose))) + logging.info( + "won: {} | lost: {} | win rate: {:.5f}".format( + win, lose, win / (win + lose) + ) + ) result = True if lose == 0 else False return result, win / (win + lose) def render_q(self, model): - """ Render the recommended action(s) for each cell as provided by 'model'. + """Render the recommended action(s) for each cell as provided by 'model'. :param class AbstractModel model: the prediction model to use """ @@ -293,8 +352,12 @@ def clip(n): self.__ax2.set_yticks(np.arange(0.5, ncols, step=1)) self.__ax2.set_yticklabels([]) self.__ax2.grid(True) - self.__ax2.plot(*self.__exit_cell, "gs", markersize=30) # exit is a big green square - self.__ax2.text(*self.__exit_cell, "Exit", ha="center", va="center", color="white") + self.__ax2.plot( + *self.__exit_cell, "gs", markersize=30 + ) # exit is a big green square + self.__ax2.text( + *self.__exit_cell, "Exit", ha="center", va="center", color="white" + ) for cell in self.empty: q = model.q(cell) if model is not None else [0, 0, 0, 0] @@ -315,9 +378,18 @@ def clip(n): # color (from red to green) represents the certainty of the preferred action(s) maxv = 1 minv = -1 - color = clip((q[action] - minv) / (maxv - minv)) # normalize in [-1, 1] - - self.__ax2.arrow(*cell, dx, dy, color=(1 - color, color, 0), head_width=0.2, head_length=0.1) + color = clip( + (q[action] - minv) / (maxv - minv) + ) # normalize in [-1, 1] + + self.__ax2.arrow( + *cell, + dx, + dy, + color=(1 - color, color, 0), + head_width=0.2, + head_length=0.1, + ) self.__ax2.imshow(self.maze, cmap="binary") - self.__ax2.get_figure().canvas.draw() \ No newline at end of file + self.__ax2.get_figure().canvas.draw() diff --git a/src/envs/maze_env/server/maze_environment.py b/src/envs/maze_env/server/maze_environment.py index eaab821c..b9675bcf 100644 --- a/src/envs/maze_env/server/maze_environment.py +++ b/src/envs/maze_env/server/maze_environment.py @@ -42,7 +42,7 @@ def __init__( self, maze_array: np.ndarray, start_cell: Tuple[int, int] = (0, 0), - exit_cell: Optional[Tuple[int, int]] = (7,7), + exit_cell: Optional[Tuple[int, int]] = (7, 7), ): # Create underlying Maze instance (matches 
your working code) self.env = Maze(maze=maze_array, start_cell=start_cell, exit_cell=exit_cell) @@ -54,16 +54,26 @@ def __init__( def reset(self) -> MazeObservation: """Reset environment and return initial observation (MazeObservation).""" - observation = self.env.reset() # typically returns np.array([row, col]) or similar + observation = ( + self.env.reset() + ) # typically returns np.array([row, col]) or similar # initialize episode state self.state = MazeState(episode_id="episode_1", step_count=0, done=False) # build MazeObservation; convert numpy to list for JSON-serializable dataclass fields - pos_list = observation.tolist() if hasattr(observation, "tolist") else list(observation) + pos_list = ( + observation.tolist() + if hasattr(observation, "tolist") + else list(observation) + ) self.total_reward = 0 legal_actions = self._compute_legal_actions(pos_list[0]) - return MazeObservation(position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions) + return MazeObservation( + position=pos_list, + total_reward=self.total_reward, + legal_actions=legal_actions, + ) def step(self, action: MazeAction) -> MazeObservation: """ @@ -91,9 +101,9 @@ def step(self, action: MazeAction) -> MazeObservation: } # --- Reward settings --- - reward_exit = 10.0 # reward for reaching the exit cell - reward_move = 0.05 # reward for a move that didn't find the exit but is valid - penalty_visited = -0.25 # penalty for revisiting a cell + reward_exit = 10.0 # reward for reaching the exit cell + reward_move = 0.05 # reward for a move that didn't find the exit but is valid + penalty_visited = -0.25 # penalty for revisiting a cell penalty_impossible = -0.75 # penalty for invalid move (wall/outside) dr, dc = move_map.get(action.action, (0, 0)) @@ -153,10 +163,9 @@ def step(self, action: MazeAction) -> MazeObservation: position=pos_list, total_reward=self.total_reward, legal_actions=legal_actions, - done=done + done=done, ) - def state(self) -> Optional[MazeState]: """Return the current MazeState object.""" return self.state @@ -186,4 +195,4 @@ def _compute_legal_actions(self, pos: List[int]) -> List[int]: if col < ncols - 1 and self.env.maze[row, col + 1] == 0: actions.append(3) - return actions \ No newline at end of file + return actions diff --git a/src/envs/maze_env/server/mazearray.py b/src/envs/maze_env/server/mazearray.py index b87935e2..3cd7dbd6 100644 --- a/src/envs/maze_env/server/mazearray.py +++ b/src/envs/maze_env/server/mazearray.py @@ -1,13 +1,15 @@ import numpy as np # Maze -maze = np.array([ - [0, 1, 0, 0, 0, 0, 0, 0], - [0, 1, 0, 1, 0, 1, 0, 0], - [0, 0, 0, 1, 1, 0, 1, 0], - [0, 1, 0, 1, 0, 0, 0, 0], - [1, 0, 0, 1, 0, 1, 0, 0], - [0, 0, 0, 1, 0, 1, 1, 1], - [0, 1, 1, 0, 0, 0, 0, 0], - [0, 0, 0, 0, 0, 1, 0, 0] -]) \ No newline at end of file +maze = np.array( + [ + [0, 1, 0, 0, 0, 0, 0, 0], + [0, 1, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 1, 0, 1, 0], + [0, 1, 0, 1, 0, 0, 0, 0], + [1, 0, 0, 1, 0, 1, 0, 0], + [0, 0, 0, 1, 0, 1, 1, 1], + [0, 1, 1, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 1, 0, 0], + ] +) From 71e6f97d985e2d0b34cc4f204b2740beebecc43b Mon Sep 17 00:00:00 2001 From: Vivek Silimkhan Date: Sun, 2 Nov 2025 18:37:26 +0530 Subject: [PATCH 12/14] Fix hf deployment for maze env --- scripts/prepare_hf_deployment.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/prepare_hf_deployment.sh b/scripts/prepare_hf_deployment.sh index 23fd4779..381edffd 100755 --- a/scripts/prepare_hf_deployment.sh +++ b/scripts/prepare_hf_deployment.sh @@ -157,6 +157,7 @@ README_EOF "chat_env") 
ENV_CLASS="ChatEnv" ;;
         "atari_env") ENV_CLASS="AtariEnv" ;;
         "openspiel_env") ENV_CLASS="OpenSpielEnv" ;;
+        "maze_env") ENV_CLASS="MazeEnv" ;;
         *) ENV_CLASS="Env" ;;
     esac
 

From b05505065f454ae5136ea33246e2a85462b4c1a4 Mon Sep 17 00:00:00 2001
From: Vivek Silimkhan
Date: Sun, 2 Nov 2025 19:06:31 +0530
Subject: [PATCH 13/14] Fix hf deployment

---
 .github/workflows/deploy-hf-env.yml | 6 +++---
 src/envs/maze_env/server/Dockerfile | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/deploy-hf-env.yml b/.github/workflows/deploy-hf-env.yml
index 2f188ddf..12849a3f 100644
--- a/.github/workflows/deploy-hf-env.yml
+++ b/.github/workflows/deploy-hf-env.yml
@@ -64,7 +64,7 @@ jobs:
           if [ "${{ github.event.inputs.environment }}" = "all" ]; then
             echo "deploy_all=true" >> $GITHUB_OUTPUT
             echo "use_matrix=true" >> $GITHUB_OUTPUT
-            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT
+            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT
             echo "Manual trigger - deploying all environments with matrix"
           else
             echo "deploy_all=false" >> $GITHUB_OUTPUT
@@ -79,14 +79,14 @@
           if git diff --name-only HEAD~1 HEAD | grep -E '^src/core/' > /dev/null; then
             echo "deploy_all=true" >> $GITHUB_OUTPUT
             echo "use_matrix=true" >> $GITHUB_OUTPUT
-            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env" >> $GITHUB_OUTPUT
+            echo "environments=echo_env,coding_env,chat_env,atari_env,openspiel_env,maze_env" >> $GITHUB_OUTPUT
             echo "Core files changed - deploying all environments with matrix"
             exit 0
           fi

           # Check which specific environments changed
           changed_envs=()
-          for env in echo_env coding_env chat_env atari_env openspiel_env; do
+          for env in echo_env coding_env chat_env atari_env openspiel_env maze_env; do
             if git diff --name-only HEAD~1 HEAD | grep -E "^src/envs/$env/" > /dev/null; then
               changed_envs+=("$env")
             fi
diff --git a/src/envs/maze_env/server/Dockerfile b/src/envs/maze_env/server/Dockerfile
index 544d3e44..2d2e3d6f 100644
--- a/src/envs/maze_env/server/Dockerfile
+++ b/src/envs/maze_env/server/Dockerfile
@@ -29,7 +29,7 @@ COPY src/envs/maze_env/README.md /app/README.md

 # Extend Python path for OpenEnv (base image sets PYTHONPATH=/app/src)
 # We prepend Maze paths
-ENV PYTHONPATH=/repo:/repo/build/python:/app/src
+ENV PYTHONPATH=/app/src


 # Health check (curl provided by openenv-base)

From 3656fc69cb7ca648310f746695875db73662660d Mon Sep 17 00:00:00 2001
From: Vivek Silimkhan
Date: Sun, 2 Nov 2025 21:47:11 +0530
Subject: [PATCH 14/14] Update deploy to hf script

---
 scripts/deploy_to_hf.sh | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/scripts/deploy_to_hf.sh b/scripts/deploy_to_hf.sh
index 20256c9a..ef212ffb 100755
--- a/scripts/deploy_to_hf.sh
+++ b/scripts/deploy_to_hf.sh
@@ -290,6 +290,13 @@ DOCKERFILE_EOF
            echo "OpenSpiel builds can take 10-15 minutes due to C++ compilation"
            return  # Skip the common parts since OpenSpiel has its own complete Dockerfile
            ;;
+        "maze_env")
+            cat >> "$CURRENT_STAGING_DIR/Dockerfile" << 'DOCKERFILE_EOF'
+# Install additional dependencies for MazeEnvironment
+RUN pip install --no-cache-dir numpy
+DOCKERFILE_EOF
+            # Maze env requires numpy
+            ;;
     esac

     # Add common parts
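
The Maze class added in this series ends an episode as lost once the accumulated reward drops below `-0.5 * maze.size`. A minimal sketch of that threshold arithmetic for the 8x8 grid in mazearray.py, assuming only the reward constants shown in the patch (the variable names below are just for illustration):

```python
# Constants mirrored from Maze in src/envs/maze_env/server/maze.py (this patch series).
reward_exit = 10.0
penalty_move = -0.05
penalty_visited = -0.25
penalty_impossible_move = -0.75

maze_size = 8 * 8                   # the 8x8 grid defined in mazearray.py
minimum_reward = -0.5 * maze_size   # episode is lost once total reward drops below this

# Worst case: the agent bumps into a wall on every step.
steps_to_lose = int(minimum_reward // penalty_impossible_move) + 1

print(minimum_reward)  # -32.0
print(steps_to_lose)   # 43 impossible moves are enough to force the LOSE status
```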