Skip to content

Commit d6d0b13

Browse files
authored
[Feature] RayLLMCollector.sync_iter (#3015)
1 parent 7ee5248 commit d6d0b13

File tree

6 files changed

+48
-4
lines changed

6 files changed

+48
-4
lines changed

sota-implementations/grpo/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,11 @@ for step in range(total_steps):
107107

108108
Key differences:
109109
1. **Data Collection**:
110-
- Sync: Data collection and optimization happen sequentially
110+
- Sync: Data collection and optimization happen sequentially.
111+
112+
*Note*: The `train.sync_iter=False` argument can be used to collect data whilst optimizing. In this context, the
113+
maximum policy age will be 1. If `train.sync_iter=True` (default), the maximum policy age is `0`.
114+
111115
- Async: Data collection runs in background while optimization happens
112116

113117
2. **Buffer Size**:

sota-implementations/grpo/config/mode/async.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,5 @@ train:
99
buffer_size: 128
1010
# Update policy weights every N steps - can be set to any positive integer in async mode
1111
weight_update_frequency: 10
12+
# Sync the collector between iterations. Deactivated when async.
13+
sync_iter:

sota-implementations/grpo/config/mode/sync.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,6 @@ train:
99
buffer_size:
1010
# Update policy weights every N steps - must be left empty in sync mode
1111
weight_update_frequency:
12+
# Sync the collector between iterations. Not syncing means that the collector will collect the next batch of data in between yielding.
13+
# When sync_iter=True, the maximum policy age is 0. When sync_iter=False, the maximum policy age is 1.
14+
sync_iter: true

sota-implementations/grpo/grpo-async.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,8 @@ def main(cfg):
465465
)
466466
torchrl_logger.info(f"Starting collector with {collector_config=}")
467467

468+
if cfg.train.sync_iter is not None:
469+
raise ValueError("sync_iter is not supported in async mode.")
468470
collector = RayLLMCollector(
469471
env=partial(make_env, cfg, devices=device_config["ref_model_devices"]),
470472
policy=inference_policy,

sota-implementations/grpo/grpo-sync.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -483,6 +483,7 @@ def main(cfg):
483483
# The ref model will be instantiated within the collector, so we only need to allocate the number of devices for the inference model
484484
cfg.ref_model.num_devices
485485
)
486+
collector_config["num_cpus"] = cfg.ray.collector_config.get("num_cpus", 1)
486487
torchrl_logger.info(f"Starting collector with {collector_config=}")
487488

488489
collector = RayLLMCollector(
@@ -495,6 +496,7 @@ def main(cfg):
495496
weight_updater=None, # We'll create this after getting the remote LLM
496497
track_policy_version=True,
497498
remote_config=collector_config,
499+
sync_iter=cfg.train.sync_iter,
498500
verbose=True,
499501
)
500502
# Ensure collector is initialized by calling a method that will block until ready

torchrl/collectors/llm/ray_collector.py

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
# LICENSE file in the root directory of this source tree.
55
from __future__ import annotations
66

7+
import copy
8+
79
import warnings
810
from typing import Any, Callable, Iterator
911

@@ -55,6 +57,19 @@ class RayLLMCollector(LLMCollector):
5557
or its subclass, responsible for updating the policy weights on remote inference workers.
5658
ray_init_config (dict[str, Any], optional): keyword arguments to pass to ray.init().
5759
remote_config (dict[str, Any], optional): keyword arguments to pass to cls.as_remote().
60+
sync_iter (bool, optional): if `True`, items yielded by the collector will be synced to the local process.
61+
If `False`, the collector will collect the next batch of data in between yielding.
62+
This has no effect when data is collected through the :meth:`start` method.
63+
For example:
64+
65+
>>> collector = RayLLMCollector(..., sync_iter=True)
66+
>>> for data in collector: # blocking
67+
... # expensive operation - collector is idle
68+
>>> collector = RayLLMCollector(..., sync_iter=False)
69+
>>> for data in collector: # non-blocking
70+
... # expensive operation - collector is collecting data
71+
72+
Defaults to `True`.
5873
verbose (bool, optional): if ``True``, the collector will print progress information.
5974
Defaults to `False`.
6075
"""
@@ -81,6 +96,7 @@ def __init__(
8196
ray_init_config: dict[str, Any] | None = None,
8297
remote_config: dict[str, Any] | None = None,
8398
track_policy_version: bool | PolicyVersion = False,
99+
sync_iter: bool = True,
84100
verbose: bool = False,
85101
) -> None:
86102
if not _has_ray:
@@ -93,8 +109,11 @@ def __init__(
93109

94110
ray_init_config = DEFAULT_RAY_INIT_CONFIG
95111
ray.init(**ray_init_config)
96-
112+
if not sync_iter:
113+
remote_config = copy.copy(remote_config)
114+
remote_config.setdefault("max_concurrency", 2)
97115
remote_cls = LLMCollector.as_remote(remote_config).remote
116+
self.sync_iter = sync_iter
98117
self._collector = remote_cls(
99118
env=env,
100119
policy=policy,
@@ -113,19 +132,31 @@ def __init__(
113132
verbose=verbose,
114133
)
115134

135+
def _next_remote(self) -> None:
136+
return self._collector.next.remote()
137+
116138
def next(self) -> None:
117139
"""Get the next batch of data from the collector.
118140
119141
Returns:
120142
None as the data is written directly to the replay buffer.
121143
"""
122-
return ray.get(self._collector.next.remote())
144+
return ray.get(self._next_remote())
123145

124146
def __iter__(self) -> Iterator[None]:
125147
"""Returns an iterator that yields None as the collector writes directly to the replay buffer."""
148+
if not self.sync_iter:
149+
future = self._next_remote()
150+
else:
151+
future = None
126152
while True:
127153
try:
128-
yield self.next()
154+
if self.sync_iter:
155+
yield self.next()
156+
else:
157+
result = ray.get(future)
158+
future = self._next_remote()
159+
yield result
129160
except StopIteration:
130161
break
131162

0 commit comments

Comments
 (0)