diff --git a/recipes/dev/grpo_full_finetune_distributed.py b/recipes/dev/grpo_full_finetune_distributed.py
index 145f5cd661..264110d132 100644
--- a/recipes/dev/grpo_full_finetune_distributed.py
+++ b/recipes/dev/grpo_full_finetune_distributed.py
@@ -646,7 +646,9 @@ def generate_trajectory(
         # Do some reward modelingggggggg
         # responses :: [B x G, L]
         responses = responses.reshape(batch_size, grpo_size, -1)  # [B, G, L]
-        rewards, successes = batched_rewards(self._tokenizer, responses, answers)
+        rewards, successes = batched_rewards(
+            self._tokenizer, responses, answers, device=self._device
+        )
         rewards = rewards.to(self._device)  # [B, G]
         successes = successes.to(self._device)  # [B, G]
 
diff --git a/torchtune/dev/rl/rewards.py b/torchtune/dev/rl/rewards.py
index 95c45ee9b0..69d5bb7836 100644
--- a/torchtune/dev/rl/rewards.py
+++ b/torchtune/dev/rl/rewards.py
@@ -296,21 +296,13 @@ def batched_rewards(
     metadata = {"func_names": [f.__name__ for f in reward_funcs]}
 
     for b in range(batch_size):
-
         for g in range(grpo_size):
-
-            answer = answers[b][g]
-
+            answer = answers[b]
             text_completion = tokenizer.decode(completions[b, g].tolist())
-
             cot, potential_answer = extract_tags(f"{text_completion}")
-
             for rw_idx, reward_func in enumerate(reward_funcs):
-
                 reward, success = reward_func(cot, answer, potential_answer)
-
                 rewards_tensor[b, g, rw_idx] += reward
-
                 successes_tensor[b, g, rw_idx] += success
 
     return rewards_tensor, successes_tensor, metadata
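
The patch makes two changes: the call site now passes `device=self._device` into `batched_rewards`, and the reward loop indexes `answers[b]` (one reference answer per prompt, shared by all `G` completions in the group) instead of `answers[b][g]`. Below is a minimal standalone sketch of the resulting loop shape under those assumptions. `DummyTokenizer`, `toy_reward`, and `batched_rewards_sketch` are hypothetical stand-ins for illustration, not the torchtune implementation, and the sketch skips the `extract_tags` step by passing the decoded text straight to the reward function.

```python
from typing import List, Tuple

import torch


class DummyTokenizer:
    """Hypothetical stand-in for the recipe's model tokenizer."""

    def decode(self, token_ids: List[int]) -> str:
        return " ".join(str(t) for t in token_ids)


def toy_reward(cot: str, answer: str, potential_answer: str) -> Tuple[float, float]:
    """Hypothetical reward: 1.0 if the decoded text contains the answer."""
    hit = float(answer in potential_answer)
    return hit, hit


def batched_rewards_sketch(
    tokenizer: DummyTokenizer,
    completions: torch.Tensor,  # [B, G, L] token ids
    answers: List[str],         # one answer per prompt, len(answers) == B
    device: torch.device,
) -> Tuple[torch.Tensor, torch.Tensor, dict]:
    reward_funcs = [toy_reward]
    batch_size, grpo_size, _ = completions.shape
    # Allocate on the caller-supplied device up front, which is what the new
    # device= argument enables.
    rewards = torch.zeros(batch_size, grpo_size, len(reward_funcs), device=device)
    successes = torch.zeros_like(rewards)
    metadata = {"func_names": [f.__name__ for f in reward_funcs]}

    for b in range(batch_size):
        for g in range(grpo_size):
            # Per-prompt answer, shared across the G completions of the group.
            answer = answers[b]
            text = tokenizer.decode(completions[b, g].tolist())
            for i, func in enumerate(reward_funcs):
                reward, success = func(text, answer, text)
                rewards[b, g, i] += reward
                successes[b, g, i] += success

    return rewards, successes, metadata


if __name__ == "__main__":
    completions = torch.randint(0, 100, (2, 4, 8))  # B=2, G=4, L=8
    rewards, successes, meta = batched_rewards_sketch(
        DummyTokenizer(), completions, answers=["42", "7"], device=torch.device("cpu")
    )
    print(rewards.shape, meta)  # torch.Size([2, 4, 1]) {'func_names': ['toy_reward']}
```

The point of the indexing change is that `answers` is a per-prompt list of length `B`, so the same reference answer is scored against every completion in the group; the device argument lets the reward and success tensors live on the training device from the start instead of being moved after the fact.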