
Commit 0182d95

quic-swatia and Swati Allabadi authored
[QEff Finetune] Correction in data type of loss (#579)
In case of DDP, if the size of the dataset (train or validation) is smaller than the DDP degree, the loss value for the padded samples was coming out as a plain Python float instead of a tensor. This change handles that case. --------- Signed-off-by: Swati Allabadi <[email protected]> Co-authored-by: Swati Allabadi <[email protected]>
1 parent 4d2a4d8 commit 0182d95
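Why the dtype matters: under DDP the per-rank loss is typically aggregated across processes with torch.distributed.all_reduce, which only accepts tensors, so a rank whose samples were all padding must still contribute a tensor zero on the correct device. A minimal sketch of that contract, assuming a standard torch.distributed setup (the exact aggregation point inside train_utils.py is an assumption, not the commit's code):

import torch
import torch.distributed as dist

device = torch.device("cpu")  # "cuda" / QAIC device in real runs

# Old behavior on a rank that saw only dummy (padded) samples:
# epoch_loss = 0.0  -- a plain Python float with no device, which
# dist.all_reduce(epoch_loss) would reject (collectives expect a Tensor).

# The fix keeps the zero tensor-valued in every branch:
epoch_loss = torch.tensor(0.0).to(device)
if dist.is_initialized():  # guarded so this sketch also runs single-process
    dist.all_reduce(epoch_loss, op=dist.ReduceOp.SUM)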

File tree

2 files changed (+13 / -13 lines)


QEfficient/finetune/utils/train_utils.py

Lines changed: 12 additions & 13 deletions
@@ -286,18 +286,15 @@ def train(
         epoch_end_time = time.perf_counter() - epoch_start_time
         epoch_times.append(epoch_end_time)
 
-        if train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch:
-            train_epoch_loss = (
-                0.0
-                if total_loss == 0.0
-                else total_loss / (step - intermediate_step - (num_dummy_samples / train_config.train_batch_size))
-            )
-        else:
-            train_epoch_loss = (
-                0.0
-                if total_loss == 0.0
-                else total_loss / (step + 1 - (num_dummy_samples / train_config.train_batch_size))
-            )
+        # corrects the step count if fine-tuning is resumed through saved checkpoint
+        step_correction = (
+            -intermediate_step
+            if (train_config.use_peft and train_config.from_peft_checkpoint and epoch == intermediate_epoch)
+            else 1
+        )
+
+        denominator = step + step_correction - (num_dummy_samples / train_config.train_batch_size)
+        train_epoch_loss = total_loss / denominator if total_loss != 0.0 else torch.tensor(0.0).to(device)
 
         if train_config.task_mode == Task_Mode.SEQ_CLASSIFICATION:
             train_epoch_metric = acc_helper.compute()
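To see what the unified denominator computes, here is a worked example with invented numbers (step is the 0-indexed batch counter; the dummy samples are the DDP padding described in the commit message):

# Illustrative values only -- not taken from the commit.
step, intermediate_step = 9, 4              # 10 batches seen; resumed at step 4
num_dummy_samples, train_batch_size = 2, 1  # 2 padded samples, batch size 1

# Fresh epoch (step_correction = 1): average over all real steps.
fresh = step + 1 - num_dummy_samples / train_batch_size                    # 9 + 1 - 2 = 8.0

# Epoch resumed from a PEFT checkpoint (step_correction = -intermediate_step):
# average only over the steps actually run after the resume point.
resumed = step - intermediate_step - num_dummy_samples / train_batch_size  # 9 - 4 - 2 = 3.0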
@@ -463,7 +460,9 @@ def evaluation(model, train_config, eval_dataloader, device):
 
     # Compute average loss and metric
     eval_epoch_loss = (
-        0.0 if eval_loss == 0.0 else eval_loss / (step + 1 - num_dummy_samples / train_config.val_batch_size)
+        torch.tensor(0.0).to(device)
+        if eval_loss == 0.0
+        else eval_loss / (step + 1 - num_dummy_samples / train_config.val_batch_size)
     )
     if train_config.task_mode == Task_Mode.SEQ_CLASSIFICATION:
         eval_epoch_metric = acc_helper.compute()
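The evaluation path needs the same fix because downstream consumers assume a tensor; for example, perplexity-style post-processing (whether train_utils.py derives perplexity exactly this way is an assumption) works on a tensor zero but fails on a float:

import torch

eval_epoch_loss = torch.tensor(0.0)  # the fixed, tensor-valued zero
print(torch.exp(eval_epoch_loss))    # tensor(1.) -- perplexity of a zero loss

# With the old plain 0.0, both of these would fail:
# torch.exp(0.0)   # TypeError: exp() expects a Tensor, not float
# (0.0).item()     # AttributeError: 'float' object has no attribute 'item'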

tests/transformers/sampler/test_sampler.py

Lines changed: 1 addition & 0 deletions
@@ -233,6 +233,7 @@ def test_greedy_sampling(
 
 
 @pytest.mark.on_qaic
+@pytest.mark.skip
 @pytest.mark.parametrize(
     "model, prompts, prefill_seq_len, ctx_len, generation_len, full_batch_size, spec_length",
     random_sampling_configs,
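A small aside on the test change: a bare @pytest.mark.skip is valid, but pytest also accepts a reason string that surfaces in test reports and keeps bulk-skipped suites auditable. A sketch with an invented reason (not from this commit):

import pytest

@pytest.mark.skip(reason="random sampling on QAIC temporarily disabled")
def test_random_sampling():
    ...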
