[BUG] Device inconsistency in MQF2DistributionLoss raising RuntimeError: Expected all tensors to be on the same device (#1916)

fnhirwa · web-flow · commit f87bb56ca21d · 2025-08-12T14:42:22.000+02:00
Fixes #1182. In the current implementation, `picnn` is initialized during class construction, and the device it defaults to is not updated when the model is moved to another device. * Added a device-movement method `to()` to ensure that `picnn` is moved along with the loss function. * Added automatic device sync in `map_x_to_distribution` to ensure that `picnn` is on the same device as the input tensor. Also added tests that mock the accelerators at a high level to check device synchronization within this class.
diff --git a/pytorch_forecasting/metrics/distributions.py b/pytorch_forecasting/metrics/distributions.py
@@ -394,6 +394,7 @@ def __init__(
         self.prediction_length = prediction_length
         self.es_num_samples = es_num_samples
         self.beta = beta
+        self._transformation = None
 
         # define picnn
         convexnet = PICNN(
@@ -421,11 +422,19 @@ def __init__(
 
         self.picnn = SequentialNet(networks)
 
+    def to(self, *args, **kwargs):
+        """Move the loss and its ``picnn``; all arguments are forwarded to ``.to()``."""
+        self.picnn = self.picnn.to(*args, **kwargs)
+        return super().to(*args, **kwargs)
+
     @property
     def is_energy_score(self) -> bool:
         return self.es_num_samples is not None
 
     def map_x_to_distribution(self, x: torch.Tensor) -> distributions.Distribution:
+        if hasattr(self.picnn, "to"):
+            self.picnn = self.picnn.to(x.device)
+
         distr = self.distribution_class(
             picnn=self.picnn,
             hidden_state=x[..., :-2],
diff --git a/tests/test_metrics.py b/tests/test_metrics.py
@@ -1,9 +1,13 @@
+from functools import wraps
 import itertools
+from unittest.mock import MagicMock, PropertyMock, patch
 
 import pytest
 import torch
 from torch.nn.utils import rnn
 
+from pytorch_forecasting import TemporalFusionTransformer, TimeSeriesDataSet
+from pytorch_forecasting.data import NaNLabelEncoder
 from pytorch_forecasting.data.encoders import TorchNormalizer
 from pytorch_forecasting.metrics import (
     MAE,
@@ -19,6 +23,7 @@
     AggregationMetric,
     CompositeMetric,
 )
+from pytorch_forecasting.utils._dependencies import _get_installed_packages
 
 
 def test_composite_metric():
@@ -306,6 +311,213 @@ def test_ImplicitQuantileNetworkDistributionLoss():
     assert point_prediction.ndim == loss.to_prediction(pred, n_samples=100).ndim
 
 
+@pytest.fixture
+def sample_dataset():
+    """Fixture building a small, reproducible TimeSeriesDataSet for testing."""
+    import numpy as np
+    import pandas as pd
+
+    rows = 15
+    rng = np.random.default_rng(42)  # fixed seed keeps the fixture reproducible
+    df = pd.DataFrame(
+        {
+            "time": pd.date_range("2025-01-01", periods=rows, freq="h"),
+            "label": ["test"] * rows,
+            "var1": rng.standard_normal(rows).cumsum(),
+            "var2": rng.standard_normal(rows).cumsum(),
+        }
+    )
+    df["past_var1"] = df["var1"].shift(-1)
+    df = df.dropna(subset=["past_var1"]).reset_index(drop=True)
+    df["time_idx"] = range(len(df))
+    return TimeSeriesDataSet(
+        df,
+        time_idx="time_idx",
+        target="past_var1",
+        group_ids=["label"],
+        static_categoricals=["label"],
+        time_varying_known_reals=["var1", "var2"],
+        time_varying_unknown_reals=["past_var1"],
+        max_encoder_length=5,
+        max_prediction_length=2,
+        categorical_encoders={"label": NaNLabelEncoder(add_nan=False)},
+    )
+
+
+@pytest.fixture(params=["cuda", "cpu"])
+def mock_device(request):
+    """Yield "cuda" with torch's CUDA and device APIs patched, or plain "cpu"."""
+    # torch.device handed to the patches below ("cuda:0" for cuda, else "cpu")
+    device_str = f"{request.param}:0" if request.param == "cuda" else "cpu"
+    mock_device = torch.device(device_str)
+
+    orig_tensor = torch.tensor
+    orig_empty = torch.empty
+
+    @wraps(orig_tensor)
+    def mock_tensor(data, *args, **kwargs):
+        # Allocate on CPU; the device attribute is overwritten with the mock below
+        kwargs["device"] = "cpu"
+        tensor = orig_tensor(data, *args, **kwargs)
+        tensor.device = mock_device
+        return tensor
+
+    @wraps(orig_empty)
+    def mock_empty(*args, **kwargs):
+        kwargs["device"] = "cpu"
+        tensor = orig_empty(*args, **kwargs)
+        tensor.device = mock_device
+        return tensor
+
+    if request.param == "cuda":
+        mock_properties = type(
+            "CudaDeviceProperties",
+            (),
+            {
+                "major": 8,
+                "minor": 0,
+                "name": "Mocked CUDA Device",
+                "total_memory": 8 * 1024 * 1024 * 1024,
+            },
+        )()
+
+        with (
+            patch("torch.cuda.is_available", return_value=True),
+            patch("torch.cuda._lazy_init", return_value=None),
+            patch("torch.cuda.device_count", return_value=1),
+            patch("torch.cuda.get_device_properties", return_value=mock_properties),
+            patch("torch.cuda.get_device_capability", return_value=(8, 0)),
+            patch("torch.cuda.set_device", return_value=None),
+            patch("torch.empty", new=mock_empty),
+            patch("torch.tensor", new=mock_tensor),
+            patch(
+                "torch.Tensor.to",
+                new=lambda self, device, *args, **kwargs: self.clone()
+                if isinstance(device, (str, torch.device))
+                and str(device).startswith("cuda")
+                else self,
+            ),
+            patch(
+                "torch.Tensor.device",
+                new_callable=PropertyMock,
+                return_value=mock_device,
+            ),
+            patch("torch.Tensor.cuda", new=lambda self, *args, **kwargs: self.clone()),
+            patch("torch.nn.Module.cuda", new=lambda self, *args, **kwargs: self),
+            patch("torch.nn.Module.to", new=lambda self, device, *args, **kwargs: self),
+        ):
+            yield "cuda"
+    else:
+        yield "cpu"
+
+
+@pytest.mark.skipif(
+    "cpflows" not in _get_installed_packages(),
+    reason="cpflows is not installed, skipping MQF2DistributionLoss tests",
+)
+def test_MQF2DistributionLoss_device_handling(mock_device):
+    """Check ``picnn`` follows the loss through ``cuda()``/``cpu()``/``to()``."""
+    from pytorch_forecasting.metrics import MQF2DistributionLoss
+
+    def picnn_device_type(metric):
+        return next(metric.picnn.parameters()).device.type
+
+    loss = MQF2DistributionLoss(prediction_length=2)
+    assert picnn_device_type(loss) == mock_device
+    mover = {"cuda": loss.cuda, "cpu": loss.cpu}.get(mock_device)
+    if mover is not None:
+        mover()
+        assert picnn_device_type(loss) == mock_device
+    loss.to(mock_device)
+    assert picnn_device_type(loss) == mock_device
+
+
+# Accelerators to parametrize over; "cuda" is skipped when CUDA is unavailable.
+_cuda_unavailable = pytest.mark.skipif(
+    not torch.cuda.is_available(),
+    reason="CUDA is not available",
+)
+device_params = [
+    pytest.param("cuda", marks=_cuda_unavailable),
+    "cpu",
+]
+
+
+@pytest.mark.skipif(
+    "cpflows" not in _get_installed_packages(),
+    reason="cpflows is not installed, skipping MQF2DistributionLoss tests",
+)
+@pytest.mark.parametrize("device", device_params)
+def test_MQF2DistributionLoss_full_workflow(sample_dataset, device):
+    """
+    Test the complete workflow from training to prediction with MQF2DistributionLoss.
+
+    Fits for one epoch, predicts in raw mode and plots one sample; fails if the
+    outputs leave the requested device or plotting raises a device mismatch.
+    """
+    import lightning.pytorch as pl
+
+    from pytorch_forecasting.metrics import MQF2DistributionLoss
+
+    model = TemporalFusionTransformer.from_dataset(
+        sample_dataset, loss=MQF2DistributionLoss(prediction_length=2)
+    )
+
+    trainer = pl.Trainer(
+        max_epochs=1,
+        accelerator=device,
+        devices="auto",
+        gradient_clip_val=0.1,
+        limit_train_batches=30,
+        limit_val_batches=3,
+    )
+    dataloader = sample_dataset.to_dataloader(train=True, batch_size=4, num_workers=0)
+
+    trainer.fit(model, dataloader)
+
+    raw_predictions = model.predict(
+        dataloader,
+        mode="raw",
+        return_x=True,
+        trainer_kwargs=dict(accelerator=device, devices="auto", logger=False),
+    )
+    # Verify predictions are on correct device
+    pred_device = raw_predictions.output["prediction"].device.type
+    target_device = raw_predictions.x["encoder_target"].device.type
+    assert pred_device == device
+    assert target_device == device
+    try:
+        model.plot_prediction(raw_predictions.x, raw_predictions.output, idx=0)
+    except RuntimeError as e:
+        # Report device problems explicitly; re-raise anything else untouched.
+        if "device" in str(e).lower() or "expected" in str(e).lower():
+            pytest.fail(f"Device mismatch error during plotting: {e}")
+        raise
+
+
+@pytest.mark.skipif(
+    "cpflows" not in _get_installed_packages(),
+    reason="cpflows is not installed, skipping MQF2DistributionLoss tests",
+)
+def test_MQF2DistributionLoss_device_synchronization(mock_device, sample_dataset):
+    """Verify ``map_x_to_distribution`` leaves ``picnn`` on the input's device."""
+    from pytorch_forecasting.metrics import MQF2DistributionLoss
+
+    mqf2_loss = MQF2DistributionLoss(prediction_length=2)
+    model = TemporalFusionTransformer.from_dataset(sample_dataset, loss=mqf2_loss)
+    network_output = torch.randn(4, 2, 8)
+
+    if mock_device == "cuda":
+        network_output = network_output.cuda()
+    elif mock_device == "cpu":
+        network_output = network_output.cpu()
+    else:
+        return
+
+    model.loss.map_x_to_distribution(network_output)
+    assert next(model.loss.picnn.parameters()).device.type == mock_device
+
+
 def test_CrossEntropyLoss():
     batch_size = 3
     n_timesteps = 5