pyg-team · rajveer43 · Sep 3, 2025 · Sep 3, 2025
@@ -910,6 +910,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Fixed mini-batching with empty lists as attributes ([#4293](https://github.com/pyg-team/pytorch_geometric/pull/4293))
 - Fixed a bug in which `GCNConv` could not be combined with `to_hetero` on heterogeneous graphs with one node type ([#4279](https://github.com/pyg-team/pytorch_geometric/pull/4279))
 - Added a scheduler to the Graph Sage OGBN Example [#9877](https://github.com/pyg-team/pytorch_geometric/pull/9877)
+- Fixed `cluster_loss` calculation in `DMoNPooling` ([#10149](https://github.com/pyg-team/pytorch_geometric/pull/10429))
 
 ### Removed
 

@@ -114,14 +114,17 @@ def forward(
             mask = torch.ones(batch_size, num_nodes, dtype=torch.bool,
                               device=x.device)
 
-        mask = mask.view(batch_size, num_nodes, 1).to(x.dtype)
+        # Keep a 2D boolean mask for counting valid nodes per-graph, and a 3D
+        # float mask for elementwise masking of tensors.
+        node_mask_2d = mask.view(batch_size, num_nodes)
+        mask = node_mask_2d.unsqueeze(-1).to(x.dtype)  # B x N x 1
         x, s = x * mask, s * mask
 
         out = F.selu(torch.matmul(s.transpose(1, 2), x))
         out_adj = torch.matmul(torch.matmul(s.transpose(1, 2), adj), s)
 
         # Spectral loss:
-        degrees = torch.einsum('ijk->ij', adj)  # B X N
+        degrees = torch.einsum('ijk->ij', adj)  # B x N
         degrees = degrees.unsqueeze(-1) * mask  # B x N x 1
         degrees_t = degrees.transpose(1, 2)  # B x 1 x N
 
@@ -144,10 +147,20 @@ def forward(
             i_s / torch.norm(i_s), dim=(-1, -2))
         ortho_loss = ortho_loss.mean()
 
-        # Cluster loss:
-        cluster_size = torch.einsum('ijk->ik', s)  # B x C
-        cluster_loss = torch.norm(input=cluster_size, dim=1)
-        cluster_loss = cluster_loss / mask.sum(dim=1) * torch.norm(i_s) - 1
+        # Cluster loss (FIX):
+        # The original implementation divided by `mask.sum(dim=1)` where
+        # `mask` had shape B x N x 1, producing a B x 1 x 1 tensor and thus a
+        # wrong broadcast when normalizing by graph size. We instead compute a
+        # per-graph node count vector of shape B and use it for normalization.
+        cluster_size = torch.einsum('ijk->ik', s)  # B x C  (sum over nodes)
+        cluster_norm = torch.norm(input=cluster_size, dim=1)  # B
+
+        # Number of valid nodes per graph, shape: B
+        n_per_graph = node_mask_2d.sum(dim=1).to(cluster_norm.dtype)  # B
+        n_per_graph = torch.clamp(n_per_graph, min=1.0)  # avoid div-by-zero
+
+        # L_c = (sqrt(C)/n) * || sum_i C_i^T ||_F - 1
+        cluster_loss = (cluster_norm / n_per_graph) * torch.norm(i_s) - 1  # B
         cluster_loss = cluster_loss.mean()
 
         # Fix and normalize coarsened adjacency matrix: