roboflow · isaaccorley · Apr 24, 2025 · Apr 24, 2025
@@ -12,6 +12,7 @@
 
 class ModelConfig(BaseModel):
     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"]
+    num_channels: int = 3
     out_feature_indexes: List[int]
     dec_layers: int = 3
     two_stage: bool = True
@@ -33,6 +34,7 @@ class ModelConfig(BaseModel):
 
 class RFDETRBaseConfig(ModelConfig):
     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_small"
+    num_channels: int = 3
     hidden_dim: int = 256
     sa_nheads: int = 8
     ca_nheads: int = 16
@@ -45,6 +47,7 @@ class RFDETRBaseConfig(ModelConfig):
 
 class RFDETRLargeConfig(RFDETRBaseConfig):
     encoder: Literal["dinov2_windowed_small", "dinov2_windowed_base"] = "dinov2_windowed_base"
+    num_channels: int = 3
     hidden_dim: int = 384
     sa_nheads: int = 12
     ca_nheads: int = 24

@@ -7,6 +7,7 @@
 
 import json
 import os
+from itertools import cycle
 from collections import defaultdict
 from logging import getLogger
 from typing import Union, List
@@ -33,6 +34,11 @@ def __init__(self, **kwargs):
         self.model = self.get_model(self.model_config)
         self.callbacks = defaultdict(list)
 
+        # repeat means and stds for non-rgb images
+        if self.model_config.num_channels != 3:
+            self.means = [val for _, val in zip(range(self.model_config.num_channels), cycle(self.means))]
+            self.stds = [val for _, val in zip(range(self.model_config.num_channels), cycle(self.stds))]
+
     def maybe_download_pretrain_weights(self):
         download_pretrain_weights(self.model_config.pretrain_weights)
 
@@ -177,9 +183,9 @@ def predict(
                     "Image has pixel values above 1. Please ensure the image is "
                     "normalized (scaled to [0, 1])."
                 )
-            if img.shape[0] != 3:
+            if img.shape[0] != self.model_config.num_channels:
                 raise ValueError(
-                    f"Invalid image shape. Expected 3 channels (RGB), but got "
+                    f"Invalid image shape. Expected {self.model_config.num_channels} channels, but got "
                     f"{img.shape[0]} channels."
                 )
             img_tensor = img

@@ -33,8 +33,10 @@
 
 import numpy as np
 import torch
+import torch.nn as nn
 from peft import LoraConfig, get_peft_model
 from torch.utils.data import DataLoader, DistributedSampler
+from timm.models._manipulate import adapt_input_conv
 
 import rfdetr.util.misc as utils
 from rfdetr.datasets import build_dataset, get_coco_api_from_dataset
@@ -70,6 +72,16 @@ def download_pretrain_weights(pretrain_weights: str, redownload=False):
                 pretrain_weights,
             )
 
+def modify_input_conv(conv: nn.Conv2d, num_channels: int) -> nn.Conv2d:
+    """Modify the pretrained input conv layer to accept a different number of input channels."""
+    new_conv = copy.deepcopy(conv)
+    new_conv.in_channels = num_channels
+    new_weight = adapt_input_conv(in_chans=num_channels, conv_weight=conv.weight)
+    new_conv.weight = torch.nn.Parameter(new_weight)
+    new_conv.weight.requires_grad = conv.weight.requires_grad
+    return new_conv
+
+
 class Model:
     def __init__(self, **kwargs):
         args = populate_args(**kwargs)
@@ -126,6 +138,12 @@ def __init__(self, **kwargs):
 
             self.model.load_state_dict(checkpoint['model'], strict=False)
 
+        # Modify input conv if needed
+        if args.num_channels != 3:
+            conv = self.model.backbone[0].encoder.encoder.embeddings.patch_embeddings.projection
+            self.model.backbone[0].encoder.encoder.embeddings.patch_embeddings.projection = modify_input_conv(conv, args.num_channels)
+            self.model.backbone[0].encoder.encoder.embeddings.patch_embeddings.num_channels = args.num_channels
+
         if args.backbone_lora:
             print("Applying LORA to backbone")
             lora_config = LoraConfig(
@@ -814,6 +832,7 @@ def get_args_parser():
 def populate_args(
     # Basic training parameters
     num_classes=2,
+    num_channels=3,
     grad_accum_steps=1,
     amp=False,
     lr=1e-4,
@@ -936,6 +955,7 @@ def populate_args(
 ):
     args = argparse.Namespace(
         num_classes=num_classes,
+        num_channels=num_channels,
         grad_accum_steps=grad_accum_steps,
         amp=amp,
         lr=lr,

@@ -0,0 +1,11 @@
+import torch
+import pytest
+from rfdetr import RFDETRBase, RFDETRLarge
+
+
+@pytest.mark.parametrize("model_class", [RFDETRBase, RFDETRLarge])
+@pytest.mark.parametrize("channels", [1, 4])
+def test_multispectral_support(model_class, channels: int) -> None:
+    model = model_class(num_channels=channels, device="cpu")
+    image = torch.zeros(channels, 224, 224).to("cpu")
+    model.predict(image, threshold=0.5)