
[BUG] Why is C++ LibTorch inference slower than PyTorch? (full code shown) #3518

@Sukidesyo

Description

Add Link

none ...

Describe the bug

I use the same .pt model and test on the same computer, but LibTorch is 30~40% slower than PyTorch: over 30 inference runs Python averages only 18 ms, while the C++ LibTorch version averages 24 ms.
I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.
My code is below.

```cpp
#include <iostream>
#include <torch/torch.h>
#include <torch/script.h>
#include <vector>
#include <chrono>

int main() {
    // 1. choose device
    torch::Device device = torch::kCPU;
    if (torch::cuda::is_available()) {
        device = torch::kCUDA;
        std::cout << "CUDA is available! Using GPU." << std::endl;

        if (torch::cuda::cudnn_is_available()) {
            std::cout << "✅ cuDNN is available and will be used." << std::endl;
        } else {
            std::cout << "❌ cuDNN is NOT available. Performance may be suboptimal." << std::endl;
        }
    }

    // 2. load model
    torch::jit::Module module;
    try {
        module = torch::jit::load("/home/bingyu/profile_model/rf202508011_74.pt", device);
        module.eval();
        std::cout << "Model loaded successfully." << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "Error loading model: " << e.what() << std::endl;
        return -1;
    }

    // 3. define shapes
    const int64_t BATCH_SIZE = 1;
    const int64_t JOINT_NUM = 14;
    const int64_t STATES_HORIZON = 12;
    const int64_t SEQ_LEN = 50;
    const int64_t NUM_CAMERAS = 4;
    const int64_t IMG_C = 3;
    const int64_t IMG_H = 480;
    const int64_t IMG_W = 640;

    // 4. create input tensors
    auto qpos = torch::randn({BATCH_SIZE, STATES_HORIZON, JOINT_NUM}, device);
    auto image = torch::randn({BATCH_SIZE, NUM_CAMERAS, IMG_C, IMG_H, IMG_W}, device);
    auto noise = torch::randn({BATCH_SIZE, SEQ_LEN, JOINT_NUM}, device);
    std::vector<torch::jit::IValue> inputs = {qpos, image, noise};

    // 5. warm up
    std::cout << "\nWarming up model..." << std::endl;
    for (int i = 0; i < 5; ++i) {
        torch::NoGradGuard no_grad;
        module.forward(inputs);
    }
    std::cout << "Warm-up completed." << std::endl;

    // 6. timed runs
    const int total_times = 10;
    double total_elapsed = 0.0;

    std::cout << "\nRunning inference..." << std::endl;
    for (int i = 0; i < total_times; ++i) {
        torch::NoGradGuard no_grad;  // no gradient computation within this scope

        auto start = std::chrono::high_resolution_clock::now();

        auto output = module.forward(inputs).toTensor();

        auto end = std::chrono::high_resolution_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        total_elapsed += duration.count();

        std::cout << "Inference " << i << " time: " << duration.count() << " μs" << std::endl;
    }

    double avg_time = total_elapsed / total_times;
    std::cout << "\nAverage inference time: " << avg_time << " μs ("
              << avg_time / 1000.0 << " ms)" << std::endl;

    return 0;
}
```
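
One methodological difference between the two benchmarks: the Python script times with CUDA events and calls `torch.cuda.synchronize()`, while the C++ loop reads `std::chrono` around a `forward()` call that launches CUDA work asynchronously. Below is a minimal sketch of a synchronized C++ timing helper, assuming the `module` and `inputs` defined above and that `torch::cuda::synchronize()` is acceptable for this measurement:

```cpp
// Sketch only: time the forward pass with explicit CUDA synchronization so the
// wall-clock interval covers the kernels themselves, not just the async launch.
#include <torch/script.h>
#include <torch/cuda.h>
#include <chrono>
#include <vector>

double timed_forward_ms(torch::jit::Module& module,
                        const std::vector<torch::jit::IValue>& inputs) {
    torch::NoGradGuard no_grad;

    torch::cuda::synchronize();  // drain any pending GPU work first
    auto start = std::chrono::high_resolution_clock::now();

    auto output = module.forward(inputs);

    torch::cuda::synchronize();  // wait for the launched kernels to finish
    auto end = std::chrono::high_resolution_clock::now();

    return std::chrono::duration<double, std::milli>(end - start).count();
}
```

Timing both versions the same way (CUDA events on both sides, or synchronized wall-clock on both sides) would make the two averages directly comparable.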

The Python code is below:
```python
import torch
import os
import time

MODEL_PATH = "/home/bingyu/profile_model/rf202508011_74.pt"

INPUT_SHAPES = [
    (1, 12, 14),          # qpos
    (1, 4, 3, 480, 640),  # image
    (1, 50, 14)           # noise
]

WARMUP_ITER = 10
INFERENCE_RUNS = 10

def main():

    if not os.path.exists(MODEL_PATH):
        return

    device_str = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_str)
    print(f"🚀 using device: {device_str.upper()}")
    print(f"📂 loading model: {MODEL_PATH}")

    try:
        model = torch.jit.load(MODEL_PATH)
        model.to(device)
        model.eval()
        print("✅ model loaded successfully!")
    except Exception as e:
        print(f"❌ model load failed.\n   {e}")
        return

    try:
        inputs = [torch.randn(shape, device=device) for shape in INPUT_SHAPES]
    except Exception as e:
        print(f"❌ invalid INPUT_SHAPES.\n   {e}")
        return

    with torch.no_grad():
        for _ in range(WARMUP_ITER):
            model(*inputs)
    # ==================================================================

    if device.type == 'cuda':
        torch.cuda.synchronize()

    timings_ms = []

    with torch.no_grad():
        for i in range(INFERENCE_RUNS):
            if device.type == 'cuda':
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)

                start_event.record()
                model(*inputs)
                end_event.record()

                torch.cuda.synchronize()

                elapsed_time = start_event.elapsed_time(end_event)
                timings_ms.append(elapsed_time)

            else:
                start_time = time.perf_counter()
                model(*inputs)
                end_time = time.perf_counter()

                elapsed_time = (end_time - start_time) * 1000
                timings_ms.append(elapsed_time)
    # ==================================================================

    # --- 4. results summary ---
    print("\n" + "=" * 80)
    print("=" * 80 + "\n")

    if timings_ms:
        timings_tensor = torch.tensor(timings_ms)
        print(f"total runs : {len(timings_tensor)}")
        print(f"(Mean):    {timings_tensor.mean():.3f} ms")
        print(f"(Std Dev): {timings_tensor.std():.3f} ms")
        print(f"(Min):     {timings_tensor.min():.3f} ms")
        print(f"(Max):     {timings_tensor.max():.3f} ms")
    else:
        print("No valid timing results.")

    print("\n" + "=" * 80)

if __name__ == '__main__':
    main()
```

Also, I exported the same model to ONNX, and there the performance is essentially the same in C++ and Python.
So I really want to know why C++ LibTorch is slower than PyTorch here.
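
For reference, a minimal sketch of how such a comparison might look on the C++ side with ONNX Runtime; the `.onnx` path, the input names `qpos`, `image`, `noise`, the output name `actions`, and the use of the CUDA execution provider are all assumptions here, not taken from the original report:

```cpp
// Sketch only: minimal ONNX Runtime (C++) timing loop for a hypothetical export
// of the same model. Input/output names and the .onnx path are placeholders.
#include <onnxruntime_cxx_api.h>
#include <chrono>
#include <iostream>
#include <vector>

int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "bench");
    Ort::SessionOptions opts;
    OrtCUDAProviderOptions cuda_opts{};
    opts.AppendExecutionProvider_CUDA(cuda_opts);  // run on GPU via the CUDA EP

    Ort::Session session(env, "/home/bingyu/profile_model/rf202508011_74.onnx", opts);

    // Host-side input buffers matching the shapes used in the LibTorch test.
    std::vector<int64_t> qpos_shape{1, 12, 14};
    std::vector<int64_t> image_shape{1, 4, 3, 480, 640};
    std::vector<int64_t> noise_shape{1, 50, 14};
    std::vector<float> qpos(1 * 12 * 14, 0.5f);
    std::vector<float> image(1 * 4 * 3 * 480 * 640, 0.5f);
    std::vector<float> noise(1 * 50 * 14, 0.5f);

    auto mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    std::vector<Ort::Value> inputs;
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, qpos.data(), qpos.size(), qpos_shape.data(), qpos_shape.size()));
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, image.data(), image.size(), image_shape.data(), image_shape.size()));
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, noise.data(), noise.size(), noise_shape.data(), noise_shape.size()));

    const char* input_names[] = {"qpos", "image", "noise"};  // hypothetical names
    const char* output_names[] = {"actions"};                // hypothetical name

    // Warm-up, then timed runs; Run() blocks until the outputs are ready.
    for (int i = 0; i < 5; ++i)
        session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(),
                    inputs.size(), output_names, 1);

    double total_ms = 0.0;
    for (int i = 0; i < 10; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
        auto out = session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(),
                               inputs.size(), output_names, 1);
        auto end = std::chrono::high_resolution_clock::now();
        total_ms += std::chrono::duration<double, std::milli>(end - start).count();
    }
    std::cout << "Average: " << total_ms / 10 << " ms" << std::endl;
    return 0;
}
```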

Describe your environment

I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.
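
If it helps, a small check like the one below can confirm which libtorch build the C++ binary is actually linked against (a sketch, assuming the `TORCH_VERSION` macro from `torch/version.h` is available in this libtorch distribution):

```cpp
#include <iostream>
#include <torch/torch.h>
#include <torch/version.h>

int main() {
    // Print the libtorch version the binary was built against,
    // plus the CUDA / cuDNN availability seen at runtime.
    std::cout << "libtorch " << TORCH_VERSION << std::endl;
    std::cout << "CUDA available:  " << torch::cuda::is_available() << std::endl;
    std::cout << "cuDNN available: " << torch::cuda::cudnn_is_available() << std::endl;
    std::cout << "CUDA devices:    " << torch::cuda::device_count() << std::endl;
    return 0;
}
```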
