
[BUG] Why is C++ LibTorch inference slower than PyTorch? (full code shown) #3518

@Sukidesyo

Description

Add Link

none ...

Describe the bug

I use the same .pt model and test on the same computer, but LibTorch is 30~40% slower than PyTorch: over 30 inference runs Python averages only 18 ms, while the C++ LibTorch version averages 24 ms.
I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.
My code is below.

```cpp
#include <iostream>
#include <torch/torch.h>
#include <torch/script.h>
#include <vector>
#include <chrono>

int main() {
    // 1. choose device
    torch::Device device = torch::kCPU;
    if (torch::cuda::is_available()) {
        device = torch::kCUDA;
        std::cout << "CUDA is available! Using GPU." << std::endl;

        if (torch::cuda::cudnn_is_available()) {
            std::cout << "✅ cuDNN is available and will be used." << std::endl;
        } else {
            std::cout << "❌ cuDNN is NOT available. Performance may be suboptimal." << std::endl;
        }
    }

    // 2. load model
    torch::jit::Module module;
    try {
        module = torch::jit::load("/home/bingyu/profile_model/rf202508011_74.pt", device);
        module.eval();
        std::cout << "Model loaded successfully." << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "Error loading model: " << e.what() << std::endl;
        return -1;
    }

    // 3. define shapes
    const int64_t BATCH_SIZE = 1;
    const int64_t JOINT_NUM = 14;
    const int64_t STATES_HORIZON = 12;
    const int64_t SEQ_LEN = 50;
    const int64_t NUM_CAMERAS = 4;
    const int64_t IMG_C = 3;
    const int64_t IMG_H = 480;
    const int64_t IMG_W = 640;

    // 4. create input tensors
    auto qpos = torch::randn({BATCH_SIZE, STATES_HORIZON, JOINT_NUM}, device);
    auto image = torch::randn({BATCH_SIZE, NUM_CAMERAS, IMG_C, IMG_H, IMG_W}, device);
    auto noise = torch::randn({BATCH_SIZE, SEQ_LEN, JOINT_NUM}, device);
    std::vector<torch::jit::IValue> inputs = {qpos, image, noise};

    // 5. warm up
    std::cout << "\nWarming up model..." << std::endl;
    for (int i = 0; i < 5; ++i) {
        torch::NoGradGuard no_grad;
        module.forward(inputs);
    }
    std::cout << "Warm-up completed." << std::endl;

    // 6. timed runs
    const int total_times = 10;
    double total_elapsed = 0.0;

    std::cout << "\nRunning inference..." << std::endl;
    for (int i = 0; i < total_times; ++i) {
        torch::NoGradGuard no_grad;  // no gradient computation within this scope

        auto start = std::chrono::high_resolution_clock::now();

        auto output = module.forward(inputs).toTensor();

        auto end = std::chrono::high_resolution_clock::now();

        auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        total_elapsed += duration.count();

        std::cout << "Inference " << i << " time: " << duration.count() << " μs" << std::endl;
    }

    double avg_time = total_elapsed / total_times;
    std::cout << "\nAverage inference time: " << avg_time << " μs ("
              << avg_time / 1000.0 << " ms)" << std::endl;

    return 0;
}
```
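
One methodological difference between the two benchmarks: the Python script times with CUDA events and calls `torch.cuda.synchronize()`, while the C++ loop reads `std::chrono` around a `forward()` call that launches CUDA work asynchronously. Below is a minimal sketch of a synchronized C++ timing helper, assuming the `module` and `inputs` defined above and that `torch::cuda::synchronize()` is acceptable for this measurement:

```cpp
// Sketch only: time the forward pass with explicit CUDA synchronization so the
// wall-clock interval covers the kernels themselves, not just the async launch.
#include <torch/script.h>
#include <torch/cuda.h>
#include <chrono>
#include <vector>

double timed_forward_ms(torch::jit::Module& module,
                        const std::vector<torch::jit::IValue>& inputs) {
    torch::NoGradGuard no_grad;

    torch::cuda::synchronize();  // drain any pending GPU work first
    auto start = std::chrono::high_resolution_clock::now();

    auto output = module.forward(inputs);

    torch::cuda::synchronize();  // wait for the launched kernels to finish
    auto end = std::chrono::high_resolution_clock::now();

    return std::chrono::duration<double, std::milli>(end - start).count();
}
```

Timing both versions the same way (CUDA events on both sides, or synchronized wall-clock on both sides) would make the two averages directly comparable.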

The Python code is below:
```python
import torch
import os
import time

MODEL_PATH = "/home/bingyu/profile_model/rf202508011_74.pt"

INPUT_SHAPES = [
    (1, 12, 14),          # qpos
    (1, 4, 3, 480, 640),  # image
    (1, 50, 14)           # noise
]

WARMUP_ITER = 10
INFERENCE_RUNS = 10

def main():

    if not os.path.exists(MODEL_PATH):
        return

    device_str = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_str)
    print(f"🚀 using device: {device_str.upper()}")
    print(f"📂 loading model: {MODEL_PATH}")

    try:
        model = torch.jit.load(MODEL_PATH)
        model.to(device)
        model.eval()
        print("✅ model loaded successfully!")
    except Exception as e:
        print(f"❌ model load failed.\n   {e}")
        return

    try:
        inputs = [torch.randn(shape, device=device) for shape in INPUT_SHAPES]
    except Exception as e:
        print(f"❌ invalid INPUT_SHAPES.\n   {e}")
        return

    with torch.no_grad():
        for _ in range(WARMUP_ITER):
            model(*inputs)
    # ==================================================================

    if device.type == 'cuda':
        torch.cuda.synchronize()

    timings_ms = []

    with torch.no_grad():
        for i in range(INFERENCE_RUNS):
            if device.type == 'cuda':
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)

                start_event.record()
                model(*inputs)
                end_event.record()

                torch.cuda.synchronize()

                elapsed_time = start_event.elapsed_time(end_event)
                timings_ms.append(elapsed_time)

            else:
                start_time = time.perf_counter()
                model(*inputs)
                end_time = time.perf_counter()

                elapsed_time = (end_time - start_time) * 1000
                timings_ms.append(elapsed_time)
    # ==================================================================

    # --- 4. results summary ---
    print("\n" + "=" * 80)
    print("=" * 80 + "\n")

    if timings_ms:
        timings_tensor = torch.tensor(timings_ms)
        print(f"total runs : {len(timings_tensor)}")
        print(f"(Mean):    {timings_tensor.mean():.3f} ms")
        print(f"(Std Dev): {timings_tensor.std():.3f} ms")
        print(f"(Min):     {timings_tensor.min():.3f} ms")
        print(f"(Max):     {timings_tensor.max():.3f} ms")
    else:
        print("No valid timing results.")

    print("\n" + "=" * 80)

if __name__ == '__main__':
    main()
```

Also, I exported the same model to ONNX, and there the performance is essentially the same in C++ and Python.
So I really want to know why C++ LibTorch is slower than PyTorch here.
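
For reference, a minimal sketch of how such a comparison might look on the C++ side with ONNX Runtime; the `.onnx` path, the input names `qpos`, `image`, `noise`, the output name `actions`, and the use of the CUDA execution provider are all assumptions here, not taken from the original report:

```cpp
// Sketch only: minimal ONNX Runtime (C++) timing loop for a hypothetical export
// of the same model. Input/output names and the .onnx path are placeholders.
#include <onnxruntime_cxx_api.h>
#include <chrono>
#include <iostream>
#include <vector>

int main() {
    Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "bench");
    Ort::SessionOptions opts;
    OrtCUDAProviderOptions cuda_opts{};
    opts.AppendExecutionProvider_CUDA(cuda_opts);  // run on GPU via the CUDA EP

    Ort::Session session(env, "/home/bingyu/profile_model/rf202508011_74.onnx", opts);

    // Host-side input buffers matching the shapes used in the LibTorch test.
    std::vector<int64_t> qpos_shape{1, 12, 14};
    std::vector<int64_t> image_shape{1, 4, 3, 480, 640};
    std::vector<int64_t> noise_shape{1, 50, 14};
    std::vector<float> qpos(1 * 12 * 14, 0.5f);
    std::vector<float> image(1 * 4 * 3 * 480 * 640, 0.5f);
    std::vector<float> noise(1 * 50 * 14, 0.5f);

    auto mem = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
    std::vector<Ort::Value> inputs;
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, qpos.data(), qpos.size(), qpos_shape.data(), qpos_shape.size()));
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, image.data(), image.size(), image_shape.data(), image_shape.size()));
    inputs.push_back(Ort::Value::CreateTensor<float>(
        mem, noise.data(), noise.size(), noise_shape.data(), noise_shape.size()));

    const char* input_names[] = {"qpos", "image", "noise"};  // hypothetical names
    const char* output_names[] = {"actions"};                // hypothetical name

    // Warm-up, then timed runs; Run() blocks until the outputs are ready.
    for (int i = 0; i < 5; ++i)
        session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(),
                    inputs.size(), output_names, 1);

    double total_ms = 0.0;
    for (int i = 0; i < 10; ++i) {
        auto start = std::chrono::high_resolution_clock::now();
        auto out = session.Run(Ort::RunOptions{nullptr}, input_names, inputs.data(),
                               inputs.size(), output_names, 1);
        auto end = std::chrono::high_resolution_clock::now();
        total_ms += std::chrono::duration<double, std::milli>(end - start).count();
    }
    std::cout << "Average: " << total_ms / 10 << " ms" << std::endl;
    return 0;
}
```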

Describe your environment

I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.
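
If it helps, a small check like the one below can confirm which libtorch build the C++ binary is actually linked against (a sketch, assuming the `TORCH_VERSION` macro from `torch/version.h` is available in this libtorch distribution):

```cpp
#include <iostream>
#include <torch/torch.h>
#include <torch/version.h>

int main() {
    // Print the libtorch version the binary was built against,
    // plus the CUDA / cuDNN availability seen at runtime.
    std::cout << "libtorch " << TORCH_VERSION << std::endl;
    std::cout << "CUDA available:  " << torch::cuda::is_available() << std::endl;
    std::cout << "cuDNN available: " << torch::cuda::cudnn_is_available() << std::endl;
    std::cout << "CUDA devices:    " << torch::cuda::device_count() << std::endl;
    return 0;
}
```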
