-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Description
Add Link
none ...
Describe the bug
I use the same .pt model and run the test on the same computer, but libtorch is 30–40% slower than PyTorch.
In Python, the average over 30 inference runs is only 18 ms, but C++ libtorch needs 24 ms on average.
I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.
My C++ code is below.
`
#include <chrono>
#include <iostream>
#include <vector>

#include <torch/script.h>
#include <torch/torch.h>
/// Benchmarks the forward-pass latency of a TorchScript model with libtorch.
///
/// Loads the model onto the GPU when CUDA is available (CPU otherwise), runs
/// warm-up iterations, then times `total_times` forward passes and prints the
/// per-run and average latency.
///
/// CUDA kernels are launched asynchronously with respect to the host, so a
/// host-side clock is only meaningful if the device is synchronized before the
/// clock is started and again before it is stopped. The Python counterpart of
/// this benchmark brackets each run with CUDA events and a synchronize; this
/// function synchronizes explicitly so both measure a completed forward pass.
///
/// @return 0 on success, -1 if the model cannot be loaded.
int main() {
    // 1. Choose the device: GPU when available, CPU otherwise.
    torch::Device device = torch::kCPU;
    if (torch::cuda::is_available()) {
        device = torch::kCUDA;
        std::cout << "CUDA is available! Using GPU." << std::endl;
        if (torch::cuda::cudnn_is_available()) {
            std::cout << "✅ cuDNN is available and will be used." << std::endl;
        } else {
            std::cout << "❌ cuDNN is NOT available. Performance may be suboptimal." << std::endl;
        }
    }
    const bool use_cuda = device.is_cuda();

    // Blocks the host until all queued GPU work has finished; no-op on CPU.
    const auto sync_device = [use_cuda] {
        if (use_cuda) {
            torch::cuda::synchronize();
        }
    };

    // 2. Load the model directly onto the selected device.
    torch::jit::Module module;
    try {
        module = torch::jit::load("/home/bingyu/profile_model/rf202508011_74.pt", device);
        module.eval();
        std::cout << "Model loaded successfully." << std::endl;
    } catch (const c10::Error& e) {
        std::cerr << "Error loading model: " << e.what() << std::endl;
        return -1;
    }

    // 3. Input shape definitions.
    const int64_t BATCH_SIZE = 1;
    const int64_t JOINT_NUM = 14;
    const int64_t STATES_HORIZON = 12;
    const int64_t SEQ_LEN = 50;
    const int64_t NUM_CAMERAS = 4;
    const int64_t IMG_C = 3;
    const int64_t IMG_H = 480;
    const int64_t IMG_W = 640;

    // 4. Create random input tensors on the target device.
    auto qpos = torch::randn({BATCH_SIZE, STATES_HORIZON, JOINT_NUM}, device);
    auto image = torch::randn({BATCH_SIZE, NUM_CAMERAS, IMG_C, IMG_H, IMG_W}, device);
    auto noise = torch::randn({BATCH_SIZE, SEQ_LEN, JOINT_NUM}, device);
    std::vector<torch::jit::IValue> inputs = {qpos, image, noise};

    // Gradients are never needed in this benchmark; one guard covers both the
    // warm-up and the timed loop.
    torch::NoGradGuard no_grad;

    // 5. Warm up. The iteration count matches WARMUP_ITER in the Python
    // benchmark so both programs reach the timed loop in a comparable state.
    const int warmup_iters = 10;
    std::cout << "\nWarming up model..." << std::endl;
    for (int i = 0; i < warmup_iters; ++i) {
        module.forward(inputs);
    }
    // Make sure no warm-up work is still queued when timing starts.
    sync_device();
    std::cout << "Warm-up completed." << std::endl;

    // 6. Timed runs.
    const int total_times = 10;
    double total_elapsed = 0.0;
    std::cout << "\nRunning inference..." << std::endl;
    for (int i = 0; i < total_times; ++i) {
        // Drain the device first so the interval starts from an idle GPU.
        sync_device();
        const auto start = std::chrono::steady_clock::now();
        // toTensor() is kept so a non-tensor model output still raises here.
        const auto output = module.forward(inputs).toTensor();
        // Wait for the kernels to finish; without this only the host-side
        // launch/dispatch time would be measured.
        sync_device();
        const auto end = std::chrono::steady_clock::now();

        const auto duration = std::chrono::duration_cast<std::chrono::microseconds>(end - start);
        total_elapsed += duration.count();
        std::cout << "Inference " << i << " time: " << duration.count() << " μs" << std::endl;
    }

    const double avg_time = total_elapsed / total_times;
    std::cout << "\nAverage inference time: " << avg_time << " μs ("
              << avg_time / 1000.0 << " ms)" << std::endl;
    return 0;
}
`
The Python code is below.
`
import torch
import os
import time
# Path of the TorchScript model to benchmark.
MODEL_PATH = "/home/bingyu/profile_model/rf202508011_74.pt"
# Shapes of the random inputs, in the positional order passed to the model.
INPUT_SHAPES = [
    (1, 12, 14),          # qpos
    (1, 4, 3, 480, 640),  # image
    (1, 50, 14),          # noise
]
# Number of untimed forward passes run before measuring.
WARMUP_ITER = 10
# Number of timed forward passes.
INFERENCE_RUNS = 10


def main():
    """Benchmark the forward-pass latency of the TorchScript model at MODEL_PATH.

    Loads the model, moves it to the GPU when CUDA is available (CPU
    otherwise), runs WARMUP_ITER untimed passes, then times INFERENCE_RUNS
    passes and prints mean / std-dev / min / max in milliseconds.

    On CUDA each run is bracketed by CUDA events and followed by a
    synchronize, so the reported time is measured between the two events on
    the device; on CPU ``time.perf_counter`` is used.

    Returns:
        None. Returns early (after printing a message) if the model file is
        missing, the model fails to load, or the inputs cannot be created.
    """
    if not os.path.exists(MODEL_PATH):
        # Report the problem instead of exiting silently with no output.
        print(f"❌ model file not found: {MODEL_PATH}")
        return

    device_str = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device_str)
    on_cuda = device.type == 'cuda'
    print(f"🚀 usisng device: {device_str.upper()}")
    print(f"📂 loading model : {MODEL_PATH}")

    try:
        model = torch.jit.load(MODEL_PATH)
        model.to(device)
        model.eval()
        print("✅ model success!")
    except Exception as e:
        print(f"❌ model load failed。\n {e}")
        return

    try:
        inputs = [torch.randn(shape, device=device) for shape in INPUT_SHAPES]
    except Exception as e:
        print(f"❌ error INPUT_SHAPES。\n {e}")
        return

    # Warm-up passes are excluded from the measurement.
    with torch.no_grad():
        for _ in range(WARMUP_ITER):
            model(*inputs)

    # ==================================================================
    # Make sure no warm-up work is still queued when timing starts.
    if on_cuda:
        torch.cuda.synchronize()

    timings_ms = []
    with torch.no_grad():
        for i in range(INFERENCE_RUNS):
            if on_cuda:
                start_event = torch.cuda.Event(enable_timing=True)
                end_event = torch.cuda.Event(enable_timing=True)
                start_event.record()
                model(*inputs)
                end_event.record()
                # elapsed_time() is only valid once both events have completed.
                torch.cuda.synchronize()
                elapsed_time = start_event.elapsed_time(end_event)
                timings_ms.append(elapsed_time)
            else:
                start_time = time.perf_counter()
                model(*inputs)
                end_time = time.perf_counter()
                # perf_counter returns seconds; convert to milliseconds.
                elapsed_time = (end_time - start_time) * 1000
                timings_ms.append(elapsed_time)
    # ==================================================================

    # --- 4. Result statistics ---
    print("\n" + "=" * 80)
    print("=" * 80 + "\n")
    if timings_ms:
        timings_tensor = torch.tensor(timings_ms)
        print(f"total times : {len(timings_tensor)}")
        print(f"(Mean): {timings_tensor.mean():.3f} ms")
        print(f"(Std Dev): {timings_tensor.std():.3f} ms")
        print(f"(Min): {timings_tensor.min():.3f} ms")
        print(f"(Max): {timings_tensor.max():.3f} ms")
    else:
        print("没有有效的计时结果。")
    print("\n" + "=" * 80)
# Run the benchmark only when this file is executed as a script.
if __name__ == '__main__':
    main()
`
I also exported the same model code to an .onnx model, and with ONNX the performance is the same in C++ and Python.
So I would really like to know why C++ libtorch is slower than PyTorch.
Describe your environment
I am using CUDA 12.8, cuDNN 9.5.1, and libtorch 2.8.