
Commit ef0d06d

[None][chore] Fix kernel launch param and add TRTLLM MoE backend test (#7524)
Signed-off-by: Pengbo Wang <[email protected]>
1 parent ac0df0a commit ef0d06d

5 files changed (+25, -15 lines)


cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu

Lines changed: 4 additions & 4 deletions
@@ -186,14 +186,14 @@ void run(Data const& data, void* stream)
     if (data.mUseDeepSeekFp8)
     {
         int const numThreads = 128;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationDeepSeekKernel, grid, numThreads, 0, stream);
     }
     else
     {
         int const numThreads = 256;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationKernel, grid, numThreads, 0, stream);
     }
@@ -371,7 +371,7 @@ void run(Data const& data, void* stream)
     constexpr int VecSize = 4;
     int const numThreads = 128;
     int const numBlocksX = (data.hiddenDimSf / VecSize - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);
 #define CONVERT_FP4_SF_LAUNCH(LayoutSrc, LayoutDst) \
     if (data.sfLayoutSrc == tg::SfLayout::LayoutSrc && data.sfLayoutDst == tg::SfLayout::LayoutDst) \
@@ -457,7 +457,7 @@ void run(Data const& data, void* stream)
     {
         int const numThreads = 256;
         int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
-        int const numBlocksY = data.numTokens;
+        int const numBlocksY = std::min(8192, data.numTokens);
         dim3 numBlocks(numBlocksX, numBlocksY);

         LAUNCH(data, permuteKernel, numBlocks, numThreads, 0, stream);
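Why the clamp matters: CUDA caps gridDim.y and gridDim.z at 65535, so launching one block per token stops being legal for very large token counts; clamping the token-indexed dimension to 8192 keeps the launch valid and bounds the block count. That is only correct if the kernel body strides over the remaining tokens. The sketch below is a hypothetical illustration of that clamped-grid, grid-stride pattern; activationGridStrideExample and its parameters are illustrative stand-ins, not the actual kernels in DevKernel.cu.

// Hypothetical sketch of the pattern the clamp relies on; not the real
// activationDeepSeekKernel/permuteKernel from DevKernel.cu.
__global__ void activationGridStrideExample(
    float const* in, float* out, int innerDim, int topK, int numTokens)
{
    int const chunk = blockIdx.x; // one 128-wide slice of innerDim
    int const slot = blockIdx.y;  // top-k expert slot
    // gridDim.z is min(8192, numTokens), so each block walks tokens with a stride.
    for (int token = blockIdx.z; token < numTokens; token += gridDim.z)
    {
        int const col = chunk * 128 + threadIdx.x;
        if (col < innerDim)
        {
            size_t const idx = (static_cast<size_t>(token) * topK + slot) * innerDim + col;
            out[idx] = fmaxf(in[idx], 0.0f); // placeholder activation
        }
    }
}

// Assumed host-side launch mirroring the fixed parameters above:
//   dim3 const grid(innerDim / 128, topK, std::min(8192, numTokens));
//   activationGridStrideExample<<<grid, 128, 0, stream>>>(in, out, innerDim, topK, numTokens);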

jenkins/L0_Test.groovy

Lines changed: 5 additions & 4 deletions
@@ -2093,10 +2093,11 @@ def launchTestJobs(pipeline, testFilter)

     multiNodesSBSAConfigs = [
         // Each stage test 1 testcase with 8 GPUs and 2 nodes.
-        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
-        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 5, 8, 2],
+        "GB200-8_GPUs-2_Nodes-PyTorch-5": ["gb200-multi-node", "l0_gb200_multi_nodes", 5, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
         "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 7 deletions
@@ -1918,18 +1918,25 @@ def test_nvfp4_multi_gpus_corner_case(self):
     @pytest.mark.skip_less_mpi_world_size(8)
     @skip_pre_hopper
     @pytest.mark.parametrize(
-        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size",
-        [(8, 1, 4, 3, False, False, True, True, 1),
-         (8, 1, 8, 0, True, True, True, True, 24),
-         (8, 1, 8, 1, True, True, True, True, 24)],
-        ids=["latency", "throughput", "throughput_mtp"])
+        "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend",
+        [(8, 1, 4, 3, False, False, True, True, 1, "_DEFAULT"),
+         (8, 1, 8, 0, True, True, True, True, 24, "_DEFAULT"),
+         (8, 1, 8, 1, True, True, True, True, 24, "_DEFAULT"),
+         (8, 1, 8, 1, True, True, True, True, 24, "TRTLLM")],
+        ids=[
+            "latency", "throughput", "throughput_mtp", "throughput_mtp_trtllm"
+        ])
     def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
                             attention_dp, cuda_graph, overlap_scheduler,
-                            max_batch_size):
+                            max_batch_size, moe_backend):
+
         if get_sm_version() == 100:
-            moe_config = MoeConfig(backend="DEEPGEMM", max_num_tokens=16384)
+            moe_backend = "DEEPGEMM" if moe_backend == "_DEFAULT" else moe_backend
+            moe_config = MoeConfig(backend=moe_backend, max_num_tokens=16384)
             kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         else:
+            if moe_backend != "_DEFAULT":
+                pytest.skip("Not supported MoE backend!")
             moe_config = MoeConfig()
             kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
       system_gpu_count:

tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml

Lines changed: 1 addition & 0 deletions
@@ -17,6 +17,7 @@ l0_gb200_multi_nodes:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
 - condition:
     ranges:
       # 2 nodes with each node has 4 GPUs
