@@ -49,7 +49,13 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
 # NOTE: Increasing this in this suite will fail CI because we currently cannot
 # reset distributed env properly. Use a value > 1 just when you test.
 @pytest.mark.parametrize("tensor_parallel_size", [1])
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models(
     hf_runner: HfRunner,
     vllm_runner: VllmRunner,
@@ -99,7 +105,13 @@ def test_models(
 @multi_gpu_test(num_gpus=2)
 @pytest.mark.parametrize("distributed_executor_backend", ["ray", "mp"])
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("attention_backend", ["FLASHINFER", "FLASH_ATTN"])
+@pytest.mark.parametrize("attention_backend", [
+    pytest.param("FLASHINFER",
+                 marks=pytest.mark.skipif(
+                     current_platform.is_rocm(),
+                     reason="FLASHINFER isn't supported on ROCm")),
+    "FLASH_ATTN"
+])
 def test_models_distributed(
     hf_runner: HfRunner,
     vllm_runner: VllmRunner,
@@ -172,6 +184,8 @@ def test_models_distributed(
 # Due to low-precision numerical divergence, this test is too sensitive to
 # the async postprocessor
 @pytest.mark.parametrize("disable_async_output_proc", [True])
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="machete_prepack_B isn't supported on ROCm")
 def test_models_with_fp8_kv_cache(
     vllm_runner: VllmRunner,
     example_prompts,
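For context, the diff relies on pytest's ability to attach marks to individual parametrize values via `pytest.param`, so only the FLASHINFER case is skipped on ROCm while FLASH_ATTN still runs. Below is a minimal standalone sketch of that pattern; the `gpu_is_rocm` flag is a stand-in for `current_platform.is_rocm()` and is not part of the diff.

```python
import pytest

# Stand-in for vllm.platforms.current_platform.is_rocm(); assumed here so
# the sketch runs without vLLM installed.
gpu_is_rocm = False


@pytest.mark.parametrize("attention_backend", [
    # Only this value is skipped when gpu_is_rocm is True; "FLASH_ATTN"
    # still runs as a separate parametrized case.
    pytest.param("FLASHINFER",
                 marks=pytest.mark.skipif(
                     gpu_is_rocm,
                     reason="FLASHINFER isn't supported on ROCm")),
    "FLASH_ATTN",
])
def test_backend_selected(attention_backend):
    assert attention_backend in ("FLASHINFER", "FLASH_ATTN")
```

Compared with a function-level `@pytest.mark.skipif`, per-parameter marks skip only the unsupported backend combination instead of the whole test.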