From 8101bc319d552e71769932363c9f9474291a1af2 Mon Sep 17 00:00:00 2001
From: billishyahao
Date: Wed, 23 Jul 2025 11:08:54 +0000
Subject: [PATCH 1/2] add rocm perf

Co-authored-by: AlexHe99
Co-authored-by: haichen07
---
 README.md     | 19 +++++++++++--
 README_amd.md | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 94 insertions(+), 2 deletions(-)
 create mode 100644 README_amd.md

diff --git a/README.md b/README.md
index f56a7339..8b1fdc80 100644
--- a/README.md
+++ b/README.md
@@ -14,6 +14,12 @@ A lightweight vLLM implementation built from scratch.
 pip install git+https://github.com/GeeeekExplorer/nano-vllm.git
 ```
 
+### Installation on AMD GPUs
+```bash
+pip install --no-build-isolation git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+For more information, see [AMD's README](./README_amd.md).
+
 ## Manual Download
 
 If you prefer to download the model weights manually, use the following command:
@@ -40,18 +46,27 @@ outputs[0]["text"]
 
 See `bench.py` for benchmark.
 
 **Test Configuration:**
-- Hardware: RTX 4070 Laptop (8GB)
+- Hardware:
+  - Setup 1: Nvidia RTX 4070 Laptop (8GB)
+  - Setup 2: AMD Radeon RX7900XT (20GB)
 - Model: Qwen3-0.6B
 - Total Requests: 256 sequences
 - Input Length: Randomly sampled between 100–1024 tokens
 - Output Length: Randomly sampled between 100–1024 tokens
 
-**Performance Results:**
+**Setup 1 Performance Results:**
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
 |----------------|-------------|----------|-----------------------|
 | vLLM | 133,966 | 98.37 | 1361.84 |
 | Nano-vLLM | 133,966 | 93.41 | 1434.13 |
 
+**Setup 2 Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM | 133,966 | 61.80 | 2167.84 |
+| Nano-vLLM | 133,966 | 65.91 | 2032.36 |
+
+
 ## Star History
 
diff --git a/README_amd.md b/README_amd.md
new file mode 100644
index 00000000..e3d11819
--- /dev/null
+++ b/README_amd.md
@@ -0,0 +1,77 @@
+# Installation Guide on AMD GPUs
+
+This guide shows how to install nano-vllm and how it performs on AMD platforms.
+
+
+## Installation on AMD GPUs
+
+### Launch container environment
+
+```bash
+CONTAINER_NAME=
+IMAGE_NAME=rocm/vllm-dev:rocm6.4.1_navi_ubuntu24.04_py3.12_pytorch_2.7_vllm_0.8.5
+# For AMD Instinct GPUs, users can select the latest pre-built docker image:
+# rocm/vllm:rocm6.4.1_vllm_0.9.1_20250715. See https://hub.docker.com/r/rocm/vllm/tags
+
+docker run -it \
+  --rm \
+  --device /dev/dri \
+  --device /dev/kfd \
+  --network host \
+  --ipc host \
+  --group-add video \
+  --cap-add SYS_PTRACE \
+  --security-opt seccomp=unconfined \
+  --privileged \
+  --shm-size 8G \
+  --name ${CONTAINER_NAME} \
+  ${IMAGE_NAME} /bin/bash
+```
+
+### Install through pip
+
+```bash
+pip install --no-build-isolation git+https://github.com/GeeeekExplorer/nano-vllm.git
+```
+
+
+## Benchmark on AMD GPUs
+
+See `bench.py` for benchmark.
+
+**Test Configuration:**
+- Hardware:
+  - Setup 1: Radeon RX7900XT (20GB)
+  - Setup 2: Instinct MI300X (192GB)
+  - Setup 3: Ryzen AI 395(128GB unified memory)
+  - Setup 4: Radeon RX9070XT (16GB)
+- Model: Qwen3-0.6B
+- Total Requests: 256 sequences
+- Input Length: Randomly sampled between 100–1024 tokens
+- Output Length: Randomly sampled between 100–1024 tokens
+
+**Setup 1 Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM | 133,966 | 61.80 | 2167.84 |
+| Nano-vLLM | 133,966 | 65.91 | 2032.36 |
+
+**Setup 2 Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM | 133,966 | 8.93 | 14994.98 |
+| Nano-vLLM | 133,966 | 20.17 | 6640.22 |
+
+**Setup 3 Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM | 133,966 | 114.72 | 1167.76 |
+| Nano-vLLM | 133,966 | 123.81 | 1082.05 |
+
+**Setup 4 Performance Results:**
+| Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
+|----------------|-------------|----------|-----------------------|
+| vLLM | 133,966 | 47.12 | 2842.8 |
+| Nano-vLLM | 133,966 | * | * |
+
+*Known issue: nano-vllm has memory access fault issue on RX9070XT to be fixed.
\ No newline at end of file

From 20f11fbde91480e8efa6882a021fe3291365f6db Mon Sep 17 00:00:00 2001
From: AlexHe99
Date: Wed, 30 Jul 2025 09:42:37 +0800
Subject: [PATCH 2/2] Update README_amd.md

---
 README_amd.md | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/README_amd.md b/README_amd.md
index e3d11819..e5c18c17 100644
--- a/README_amd.md
+++ b/README_amd.md
@@ -41,9 +41,9 @@ See `bench.py` for benchmark.
 
 **Test Configuration:**
 - Hardware:
-  - Setup 1: Radeon RX7900XT (20GB)
+  - Setup 1: Radeon RX7900XTX (24GB)
   - Setup 2: Instinct MI300X (192GB)
-  - Setup 3: Ryzen AI 395(128GB unified memory)
+  - Setup 3: Ryzen AI 395 (64GB VRAM within 128GB unified memory)
   - Setup 4: Radeon RX9070XT (16GB)
 - Model: Qwen3-0.6B
 - Total Requests: 256 sequences
 - Input Length: Randomly sampled between 100–1024 tokens
 - Output Length: Randomly sampled between 100–1024 tokens
@@ -53,8 +53,10 @@ See `bench.py` for benchmark.
 **Setup 1 Performance Results:**
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
 |----------------|-------------|----------|-----------------------|
-| vLLM | 133,966 | 61.80 | 2167.84 |
-| Nano-vLLM | 133,966 | 65.91 | 2032.36 |
+| vLLM | 133,966 | 41.00 | 3295.00 |
+| Nano-vLLM | 124,754 | 35.74 | 3287.95 |
+
+* Nano-vLLM was run with `num_seqs=206` to fit into the 24GB VRAM of the RX7900XTX
 
 **Setup 2 Performance Results:**
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
@@ -72,6 +74,6 @@ See `bench.py` for benchmark.
 | Inference Engine | Output Tokens | Time (s) | Throughput (tokens/s) |
 |----------------|-------------|----------|-----------------------|
 | vLLM | 133,966 | 47.12 | 2842.8 |
-| Nano-vLLM | 133,966 | * | * |
+| Nano-vLLM | 133,966 | 23.97 | 2626.39 |
 
-*Known issue: nano-vllm has memory access fault issue on RX9070XT to be fixed.
\ No newline at end of file
+* Nano-vLLM was run with `num_seqs=110` to fit into the 16GB VRAM of the RX9070XT
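
As a quick sanity check on the benchmark tables added in these patches, throughput should roughly equal output tokens divided by wall-clock time. The sketch below (plain Python, no GPU or nano-vllm install needed) recomputes a few rows from the first patch; the row values are copied from the tables above, and small deviations are expected because the reported times are rounded to 0.01 s:

```python
# Sanity check: throughput (tokens/s) ~= output_tokens / time_s.
# Rows copied from the benchmark tables in PATCH 1/2.
rows = [
    # (engine, hardware, output_tokens, time_s, reported_tok_per_s)
    ("vLLM",      "RTX 4070 Laptop", 133_966, 98.37,  1361.84),
    ("Nano-vLLM", "RTX 4070 Laptop", 133_966, 93.41,  1434.13),
    ("vLLM",      "Radeon RX7900XT", 133_966, 61.80,  2167.84),
    ("Nano-vLLM", "Radeon RX7900XT", 133_966, 65.91,  2032.36),
    ("vLLM",      "Instinct MI300X", 133_966, 8.93,  14994.98),
    ("Nano-vLLM", "Instinct MI300X", 133_966, 20.17,  6640.22),
]

for engine, hw, tokens, seconds, reported in rows:
    computed = tokens / seconds
    # Times are rounded to 0.01 s in the tables, so allow ~0.5% slack.
    rel_err = abs(computed - reported) / reported
    assert rel_err < 0.005, (engine, hw, computed, reported)
    print(f"{engine:>9} on {hw}: {computed:,.2f} tok/s (reported {reported:,.2f})")
```

Note that the Setup 1 rows updated in PATCH 2/2 are excluded: with `num_seqs=206`, Nano-vLLM generates fewer output tokens (124,754), and the reported throughput there does not reduce to this simple ratio.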