diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 6e9af1e721bb..133893ce9555 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -10,7 +10,7 @@ export PYTHONPATH=".." echo "--- Confirming Clean Initial State" while true; do sleep 3 - if grep -q clean /opt/amdgpu/etc/gpu_state; then + if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then echo "GPUs state is \"clean\"" break fi @@ -49,18 +49,18 @@ cleanup_docker echo "--- Resetting GPUs" -echo "reset" > /opt/amdgpu/etc/gpu_state +echo "reset" > ${BUILDKITE_AGENT_META_DATA_RESET_TARGET} while true; do sleep 3 - if grep -q clean /opt/amdgpu/etc/gpu_state; then + if grep -q clean ${BUILDKITE_AGENT_META_DATA_RESET_TARGET}; then echo "GPUs state is \"clean\"" break fi done echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="rocm/vllm-ci-private:${BUILDKITE_COMMIT}" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" docker pull "${image_name}" diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2 new file mode 100644 index 000000000000..7d7dcd77bebc --- /dev/null +++ b/.buildkite/test-template.j2 @@ -0,0 +1,58 @@ +{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %} +{% set docker_image_amd = "rocm/vllm-ci-private:$BUILDKITE_COMMIT" %} +{% set default_working_dir = "vllm/tests" %} +{% set hf_home = "/root/.cache/huggingface" %} + +steps: + - label: ":docker: build image" + depends_on: ~ + commands: + - "docker build --build-arg max_jobs=16 --tag {{ docker_image_amd }} -f docker/Dockerfile.rocm --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' --target test --progress plain ." 
+ - "docker push {{ docker_image_amd }}" + key: "amd-build" + env: + DOCKER_BUILDKIT: "1" + retry: + automatic: + - exit_status: -1 # Agent was lost + limit: 5 + - exit_status: -10 # Agent was lost + limit: 5 + agents: + queue: amd-cpu + soft_fail: false + + {% for step in steps %} + {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares %} + - label: "AMD MI300: {{ step.label }}" + depends_on: amd-build + agents: + {% if step.label and step.label=="Benchmarks" or step.label=="Kernels Attention Test %N" or step.label=="Kernels Quantization Test %N" %} + queue: amd_mi300_8 + {% elif step.label=="Distributed Tests (4 GPUs)" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" %} + queue: amd_mi300_4 + {% elif step.label=="Distributed Comm Ops Test" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Weight Loading Multiple GPU Test" or step.label=="Weight Loading Multiple GPU Test - Large Models" %} + queue: amd_mi300_2 + {% else %} + queue: amd_mi300_1 + {% endif%} + command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + priority: 100 + soft_fail: true + {% endif %} + {% endfor %} + {% for step in steps %} + {% if step.mirror_hardwares and mirror_hw in step.mirror_hardwares and (step.label and step.label=="Benchmarks" or step.label=="LoRA Test %N" or step.label=="Kernels Attention Test %N" or step.label=="Kernels Quantization Test %N" or step.label=="Distributed Tests (4 GPUs)" or step.label=="Distributed Comm Ops Test" or step.label=="2 Node Tests (4 GPUs in total)" or step.label=="Distributed Tests (2 GPUs)" or step.label=="Plugin Tests (2 GPUs)" or step.label=="Multi-step Tests (4 GPUs)" or step.label=="Pipeline Parallelism Test" or step.label=="LoRA TP Test (Distributed)" or step.label=="Weight Loading Multiple GPU Test" or step.label=="Weight Loading Multiple GPU Test - Large Models") %} + - label: "AMD MI250: {{ step.label }}" + depends_on: amd-build + agents: + queue: amd_mi250_8 + command: bash .buildkite/scripts/hardware_ci/run-amd-test.sh "(command rocm-smi || true) && export VLLM_LOGGING_LEVEL=DEBUG && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {{ (step.working_dir or default_working_dir) | safe }} ; {{ step.command or (step.commands | join(" && ")) | safe }}" + env: + DOCKER_BUILDKIT: "1" + priority: 100 + soft_fail: true + {% endif %} + {% endfor %} diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index e98ccd035ee9..db20f3ee0f99 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,50 +1,14 @@ # See https://help.github.com/articles/about-codeowners/ # for more info about CODEOWNERS file -# This lists cover the "core" components of vLLM that require careful review -/vllm/attention/backends/abstract.py @WoosukKwon @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/core @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/engine/llm_engine.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/executor/executor_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker_base.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/worker/worker.py 
@zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/sampler.py @zhuohan123 @youkaichao @alexm-redhat @comaniac @njhill -/vllm/model_executor/layers/quantization @mgoin @robertgshaw2-redhat @tlrmchlsmth -/vllm/model_executor/guided_decoding @mgoin @russellb @aarnphm -/vllm/multimodal @DarkLight1337 @ywang96 -/vllm/vllm_flash_attn @LucasWilkinson -/vllm/lora @jeejeelee -/vllm/reasoning @aarnphm -/vllm/entrypoints @aarnphm -CMakeLists.txt @tlrmchlsmth +* @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang -# vLLM V1 -/vllm/v1 @WoosukKwon @robertgshaw2-redhat @njhill @ywang96 @comaniac @alexm-redhat -/vllm/v1/structured_output @mgoin @russellb @aarnphm +/csrc/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang +/vllm/ @charlifu @mawong-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang -# Test ownership -/.buildkite/lm-eval-harness @mgoin @simon-mo -/tests/async_engine @njhill @robertgshaw2-redhat @simon-mo -/tests/basic_correctness/test_chunked_prefill @rkooo567 @comaniac -/tests/distributed/test_multi_node_assignment.py @youkaichao -/tests/distributed/test_pipeline_parallel.py @youkaichao -/tests/distributed/test_same_node.py @youkaichao -/tests/entrypoints @DarkLight1337 @robertgshaw2-redhat @simon-mo @aarnphm -/tests/entrypoints/llm/test_guided_generate.py @mgoin @russellb @aarnphm -/tests/kernels @tlrmchlsmth @WoosukKwon -/tests/model_executor/test_guided_processors.py @mgoin @russellb -/tests/models @DarkLight1337 @ywang96 -/tests/multi_step @alexm-redhat @comaniac -/tests/multimodal @DarkLight1337 @ywang96 -/tests/prefix_caching @comaniac @KuntaiDu -/tests/quantization @mgoin @robertgshaw2-redhat -/tests/spec_decode @njhill @LiuXiaoxuanPKU -/tests/test_inputs.py @DarkLight1337 @ywang96 -/tests/v1/entrypoints/llm/test_struct_output_generate.py @mgoin @russellb @aarnphm -/tests/v1/structured_output @mgoin @russellb @aarnphm -/tests/weight_loading @mgoin @youkaichao -/tests/lora @jeejeelee +fused_moe @divakar-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang -# Docs -/docs @hmellor -mkdocs.yaml @hmellor +/tests/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang +/.buildkite/ @Alexei-V-Ivanov-AMD @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang + +/benchmarks/profiling @AdrianAbeyta @dllehr-amd @shajrawi @gshtras @maleksan85 @sunway513 @hongxiayang diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 017ec7ca82da..9f4cc9233886 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,18 +1,3 @@ -## Essential Elements of an Effective PR Description Checklist -- [ ] The purpose of the PR, such as "Fix some issue (link existing issues this PR will resolve)". -- [ ] The test plan, such as providing test command. -- [ ] The test results, such as pasting the results comparison before and after, or e2e results -- [ ] (Optional) The necessary documentation update, such as updating `supported_models.md` and `examples` for a new model. +Please direct your PRs to the upstream vllm (https://github.com/vllm-project/vllm.git) -PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED. 
- -## Purpose - -## Test Plan - -## Test Result - -## (Optional) Documentation Update - - -**BEFORE SUBMITTING, PLEASE READ ** (anything written below this line will be removed by GitHub Actions) +Accepting PRs into the ROCm fork (https://github.com/ROCm/vllm) will require a clear previously communicated exception diff --git a/.github/workflows/lint-and-deploy.yaml b/.github/workflows/lint-and-deploy.yaml deleted file mode 100644 index 64011922ad82..000000000000 --- a/.github/workflows/lint-and-deploy.yaml +++ /dev/null @@ -1,85 +0,0 @@ -name: Lint and Deploy Charts - -on: pull_request - -permissions: - contents: read - -jobs: - lint-and-deploy: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - with: - fetch-depth: 0 - - - name: Set up Helm - uses: azure/setup-helm@b9e51907a09c216f16ebe8536097933489208112 # v4.3.0 - with: - version: v3.14.4 - - #Python is required because ct lint runs Yamale and yamllint which require Python. - - uses: actions/setup-python@42375524e23c412d93fb67b49958b491fce71c38 # v5.4.0 - with: - python-version: '3.13' - - - name: Set up chart-testing - uses: helm/chart-testing-action@0d28d3144d3a25ea2cc349d6e59901c4ff469b3b # v2.7.0 - with: - version: v3.10.1 - - - name: Run chart-testing (lint) - run: ct lint --target-branch ${{ github.event.repository.default_branch }} --chart-dirs examples/online_serving/chart-helm --charts examples/online_serving/chart-helm - - - name: Setup minio - run: | - docker network create vllm-net - docker run -d -p 9000:9000 --name minio --net vllm-net \ - -e "MINIO_ACCESS_KEY=minioadmin" \ - -e "MINIO_SECRET_KEY=minioadmin" \ - -v /tmp/data:/data \ - -v /tmp/config:/root/.minio \ - minio/minio server /data - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - export AWS_EC2_METADATA_DISABLED=true - mkdir opt-125m - cd opt-125m && curl -O -Ls "https://huggingface.co/facebook/opt-125m/resolve/main/{pytorch_model.bin,config.json,generation_config.json,merges.txt,special_tokens_map.json,tokenizer_config.json,vocab.json}" && cd .. - aws --endpoint-url http://127.0.0.1:9000/ s3 mb s3://testbucket - aws --endpoint-url http://127.0.0.1:9000/ s3 cp opt-125m/ s3://testbucket/opt-125m --recursive - - - name: Create kind cluster - uses: helm/kind-action@a1b0e391336a6ee6713a0583f8c6240d70863de3 # v1.12.0 - - - name: Build the Docker image vllm cpu - run: docker buildx build -f docker/Dockerfile.cpu -t vllm-cpu-env . 
- - - name: Configuration of docker images, network and namespace for the kind cluster - run: | - docker pull amazon/aws-cli:2.6.4 - kind load docker-image amazon/aws-cli:2.6.4 --name chart-testing - kind load docker-image vllm-cpu-env:latest --name chart-testing - docker network connect vllm-net "$(docker ps -aqf "name=chart-testing-control-plane")" - kubectl create ns ns-vllm - - - name: Run chart-testing (install) - run: | - export AWS_ACCESS_KEY_ID=minioadmin - export AWS_SECRET_ACCESS_KEY=minioadmin - sleep 30 && kubectl -n ns-vllm logs -f "$(kubectl -n ns-vllm get pods | awk '/deployment/ {print $1;exit}')" & - helm install --wait --wait-for-jobs --timeout 5m0s --debug --create-namespace --namespace=ns-vllm test-vllm examples/online_serving/chart-helm -f examples/online_serving/chart-helm/values.yaml --set secrets.s3endpoint=http://minio:9000 --set secrets.s3bucketname=testbucket --set secrets.s3accesskeyid=$AWS_ACCESS_KEY_ID --set secrets.s3accesskey=$AWS_SECRET_ACCESS_KEY --set resources.requests.cpu=1 --set resources.requests.memory=4Gi --set resources.limits.cpu=2 --set resources.limits.memory=5Gi --set image.env[0].name=VLLM_CPU_KVCACHE_SPACE --set image.env[1].name=VLLM_LOGGING_LEVEL --set-string image.env[0].value="1" --set-string image.env[1].value="DEBUG" --set-string extraInit.s3modelpath="opt-125m/" --set-string 'resources.limits.nvidia\.com/gpu=0' --set-string 'resources.requests.nvidia\.com/gpu=0' --set-string image.repository="vllm-cpu-env" - - - name: curl test - run: | - kubectl -n ns-vllm port-forward service/test-vllm-service 8001:80 & - sleep 10 - CODE="$(curl -v -f --location http://localhost:8001/v1/completions \ - --header "Content-Type: application/json" \ - --data '{ - "model": "opt-125m", - "prompt": "San Francisco is a", - "max_tokens": 7, - "temperature": 0 - }'):$CODE" - echo "$CODE" diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index bfd02879965e..f3dda4c25c79 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -16,7 +16,9 @@ jobs: release: # Retrieve tag and create release name: Create Release - runs-on: ubuntu-latest + runs-on: self-hosted + container: + image: rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0 outputs: upload_url: ${{ steps.create_release.outputs.upload_url }} steps: @@ -39,73 +41,42 @@ jobs: const script = require('.github/workflows/scripts/create_release.js') await script(github, context, core) - # NOTE(simon): No longer build wheel using GitHub Actions. See buildkite's release workflow. - # wheel: - # name: Build Wheel - # runs-on: ${{ matrix.os }} - # needs: release + wheel: + name: Build Wheel + runs-on: self-hosted + container: + image: rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0 + needs: release - # strategy: - # fail-fast: false - # matrix: - # os: ['ubuntu-20.04'] - # python-version: ['3.9', '3.10', '3.11', '3.12'] - # pytorch-version: ['2.4.0'] # Must be the most recent version that meets requirements/cuda.txt. 
- # cuda-version: ['11.8', '12.1'] + strategy: + fail-fast: false - # steps: - # - name: Checkout - # uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - - # - name: Setup ccache - # uses: hendrikmuhs/ccache-action@ed74d11c0b343532753ecead8a951bb09bb34bc9 # v1.2.14 - # with: - # create-symlink: true - # key: ${{ github.job }}-${{ matrix.python-version }}-${{ matrix.cuda-version }} - - # - name: Set up Linux Env - # if: ${{ runner.os == 'Linux' }} - # run: | - # bash -x .github/workflows/scripts/env.sh - - # - name: Set up Python - # uses: actions/setup-python@0b93645e9fea7318ecaed2b359559ac225c90a2b # v5.3.0 - # with: - # python-version: ${{ matrix.python-version }} - - # - name: Install CUDA ${{ matrix.cuda-version }} - # run: | - # bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} - - # - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} - # run: | - # bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} - - # - name: Build wheel - # shell: bash - # env: - # CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size - # run: | - # bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} - # wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) - # asset_name=${wheel_name//"linux"/"manylinux1"} - # echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" - # echo "asset_name=${asset_name}" >> "$GITHUB_ENV" + steps: + - name: Prepare + run: | + pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm6.2 + pip3 install -U triton - # - name: Upload Release Asset - # uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 - # env: - # GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - # with: - # upload_url: ${{ needs.release.outputs.upload_url }} - # asset_path: ./dist/${{ env.wheel_name }} - # asset_name: ${{ env.asset_name }} - # asset_content_type: application/* + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - # (Danielkinz): This last step will publish the .whl to pypi. 
Warning: untested - # - name: Publish package - # uses: pypa/gh-action-pypi-publish@release/v1.8 - # with: - # repository-url: https://test.pypi.org/legacy/ - # password: ${{ secrets.PYPI_API_TOKEN }} - # skip-existing: true + - name: Build wheel + shell: bash + env: + CMAKE_BUILD_TYPE: Release # do not compile with debug symbol to reduce wheel size + run: | + bash -x .github/workflows/scripts/build.sh + wheel_name=$(find dist -name "*whl" -print0 | xargs -0 -n 1 basename) + asset_name=${wheel_name//"linux"/"manylinux1"} + echo "wheel_name=${wheel_name}" >> "$GITHUB_ENV" + echo "asset_name=${asset_name}" >> "$GITHUB_ENV" + + - name: Upload vllm Release Asset + uses: actions/upload-release-asset@e8f9f06c4b078e705bd2ea027f0926603fc9b4d5 # v1.0.2 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ needs.release.outputs.upload_url }} + asset_path: ./dist/${{ env.wheel_name }} + asset_name: ${{ env.asset_name }} + asset_content_type: application/* diff --git a/.github/workflows/reminder_comment.yml b/.github/workflows/reminder_comment.yml deleted file mode 100644 index 16ae1aadb96b..000000000000 --- a/.github/workflows/reminder_comment.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: PR Reminder Comment Bot -permissions: - pull-requests: write -on: - pull_request_target: - types: [opened] -jobs: - pr_reminder: - runs-on: ubuntu-latest - steps: - - name: Remind to run full CI on PR - uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - with: - script: | - github.rest.issues.createComment({ - owner: context.repo.owner, - repo: context.repo.repo, - issue_number: context.issue.number, - body: '👋 Hi! Thank you for contributing to the vLLM project.\n\n' + - '💬 Join our developer Slack at https://slack.vllm.ai to discuss your PR in #pr-reviews, coordinate on features in #feat- channels, or join special interest groups in #sig- channels.\n\n' + - 'Just a reminder: PRs would not trigger full CI run by default. Instead, it would only run `fastcheck` CI which starts running only a small and essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your `fastcheck` build on Buildkite UI (linked in the PR checks section) and unblock them. 
If you do not have permission to unblock, ping `simon-mo` or `khluu` to add you in our Buildkite org.\n\n' +
- 'Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.\n\n' +
- 'To run CI, PR reviewers can either: Add `ready` label to the PR or enable auto-merge.\n\n' +
- '🚀'
- })
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
index 0f010832b465..2bb7b726194d 100644
--- a/.github/workflows/scripts/build.sh
+++ b/.github/workflows/scripts/build.sh
@@ -1,23 +1,20 @@
 #!/bin/bash
 set -eux
-python_executable=python$1
-cuda_home=/usr/local/cuda-$2
+python_executable=python3
 # Update paths
-PATH=${cuda_home}/bin:$PATH
-LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
-
 # Install requirements
-$python_executable -m pip install -r requirements/build.txt -r requirements/cuda.txt
+$python_executable -m pip install -r requirements/rocm.txt
 # Limit the number of parallel jobs to avoid OOM
 export MAX_JOBS=1
 # Make sure release wheels are built for the following architectures
-export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
-export VLLM_FA_CMAKE_GPU_ARCHES="80-real;90-real"
+export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
+
+rm -f "$(which sccache)"
-bash tools/check_repo.sh
+export MAX_JOBS=32
 # Build
 $python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/CMakeLists.txt b/CMakeLists.txt
index bd389823fbb2..3980db42020a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -769,7 +769,9 @@ if(VLLM_GPU_LANG STREQUAL "HIP")
   set(VLLM_ROCM_EXT_SRC
     "csrc/rocm/torch_bindings.cpp"
     "csrc/rocm/skinny_gemms.cu"
-    "csrc/rocm/attention.cu")
+    "csrc/rocm/attention.cu"
+    "csrc/rocm/fused_kernels.cu"
+    "csrc/rocm/custom.cu")
 define_gpu_extension_target(
   _rocm_C
diff --git a/ROCm_performance.md b/ROCm_performance.md
new file mode 100644
index 000000000000..f6c67637a968
--- /dev/null
+++ b/ROCm_performance.md
@@ -0,0 +1,19 @@
+# Overview of the optional performance features unique to https://github.com/ROCm/vllm
+
+## Triton attention
+The default attention function on ROCm uses the Triton attention kernel. To fall back to the https://github.com/ROCm/flash-attention implementation, set the following environment variable:
+`VLLM_USE_TRITON_FLASH_ATTN=0`
+
+## Tunable ops
+PyTorch tunable ops are supported.
+Set the environment variable `PYTORCH_TUNABLEOP_ENABLED=1` to enable both the runtime tuning and the subsequent use of tuned results. To only use the tuned results without tuning any newly encountered shapes, set `PYTORCH_TUNABLEOP_TUNING=0`.
+
+## Custom PagedAttention
+
+On ROCm, a custom paged attention kernel is available for better performance; it is controlled by the environment variable `VLLM_USE_ROCM_CUSTOM_PAGED_ATTN=1`.
+Currently, this variable is enabled by default. To fall back to the PagedAttention v2 kernel, set it to 0.
+The custom PagedAttention kernel is enabled for dtypes bf16 and fp16, block-size=16, head-size=128, and max context length <= 16k, with a GQA ratio (num_heads//num_kv_heads) between 1 and 16. In all other cases, we fall back to the PagedAttention v2 kernel.
+
+## NCCL performance environment variable
+
+For MI300x, setting the environment variable NCCL_MIN_NCHANNELS=112 is expected to improve performance.
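The knobs documented in ROCm_performance.md above are plain environment variables, so they can also be applied from Python before vLLM is imported. The sketch below is illustrative only and not part of this patch; the model name and the chosen values are assumptions to adapt.

```python
# Minimal sketch (not part of this patch): applying the ROCm performance knobs
# documented in ROCm_performance.md before constructing a vLLM engine.
import os

os.environ.setdefault("VLLM_USE_TRITON_FLASH_ATTN", "0")       # fall back to the ROCm flash-attention kernel
os.environ.setdefault("PYTORCH_TUNABLEOP_ENABLED", "1")        # enable PyTorch tunable ops
os.environ.setdefault("PYTORCH_TUNABLEOP_TUNING", "0")         # reuse previously tuned results only
os.environ.setdefault("VLLM_USE_ROCM_CUSTOM_PAGED_ATTN", "1")  # custom paged attention (already the default)
os.environ.setdefault("NCCL_MIN_NCHANNELS", "112")             # MI300x multi-GPU hint

# Import vLLM only after the environment is configured so the engine picks up the values.
from vllm import LLM, SamplingParams

llm = LLM(model="meta-llama/Llama-2-7b-chat-hf")  # placeholder model
outputs = llm.generate(["Hello, my name is"], SamplingParams(temperature=0.0, max_tokens=16))
print(outputs[0].outputs[0].text)
```

Using `setdefault` keeps any values already exported in the shell authoritative, so the same script works under both tuned and default environments.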
diff --git a/benchmarks/P3L.py b/benchmarks/P3L.py new file mode 100755 index 000000000000..92721a5e612f --- /dev/null +++ b/benchmarks/P3L.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +""" +Patch-Perplexity (P3L) + +This is a script that produces a realistic PPL measurement +for the quantized KV cache system by processing a sequence of +non-overlapping patches of the reference text. Generation of the +consecutive symbols in each patch is governed (forced) +by the reference text. + +The initial context size for the system is set by the parameter +"--context-size". + +The number of output symbols to generate starting from a given +context is set by the parameter "--sample-size". This variable also +defines the size of the individual patch. + +For the N-token reference text that is split into M patches with the +system's context size C it takes M*preload + (N-C)*generation time. + +Quick correctness validation tips: + +Running llama-2-7b model +( + ./vllm/examples/P3L.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 +) +should result in PPL ~ 6.524227946419175 + +Running llama-2-7b model +( + ./vllm/examples/P3L.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 + --patch-size=1 +) +should result in PPL ~ PPL=3.8968611189957523 + +Running the script with multiple batches is possible +by specifying the --batch-size parameter. + +""" + +import argparse +import dataclasses +import datetime +import json +import math +import os +import tempfile + +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +def get_wikitext2_text(tokenizer): + with tempfile.TemporaryDirectory() as tmpdirname: + hf_hub_download( + repo_id="alexei-v-ivanov-amd/wiki", + repo_type="dataset", + filename="wiki.test.raw", + local_dir=tmpdirname, + ) + with open(os.path.join(tmpdirname, "wiki.test.raw")) as f: + test_text = "\n".join(line.strip() for line in f) + test_enc = tokenizer(test_text) + + return test_enc, test_text + + +def vllm_init(args): + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=1, + temperature=0.0, + top_p=1, + ignore_eos=True, + ppl_measurement=True, + future_context=[], + prompt_logprobs=1, + logprobs=1, + presence_penalty=0.0, + ) + + return llm, sampling_params + + +def vllm_predict(CONT, llm, sampl_par): + result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par) + return result + + +def main(args: argparse.Namespace): + MESSAGE = f"Initialising @ {datetime.datetime.now()}" + logger.info(MESSAGE) + print(MESSAGE) + my_ppl = 0.0 + + logger.info("Initializing the engine.") + my_llm, my_sampl_par = vllm_init(args) + my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer + logger.info(my_sampl_par) + logger.info("Initialized the engine.") + + my_n_samples = args.sample_size + + if ( + args.context_size + my_n_samples + ) > my_llm.llm_engine.model_config.max_model_len: + MESSAGE = ( + "" + "Error! The total number of tokens:\n" + f" prefix ({args.context_size}) + " + f"to be generated ({my_n_samples})" + f" can't be bigger than the model limit " + f"({my_llm.llm_engine.model_config.max_model_len})." 
+ ) + logger.info(MESSAGE) + print(MESSAGE) + return + + my_test_enc, my_test_text = get_wikitext2_text(my_tokenizer) + logger.info("Loaded the test data.") + + my_n_patches = math.ceil( + (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples + ) + if args.patch_size is not None: + my_n_patches = args.patch_size + + num_tokens_generated = 0 + starting_time = datetime.datetime.now() + MESSAGE = ( + f"Starting generation @ {starting_time}\n" + " Have the test sample of " + f"{len(my_test_enc['input_ids'])} tokens" + f" will try to process {my_n_patches} patche(s)," + f" generating {my_n_samples} tokens in each patch" + f" from the initial context of {args.context_size} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + my_batchsize = args.batch_size + + for c in range(0, my_n_patches, my_batchsize): + CONTEXT = [] + my_sampl_par.future_context = [] + my_sampl_par.cntr = [] + + for b in range(my_batchsize): + if (c + b) < my_n_patches: + upper_boundary = min( + (c + b + 1) * my_n_samples + args.context_size, + len(my_test_enc["input_ids"]), + ) + CONTEXT.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples : (c + b) * my_n_samples + + args.context_size + ] + ) + + my_sampl_par.future_context.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples + args.context_size : upper_boundary + ] + ) + + my_sampl_par.cntr.append(c + b) + + my_sampl_par.max_tokens = max( + len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)) + ) + + LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par) + for b in range(len(CONTEXT)): + num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids) + my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob + + if num_tokens_generated < my_n_samples * len(CONTEXT): + MESSAGE = ( + f"Warning: The number of generated tokens is" + f"less than requested ({num_tokens_generated}" + f" < {my_n_samples * len(CONTEXT)})." + ) + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"Iterations {c + 1} through {c + len(CONTEXT)}" + f" of {my_n_patches} Intermediate " + "Estimates:\n" + f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n" + f"\tPerplexity_intermediate=" + f"{math.exp(my_ppl / num_tokens_generated)}" + ) + + logger.info(MESSAGE) + print(MESSAGE) + + ending_time = datetime.datetime.now() + MESSAGE = ( + f"Done @ {ending_time} after processing for" + f" {ending_time - starting_time}" + f" generated {num_tokens_generated} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" + f"{my_ppl / num_tokens_generated}" + f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}" + ) + + if args.output_json: + results = { + "integral_cross_entropy": my_ppl, + "average_cross_entropy": my_ppl / num_tokens_generated, + "ppl": math.exp(my_ppl / num_tokens_generated), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + logger.info(MESSAGE) + print(MESSAGE) + return + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Measure the PPPL (P3L) score of a given model." 
+ ) + parser.add_argument("--context-size", type=int, default=4096) + parser.add_argument("--sample-size", type=int, default=512) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--patch-size", type=int, default=None) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/P3L_mling.py b/benchmarks/P3L_mling.py new file mode 100755 index 000000000000..b4e1c0bf9e47 --- /dev/null +++ b/benchmarks/P3L_mling.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +""" +*MULTILINGUAL* Patch-Perplexity (P3L) + +This is a script that produces a realistic PPL measurement +for the quantized KV cache system by processing a sequence of +non-overlapping patches of the reference text. Generation of the +consecutive symbols in each patch is governed (forced) +by the reference text. + +The initial context size for the system is set by the parameter +"--context-size". + +The number of output symbols to generate starting from a given +context is set by the parameter "--sample-size". This variable also +defines the size of the individual patch. + +For the N-token reference text that is split into M patches with the +system's context size C it takes M*preload + (N-C)*generation time. + +Quick correctness validation tips: + +Running DeepSeek-V2 model +( + ./vllm/examples/P3L_mling.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 +) + +should result in PPL ~ 8.42927 + +Running DeepSeek-V2 model +( + ./vllm/examples/P3L_mling.py + --model=meta-llama/Llama-2-7b-chat-hf + --context-size=1024 + --sample-size=512 + --patch-size=1 + --lang-script="cmn_Hant" +) +should result in PPL ~ 2.67962 + +The multi-linguality is implemented through the additional +key "--lang-script", which defaults to English in Latin +scripture ("eng_Latn"). + +Please refer to + +https://confluence.amd.com/display/MLSE/Multi-Lingual+P3L+Test + +for the complete set of possible language-scripture choices. + +Running the script with multiple batches is possible +by specifying the --batch-size parameter. 
+ +""" + +import argparse +import dataclasses +import datetime +import json +import math +import os +import tempfile + +import pandas +from huggingface_hub import hf_hub_download + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.logger import init_logger +from vllm.utils import FlexibleArgumentParser + +logger = init_logger(__name__) + + +def get_wikitext2_text(tokenizer): + with tempfile.TemporaryDirectory() as tmpdirname: + hf_hub_download( + repo_id="alexei-v-ivanov-amd/wiki", + repo_type="dataset", + filename="wiki.test.raw", + local_dir=tmpdirname, + ) + with open(os.path.join(tmpdirname, "wiki.test.raw")) as f: + test_text = "\n".join(line.strip() for line in f) + test_enc = tokenizer(test_text) + + return test_enc, test_text + + +def get_flores_plus_text(tokenizer, lng_script): + hf_hub_download( + repo_id="alexei-v-ivanov-amd/flores_plus", + repo_type="dataset", + filename=lng_script + ".parquet", + local_dir="./", + ) + + df = pandas.read_parquet("./" + lng_script + ".parquet") + test_text = "\n\n".join(line.strip() for line in df["text"]) + test_enc = tokenizer(test_text) + + os.remove("./" + lng_script + ".parquet") + + return test_enc, test_text + + +def vllm_init(args): + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=1, + temperature=0.0, + top_p=1, + ignore_eos=True, + ppl_measurement=True, + future_context=[], + prompt_logprobs=1, + logprobs=1, + presence_penalty=0.0, + ) + + return llm, sampling_params + + +def vllm_predict(CONT, llm, sampl_par): + result = llm.generate(prompt_token_ids=CONT, sampling_params=sampl_par) + return result + + +def main(args: argparse.Namespace): + MESSAGE = f"Initialising @ {datetime.datetime.now()}" + logger.info(MESSAGE) + print(MESSAGE) + my_ppl = 0.0 + + logger.info("Initializing the engine.") + my_llm, my_sampl_par = vllm_init(args) + my_tokenizer = my_llm.llm_engine.tokenizer.tokenizer + logger.info(my_sampl_par) + logger.info("Initialized the engine.") + + my_n_samples = args.sample_size + my_lang_script = args.lang_script + + if ( + args.context_size + my_n_samples + ) > my_llm.llm_engine.model_config.max_model_len: + MESSAGE = ( + "" + "Error! The total number of tokens:\n" + f" prefix ({args.context_size}) + " + f"to be generated ({my_n_samples})" + f" can't be bigger than the model limit " + f"({my_llm.llm_engine.model_config.max_model_len})." + ) + logger.info(MESSAGE) + print(MESSAGE) + return + + my_test_enc, my_test_text = get_flores_plus_text(my_tokenizer, my_lang_script) + + logger.info("Loaded the test data.") + + my_n_patches = math.ceil( + (len(my_test_enc["input_ids"]) - args.context_size - 1) / my_n_samples + ) + if args.patch_size is not None: + my_n_patches = args.patch_size + + num_tokens_generated = 0 + starting_time = datetime.datetime.now() + MESSAGE = ( + f"Starting generation @ {starting_time}\n" + " Have the test sample of " + f"{len(my_test_enc['input_ids'])} tokens" + f" will try to process {my_n_patches} patche(s)," + f" generating {my_n_samples} tokens in each patch" + f" from the initial context of {args.context_size} tokens." 
+ ) + + logger.info(MESSAGE) + print(MESSAGE) + + my_batchsize = args.batch_size + + for c in range(0, my_n_patches, my_batchsize): + CONTEXT = [] + my_sampl_par.future_context = [] + my_sampl_par.cntr = [] + + for b in range(my_batchsize): + if (c + b) < my_n_patches: + upper_boundary = min( + (c + b + 1) * my_n_samples + args.context_size, + len(my_test_enc["input_ids"]), + ) + CONTEXT.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples : (c + b) * my_n_samples + + args.context_size + ] + ) + + my_sampl_par.future_context.append( + my_test_enc["input_ids"][ + (c + b) * my_n_samples + args.context_size : upper_boundary + ] + ) + + my_sampl_par.cntr.append(c + b) + + my_sampl_par.max_tokens = max( + len(my_sampl_par.future_context[b]) for b in range(len(CONTEXT)) + ) + + LOGPROBS = vllm_predict(CONTEXT, my_llm, my_sampl_par) + for b in range(len(CONTEXT)): + num_tokens_generated += len(LOGPROBS[b].outputs[0].token_ids) + my_ppl -= LOGPROBS[b].outputs[0].cumulative_logprob + + if num_tokens_generated < my_n_samples * len(CONTEXT): + MESSAGE = ( + f"Warning: The number of generated tokens is" + f"less than requested ({num_tokens_generated}" + f" < {my_n_samples * len(CONTEXT)})." + ) + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"Iterations {c + 1} through {c + len(CONTEXT)}" + f" of {my_n_patches} Intermediate " + "Estimates:\n" + f"\tCross-entropy_intermediate={my_ppl / num_tokens_generated}\n" + f"\tPerplexity_intermediate=" + f"{math.exp(my_ppl / num_tokens_generated)}" + ) + + logger.info(MESSAGE) + print(MESSAGE) + + ending_time = datetime.datetime.now() + MESSAGE = ( + f"Done @ {ending_time} after processing for" + f" {ending_time - starting_time}" + f" generated {num_tokens_generated} tokens." + ) + + logger.info(MESSAGE) + print(MESSAGE) + + MESSAGE = ( + f"\tIntegral Cross-Entropy={my_ppl}\n\tAverage Cross-Entropy=" + f"{my_ppl / num_tokens_generated}" + f"\n\tPPL={math.exp(my_ppl / num_tokens_generated)}" + ) + + if args.output_json: + results = { + "integral_cross_entropy": my_ppl, + "average_cross_entropy": my_ppl / num_tokens_generated, + "ppl": math.exp(my_ppl / num_tokens_generated), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + logger.info(MESSAGE) + print(MESSAGE) + return + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Measure the PPPL (P3L) score of a given model." 
+ ) + parser.add_argument( + "--data", + type=str, + default="./wikitext/wikitext-2-v1/test-00000-of-00001.parquet", + ) + parser.add_argument("--context-size", type=int, default=4096) + parser.add_argument("--sample-size", type=int, default=512) + parser.add_argument("--batch-size", type=int, default=1) + parser.add_argument("--patch-size", type=int, default=None) + parser.add_argument("--lang-script", type=str, default="eng_Latn") + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + + main(args) diff --git a/benchmarks/kernels/moe_tune_script.sh b/benchmarks/kernels/moe_tune_script.sh new file mode 100755 index 000000000000..acd2502e0587 --- /dev/null +++ b/benchmarks/kernels/moe_tune_script.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + +## ---- Mixtral fp8 tuning example ---- ## +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 1 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 2 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 4 --tune --dtype fp8_w8a8 +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-Instruct-v0.1-FP8/ --tp-size 8 --tune --dtype fp8_w8a8 + + +## ---- Mixtral fp16 tuning example ---- ## +# we don't need --dtype fp16; it has been set as default for rocm in the script. + +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 1 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 2 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 4 --tune +python benchmark_moe.py --model /data/models/mistral-ai-models/Mixtral-8x22B-v0.1/ --tp-size 8 --tune + + + +## ---- After the tuning is finished ---- ## +# The tuning script saves the configurations in a json file at the same directory from where you launch the script. +# The name of the json file will look something like this: E=8,N=14336,device_name=AMD_Instinct_MI300X.json +# +# [IMPORTANT] -> Once the tuning is complete, move the tuned config file(s) to the following path: +# vllm/vllm/model_executor/layers/fused_moe/configs/ + + +## ---- Notes ---- ## +# 1. The tuned file is specific for a TP size. This means a tuned file obtained for --tp-size 8 can only be used when running the model under TP=8 setting. +# 2. The script uses Ray for multi-gpu tuning. Export HIP_VISIBLE_DEVICES accordingly to expose the required no. of GPUs and use multiple gpus for tuning. +# 3. RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 resolves the following errors (depending on if HIP_VISIBLE_DEVICES is set or not): +# - Error-1: RuntimeError: HIP error: invalid device ordinal +# HIP kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. 
+# For debugging consider passing AMD_SERIALIZE_KERNEL=3
+# - Error-2: RuntimeError: HIP_VISIBLE_DEVICES contains more devices than ROCR_VISIBLE_DEVICES
+
diff --git a/benchmarks/profiling/README.md b/benchmarks/profiling/README.md
new file mode 100644
index 000000000000..ee65e8025cc5
--- /dev/null
+++ b/benchmarks/profiling/README.md
@@ -0,0 +1,57 @@
+# vLLM Benchmark Profiling
+
+This profiling directory provides a method to profile vLLM throughput and latency benchmarks using ROCm profiling utilities.
+
+## 1. Dependencies
+
+Before using the profiling feature, you need to install the required dependencies:
+
+### Install ROCm Profile Data
+
+```bash
+git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git
+cd rocmProfileData && make && sudo make install
+```
+
+### Install hipMarker
+
+```bash
+cd rocmProfileData/hipMarker && python3 setup.py install
+```
+
+## 2. Profiling Benchmarks
+
+Profiling can be used to monitor the performance of the vLLM benchmarks with ROCm. The key flags used for profiling are:
+
+- `--profile-rpd`: Profiles the generation process of a single batch.
+- `--profile-dir PROFILE_DIR`: Specifies the path to save the profiler output, which can later be visualized using tools like [ui.perfetto.dev](https://ui.perfetto.dev/) or [chrome.tracing](chrome://tracing/).
+
+### Profiling Using Default Directory
+
+By default, profiling results are saved in either `vllm_benchmark_latency_result` or `vllm_benchmark_throughput_result`. To run a benchmark and profile it using the default directory, execute:
+
+```bash
+python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd
+```
+
+### Profiling With a Custom Directory
+
+You can specify a custom directory for saving profiler outputs by using the `--profile-dir` flag:
+
+```bash
+python3 benchmark_throughput.py --input-len {len} --output-len {len} --model {model} --profile-rpd --profile-dir {/path/to/custom/dir}
+```
+
+After profiling is complete, an `.rpd` file containing the trace data will be saved to the specified directory.
+
+## 3. Convert Trace Data to JSON Format
+
+To view the trace data, it needs to be converted into a format that is compatible with tools like Chrome tracing or Perfetto.
+
+You can use the `rpd2tracing.py` script in rocmProfileData to convert the `.rpd` file into a JSON file:
+
+```bash
+python3 rocmProfileData/tools/rpd2tracing.py trace.rpd trace.json
+```
+
+Once the trace is converted, open the `.json` file in [Chrome](chrome://tracing/) or [Perfetto](https://ui.perfetto.dev/) for visualization.
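The capture-and-convert workflow in this README can also be driven end to end from a small helper. The sketch below is only an illustration and rests on assumptions: the profiling benchmark lives at `benchmarks/profiling/benchmark_throughput.py`, the `rocmProfileData` checkout sits in the working directory, and the model name and lengths are placeholders.

```python
# Sketch (assumptions noted above): capture an .rpd trace with the profiling
# throughput benchmark, then convert every resulting trace to Chrome-trace JSON.
import subprocess
from pathlib import Path

profile_dir = Path("./rpd_traces")
profile_dir.mkdir(parents=True, exist_ok=True)

# Step 1: run the benchmark with rpd profiling enabled (flags mirror the README examples).
subprocess.run(
    [
        "python3", "benchmarks/profiling/benchmark_throughput.py",
        "--model", "meta-llama/Llama-2-7b-chat-hf",  # placeholder model
        "--input-len", "512",
        "--output-len", "128",
        "--num-prompts", "64",
        "--profile-rpd",
        "--profile-dir", str(profile_dir),
    ],
    check=True,
)

# Step 2: convert each .rpd trace for ui.perfetto.dev or chrome://tracing.
for rpd_file in profile_dir.glob("*.rpd"):
    subprocess.run(
        [
            "python3", "rocmProfileData/tools/rpd2tracing.py",
            str(rpd_file), str(rpd_file.with_suffix(".json")),
        ],
        check=True,
    )
```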
diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py new file mode 100644 index 000000000000..f565314b2458 --- /dev/null +++ b/benchmarks/profiling/benchmark_latency.py @@ -0,0 +1,202 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Benchmark the latency of processing a single batch of requests.""" + +import argparse +import dataclasses +import json +import os +import time +from contextlib import contextmanager, nullcontext +from pathlib import Path +from typing import Optional + +import numpy as np +import torch +from tqdm import tqdm + +from vllm import LLM, SamplingParams +from vllm.engine.arg_utils import EngineArgs +from vllm.inputs import PromptType +from vllm.sampling_params import BeamSearchParams +from vllm.utils import FlexibleArgumentParser + + +def main(args: argparse.Namespace): + print(args) + + @contextmanager + def rpd_profiler_context(): + from rpdTracerControl import rpdTracerControl as rpd + + llm.start_profile() + yield + llm.stop_profile() + rpd.top_totals() + + @contextmanager + def torch_profiler_context(profile_result_dir: Optional[str] = None): + p = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler( + str(profile_result_dir) + ), + ) + p.start() + try: + with torch.no_grad(): + yield p + finally: + p.stop() + print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + + def get_profiling_context(profile_result_dir: Optional[str] = None): + if args.profile_torch: + return torch_profiler_context(profile_result_dir) + elif args.profile_rpd: + return rpd_profiler_context() + else: + return nullcontext() + + if args.profile_torch or args.profile_rpd: + profile_result_dir = Path( + args.profile_result_dir or "./vllm_benchmark_latency_result" + ) + profile_result_dir.mkdir(parents=True, exist_ok=True) + name = os.path.basename(os.path.normpath(args.model)) + model_trace_name = ( + f"{name}_in_{args.input_len}_out_{args.output_len}_" + f"batch_{args.batch_size}_tp_{args.tensor_parallel_size}" + ) + print(f"Profiling (results will be saved to '{profile_result_dir}')...") + if args.profile_rpd: + profile_result_dir /= f"{model_trace_name}.rpd" + os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_result_dir) + + engine_args = EngineArgs.from_cli_args(args) + + # NOTE(woosuk): If the request cannot be processed in a single batch, + # the engine will automatically process the request in multiple batches. 
+ llm = LLM(**dataclasses.asdict(engine_args)) + + sampling_params = SamplingParams( + n=args.n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=args.output_len, + ) + print(sampling_params) + dummy_prompt_token_ids = np.random.randint( + 10000, size=(args.batch_size, args.input_len) + ) + dummy_prompts: list[PromptType] = [ + {"prompt_token_ids": batch} for batch in dummy_prompt_token_ids.tolist() + ] + + def llm_generate(): + if not args.use_beam_search: + llm.generate(dummy_prompts, sampling_params=sampling_params, use_tqdm=False) + else: + llm.beam_search( + dummy_prompts, + BeamSearchParams( + beam_width=args.n, + max_tokens=args.output_len, + ignore_eos=True, + ), + ) + + def run_to_completion(profile_dir: Optional[str] = None): + if profile_dir: + with get_profiling_context(profile_dir): + llm_generate() + else: + start_time = time.perf_counter() + llm_generate() + end_time = time.perf_counter() + latency = end_time - start_time + return latency + + print("Warming up...") + for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"): + run_to_completion(profile_dir=None) + + if args.profile_torch or args.profile_rpd: + run_to_completion(profile_dir=profile_result_dir) + return + + # Benchmark. + latencies = [] + for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): + latencies.append(run_to_completion(profile_dir=None)) + latencies = np.array(latencies) + percentages = [10, 25, 50, 75, 90, 99] + percentiles = np.percentile(latencies, percentages) + print(f"Avg latency: {np.mean(latencies)} seconds") + for percentage, percentile in zip(percentages, percentiles): + print(f"{percentage}% percentile latency: {percentile} seconds") + + # Output JSON results if specified + if args.output_json: + results = { + "avg_latency": np.mean(latencies), + "latencies": latencies.tolist(), + "percentiles": dict(zip(percentages, percentiles.tolist())), + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description="Benchmark the latency of processing a single batch of " + "requests till completion." + ) + parser.add_argument("--input-len", type=int, default=32) + parser.add_argument("--output-len", type=int, default=128) + parser.add_argument("--batch-size", type=int, default=8) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument("--use-beam-search", action="store_true") + parser.add_argument( + "--num-iters-warmup", + type=int, + default=10, + help="Number of iterations to run for warmup.", + ) + parser.add_argument( + "--num-iters", type=int, default=30, help="Number of iterations to run." + ) + parser.add_argument( + "--profile-torch", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-rpd", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-result-dir", + type=str, + default=os.getenv("VLLM_RPD_PROFILER_DIR", default=None), + help=( + "path to save the profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." 
+ ), + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the latency results in JSON format.", + ) + + parser = EngineArgs.add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py new file mode 100644 index 000000000000..2cc767595b12 --- /dev/null +++ b/benchmarks/profiling/benchmark_throughput.py @@ -0,0 +1,636 @@ +# SPDX-License-Identifier: Apache-2.0 +"""Benchmark offline inference throughput.""" + +import argparse +import dataclasses +import json +import os +import random +import time +from contextlib import contextmanager, nullcontext +from functools import cache +from pathlib import Path +from typing import Optional + +import torch +import uvloop +from PIL import Image +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase + +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args, +) +from vllm.inputs import TextPrompt +from vllm.lora.request import LoRARequest +from vllm.lora.utils import get_adapter_absolute_path +from vllm.multimodal import MultiModalDataDict +from vllm.sampling_params import BeamSearchParams +from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer +from vllm.utils import FlexibleArgumentParser, merge_async_iterators + + +@dataclasses.dataclass +class SampleRequest: + """A class representing a single inference request for benchmarking. + + Attributes: + prompt: The input text prompt for the model. + prompt_len: The length of the prompt in tokens. + expected_output_len: The expected length of the output in tokens. + multi_modal_data: Optional dictionary containing multi-modal data (e.g. + images). + lora_request: Optional LoRARequest specifying the LoRA to use. + """ + + prompt: str + prompt_len: int + expected_output_len: int + multi_modal_data: Optional[MultiModalDataDict] = None + lora_request: Optional[LoRARequest] = None + + +def _get_prompt_for_image_model(question: str, *, model: str) -> str: + """Prepend and append special tokens around the question to form a prompt. 
+ + Args: + question: The input question text to wrap with special tokens + model: The name of the model being used, to determine which special + tokens to add + + Returns: + The formatted prompt string with appropriate special tokens for the + model + + Raises: + ValueError: If an unsupported model name is provided + """ + model = model.lower() + if "pixtral" in model: + return f"[INST]{question}\n[IMG][/INST]" + raise ValueError(f"Unsupported model {model}") + + +@cache +def lora_path_on_disk(lora_path: str) -> str: + return get_adapter_absolute_path(lora_path) + + +lora_tokenizer_cache: dict[int, AnyTokenizer] = {} + + +def get_random_lora_request( + args: argparse.Namespace, +) -> tuple[LoRARequest, Optional[AnyTokenizer]]: + global lora_tokenizer_cache + lora_id = random.randint(1, args.max_loras) + lora_request = LoRARequest( + lora_name=str(lora_id), + lora_int_id=lora_id, + lora_path=lora_path_on_disk(args.lora_path), + ) + if lora_id not in lora_tokenizer_cache: + lora_tokenizer_cache[lora_id] = get_lora_tokenizer(lora_request) + return lora_request, lora_tokenizer_cache[lora_id] + + +def sample_requests( + tokenizer: PreTrainedTokenizerBase, args: argparse.Namespace +) -> list[SampleRequest]: + dataset_path: str = args.dataset + num_requests: int = args.num_prompts + fixed_output_len: Optional[int] = args.output_len + model: str = args.model + if fixed_output_len is not None and fixed_output_len < 4: + raise ValueError("output_len too small") + + # Load the dataset. + with open(dataset_path) as f: + dataset = json.load(f) + # Filter out the conversations with less than 2 turns. + dataset = [data for data in dataset if len(data["conversations"]) >= 2] + # Shuffle the dataset. + random.shuffle(dataset) + + # Filter out sequences that are too long or too short + filtered_dataset: list[SampleRequest] = [] + for data in tqdm(dataset, total=len(filtered_dataset), desc="sampling requests"): + if len(filtered_dataset) == num_requests: + break + + # Only keep the first two turns of each conversation. + prompt = data["conversations"][0]["value"] + completion = data["conversations"][1]["value"] + + multi_modal_data: Optional[MultiModalDataDict] = None + if "image" in data: + multi_modal_data = multi_modal_data or {} + image_path = data["image"] + # TODO(vllm-project/vllm/issues/9778): Support multiple images. + assert isinstance(image_path, str), "Only support single image input" + try: + multi_modal_data["image"] = Image.open(image_path).convert("RGB") + except FileNotFoundError: + # Ignore datapoint where asset is missing + continue + prompt = _get_prompt_for_image_model(question=prompt, model=model) + + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Tokenize the prompts and completions. + prompt_token_ids = request_tokenizer(prompt).input_ids + completion_token_ids = request_tokenizer(completion).input_ids + prompt_len = len(prompt_token_ids) + output_len = ( + len(completion_token_ids) if fixed_output_len is None else fixed_output_len + ) + if prompt_len < 4 or output_len < 4: + # Prune too short sequences. + continue + if prompt_len > 1024 or prompt_len + output_len > 2048: + # Prune too long sequences. 
+ continue + filtered_dataset.append( + SampleRequest( + prompt=prompt, + prompt_len=prompt_len, + expected_output_len=output_len, + multi_modal_data=multi_modal_data, + lora_request=lora_request, + ) + ) + + return filtered_dataset + + +def run_vllm( + requests: list[SampleRequest], + n: int, + engine_args: EngineArgs, +) -> float: + from vllm import LLM, SamplingParams + + @contextmanager + def rpd_profiler_context(): + from rpdTracerControl import rpdTracerControl as rpd + + llm.start_profile() + yield + llm.stop_profile() + rpd.top_totals() + + @contextmanager + def torch_profiler_context(profile_dir: Optional[str] = None): + p = torch.profiler.profile( + activities=[ + torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ], + on_trace_ready=torch.profiler.tensorboard_trace_handler(str(profile_dir)), + ) + p.start() + try: + with torch.no_grad(): + yield p + finally: + p.stop() + print(p.key_averages().table(sort_by="self_cuda_time_total", row_limit=-1)) + + def get_profiling_context(profile_dir: Optional[str] = None): + if args.profile_torch: + return torch_profiler_context(profile_dir) + elif args.profile_rpd: + return rpd_profiler_context() + else: + return nullcontext() + + if args.profile_torch or args.profile_rpd: + profile_dir = Path(args.profile_dir or "./vllm_benchmark_throughput_result") + profile_dir.mkdir(parents=True, exist_ok=True) + name = os.path.basename(os.path.normpath(args.model)) + model_trace_name = ( + f"{name}_in_{args.input_len}_out_{args.output_len}_" + f"tp_{args.tensor_parallel_size}" + ) + print(f"Profiling (results will be saved to '{profile_dir}')...") + if args.profile_rpd: + profile_dir /= f"{model_trace_name}.rpd" + os.environ["VLLM_RPD_PROFILER_DIR"] = str(profile_dir) + + llm = LLM(**dataclasses.asdict(engine_args)) + + # Add the requests to the engine. + prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + for request in requests: + prompts.append( + TextPrompt(prompt=request.prompt, multi_modal_data=request.multi_modal_data) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.expected_output_len, + ) + ) + lora_requests: Optional[list[LoRARequest]] = None + if engine_args.enable_lora: + lora_requests = [request.lora_request for request in requests] + + use_beam_search = False + + if not use_beam_search: + execute = lambda: llm.generate( + prompts, sampling_params, lora_request=lora_requests, use_tqdm=True + ) + else: + assert lora_requests is None, "BeamSearch API does not support LoRA" + prompts = [request.prompt for request in requests] + # output_len should be the same for all requests. + output_len = requests[0][2] + for request in requests: + assert request.expected_output_len == output_len + execute = lambda: llm.beam_search( + prompts, + BeamSearchParams( + beam_width=n, + max_tokens=output_len, + ignore_eos=True, + ), + ) + + if args.profile_torch or args.profile_rpd: + with get_profiling_context(profile_dir): + execute() + return + else: + start = time.perf_counter() + execute() + end = time.perf_counter() + return end - start + + +async def run_vllm_async( + requests: list[SampleRequest], + n: int, + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, +) -> float: + from vllm import SamplingParams + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing + ) as llm: + # Add the requests to the engine. 
+ prompts: list[TextPrompt] = [] + sampling_params: list[SamplingParams] = [] + lora_requests: list[Optional[LoRARequest]] = [] + for request in requests: + prompts.append( + TextPrompt( + prompt=request.prompt, multi_modal_data=request.multi_modal_data + ) + ) + sampling_params.append( + SamplingParams( + n=n, + temperature=1.0, + top_p=1.0, + ignore_eos=True, + max_tokens=request.lora_requests, + ) + ) + lora_requests.append(request.lora_request) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp, lr) in enumerate( + zip(prompts, sampling_params, lora_requests) + ): + generator = llm.generate(prompt, sp, lora_request=lr, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + +def run_hf( + requests: list[SampleRequest], + model: str, + tokenizer: PreTrainedTokenizerBase, + n: int, + max_batch_size: int, + trust_remote_code: bool, +) -> float: + llm = AutoModelForCausalLM.from_pretrained( + model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code + ) + if llm.config.model_type == "llama": + # To enable padding in the HF backend. + tokenizer.pad_token = tokenizer.eos_token + llm = llm.cuda() + + pbar = tqdm(total=len(requests)) + start = time.perf_counter() + batch: list[str] = [] + max_prompt_len = 0 + max_output_len = 0 + for i in range(len(requests)): + prompt, prompt_len, output_len = requests[i] + # Add the prompt to the batch. + batch.append(prompt) + max_prompt_len = max(max_prompt_len, prompt_len) + max_output_len = max(max_output_len, output_len) + if len(batch) < max_batch_size and i != len(requests) - 1: + # Check if we can add more requests to the batch. + _, next_prompt_len, next_output_len = requests[i + 1] + if ( + max(max_prompt_len, next_prompt_len) + + max(max_output_len, next_output_len) + ) <= 2048: + # We can add more requests to the batch. + continue + + # Generate the sequences. + input_ids = tokenizer(batch, return_tensors="pt", padding=True).input_ids + llm_outputs = llm.generate( + input_ids=input_ids.cuda(), + do_sample=True, + num_return_sequences=n, + temperature=1.0, + top_p=1.0, + use_cache=True, + max_new_tokens=max_output_len, + ) + # Include the decoding time. + tokenizer.batch_decode(llm_outputs, skip_special_tokens=True) + pbar.update(len(batch)) + + # Clear the batch. + batch = [] + max_prompt_len = 0 + max_output_len = 0 + end = time.perf_counter() + return end - start + + +def run_mii( + requests: list[SampleRequest], + model: str, + tensor_parallel_size: int, + output_len: int, +) -> float: + from mii import client, serve + + llm = serve(model, tensor_parallel=tensor_parallel_size) + prompts = [request.prompt for request in requests] + + start = time.perf_counter() + llm.generate(prompts, max_new_tokens=output_len) + end = time.perf_counter() + client = client(model) + client.terminate_server() + return end - start + + +def main(args: argparse.Namespace): + print(args) + random.seed(args.seed) + + # Sample the requests. 
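+    # If --dataset is not provided, prompts are synthesized below from random
+    # token ids (re-tokenized until the length matches --input-len); otherwise
+    # requests are sampled from the JSON dataset via sample_requests().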
+ tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer, trust_remote_code=args.trust_remote_code + ) + if args.dataset is None: + vocab_size = tokenizer.vocab_size + requests = [] + for _ in range(args.num_prompts): + request_tokenizer = tokenizer + lora_request: Optional[LoRARequest] = None + if args.enable_lora: + lora_request, lora_tokenizer = get_random_lora_request(args) + if lora_tokenizer: + request_tokenizer = lora_tokenizer + + # Synthesize a prompt with the given input length. + candidate_ids = [ + random.randint(0, vocab_size - 1) for _ in range(args.input_len) + ] + # As tokenizer may add additional tokens like BOS, we need to try + # different lengths to get the desired input length. + for _ in range(5): # Max attempts to correct + candidate_prompt = request_tokenizer.decode(candidate_ids) + tokenized_len = len(request_tokenizer.encode(candidate_prompt)) + + if tokenized_len == args.input_len: + break + + # Adjust length based on difference + diff = args.input_len - tokenized_len + if diff > 0: + candidate_ids.extend( + [random.randint(100, vocab_size - 100) for _ in range(diff)] + ) + else: + candidate_ids = candidate_ids[:diff] + requests.append( + SampleRequest( + prompt=candidate_prompt, + prompt_len=args.input_len, + expected_output_len=args.output_len, + lora_request=lora_request, + ) + ) + else: + requests = sample_requests(tokenizer, args) + + is_multi_modal = any(request.multi_modal_data is not None for request in requests) + + if args.backend == "vllm": + if args.async_engine: + elapsed_time = uvloop.run( + run_vllm_async( + requests, + args.n, + AsyncEngineArgs.from_cli_args(args), + args.disable_frontend_multiprocessing, + ) + ) + else: + elapsed_time = run_vllm(requests, args.n, EngineArgs.from_cli_args(args)) + elif args.backend == "hf": + assert args.tensor_parallel_size == 1 + elapsed_time = run_hf( + requests, + args.model, + tokenizer, + args.n, + args.hf_max_batch_size, + args.trust_remote_code, + ) + elif args.backend == "mii": + elapsed_time = run_mii( + requests, args.model, args.tensor_parallel_size, args.output_len + ) + else: + raise ValueError(f"Unknown backend: {args.backend}") + total_num_tokens = sum( + request.prompt_len + request.expected_output_len for request in requests + ) + total_output_tokens = sum(request.expected_output_len for request in requests) + + if args.profile_torch or args.profile_rpd: + # Profiling complete + pass + else: + if is_multi_modal: + print( + "\033[91mWARNING\033[0m: Multi-modal request detected. The " + "following metrics are not accurate because image tokens are" + " not counted. See vllm-project/vllm/issues/9778 for details." + ) + # TODO(vllm-project/vllm/issues/9778): Count molti-modal token length. 
+ print( + f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " + f"{total_num_tokens / elapsed_time:.2f} total tokens/s, " + f"{total_output_tokens / elapsed_time:.2f} output tokens/s" + ) + + # Output JSON results if specified + if args.output_json: + results = { + "elapsed_time": elapsed_time, + "num_requests": len(requests), + "total_num_tokens": total_num_tokens, + "requests_per_second": len(requests) / elapsed_time, + "tokens_per_second": total_num_tokens / elapsed_time, + } + with open(args.output_json, "w") as f: + json.dump(results, f, indent=4) + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the throughput.") + parser.add_argument( + "--backend", type=str, choices=["vllm", "hf", "mii"], default="vllm" + ) + parser.add_argument( + "--dataset", type=str, default=None, help="Path to the dataset." + ) + parser.add_argument( + "--input-len", + type=int, + default=None, + help="Input prompt length for each request", + ) + parser.add_argument( + "--output-len", + type=int, + default=None, + help="Output length for each request. Overrides the " + "output length from the dataset.", + ) + parser.add_argument( + "--n", type=int, default=1, help="Number of generated sequences per prompt." + ) + parser.add_argument( + "--num-prompts", type=int, default=1000, help="Number of prompts to process." + ) + parser.add_argument( + "--hf-max-batch-size", + type=int, + default=None, + help="Maximum batch size for HF backend.", + ) + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the throughput results in JSON format.", + ) + parser.add_argument( + "--async-engine", + action="store_true", + default=False, + help="Use vLLM async engine rather than LLM class.", + ) + parser.add_argument( + "--disable-frontend-multiprocessing", + action="store_true", + default=False, + help="Disable decoupled async engine frontend.", + ) + # LoRA + parser.add_argument( + "--lora-path", + type=str, + default=None, + help="Path to the lora adapters to use. This can be an absolute path, " + "a relative path, or a Hugging Face model identifier.", + ) + parser.add_argument( + "--profile-torch", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-rpd", + action="store_true", + help="profile the generation process of a single batch", + ) + parser.add_argument( + "--profile-dir", + type=str, + default=None, + help=( + "path to save the profiler output. Can be visualized " + "with ui.perfetto.dev or Tensorboard." 
+ ), + ) + + parser = AsyncEngineArgs.add_cli_args(parser) + args = parser.parse_args() + if args.tokenizer is None: + args.tokenizer = args.model + if args.dataset is None: + assert args.input_len is not None + assert args.output_len is not None + else: + assert args.input_len is None + if args.enable_lora: + assert args.lora_path is not None + + if args.backend == "vllm": + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + elif args.backend == "hf": + if args.hf_max_batch_size is None: + raise ValueError("HF max batch size is required for HF backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + elif args.backend == "mii": + if args.dtype != "auto": + raise ValueError("dtype must be auto for MII backend.") + if args.n != 1: + raise ValueError("n must be 1 for MII backend.") + if args.quantization is not None: + raise ValueError("Quantization is only for vLLM backend.") + if args.hf_max_batch_size is not None: + raise ValueError("HF max batch size is only for HF backend.") + if args.tokenizer != args.model: + raise ValueError("Tokenizer must be the same as the model for MII backend.") + if args.enable_lora is not None: + raise ValueError("LoRA benchmarking is only supported for vLLM backend") + main(args) diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu index 55e659679701..1004d48e5271 100644 --- a/csrc/activation_kernels.cu +++ b/csrc/activation_kernels.cu @@ -7,6 +7,10 @@ #include "cuda_compat.h" #include "dispatch_utils.h" +#ifdef USE_ROCM + #include "quantization/fp8/amd/quant_utils.cuh" +#endif + namespace vllm { template +__global__ void scaled_act_and_mul_kernel( + c10::Float8_e4m3fnuz* __restrict__ out, // [..., d] + const scalar_t* __restrict__ input, // [..., 2, d] + const int d, const float scale) { + const int64_t token_idx = blockIdx.x; + for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { + const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]); + const scalar_t y = VLLM_LDG(&input[token_idx * 2 * d + d + idx]); + float r = ACT_FN(x) * y * scale; + out[token_idx * d + idx] = c10::Float8_e4m3fnuz( + __hip_cvt_float_to_fp8(__bfloat162float(r), + fp8::fp8_type::__default_saturation, + fp8::fp8_type::__default_interpret), + c10::Float8_e4m3fnuz::from_bits()); + } +} +#endif + template __device__ __forceinline__ T silu_kernel(const T& x) { // x * sigmoid(x) @@ -82,6 +107,25 @@ __device__ __forceinline__ T gelu_tanh_kernel(const T& x) { input.data_ptr(), d); \ }); +// Launch activation and gating kernel. 
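+// ROCm-only variant: scaled_act_and_mul_kernel (above) fuses the gating
+// multiply with static FP8 (e4m3fnuz) output quantization, applying the
+// reciprocal of the provided scale before the down-cast.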
+#ifdef USE_ROCM + #define LAUNCH_SCALED_ACTIVATION_GATE_KERNEL(KERNEL) \ + int d = input.size(-1) / 2; \ + int64_t num_tokens = input.numel() / input.size(-1); \ + dim3 grid(num_tokens); \ + dim3 block(std::min(d, 1024)); \ + const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \ + const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ + VLLM_DISPATCH_FLOATING_TYPES( \ + input.scalar_type(), "scaled_act_and_mul_kernel", [&] { \ + vllm::scaled_act_and_mul_kernel> \ + <<>>( \ + out.data_ptr(), \ + input.data_ptr(), d, \ + 1.0 / (*scale.data_ptr())); \ + }); +#endif + void silu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { @@ -96,6 +140,14 @@ void mul_and_silu(torch::Tensor& out, // [..., d] LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel, false); } +void scaled_silu_and_mul(torch::Tensor& out, // [..., d] + torch::Tensor& input, // [..., 2 * d] + torch::Tensor& scale) { +#ifdef USE_ROCM + LAUNCH_SCALED_ACTIVATION_GATE_KERNEL(vllm::silu_kernel); +#endif +} + void gelu_and_mul(torch::Tensor& out, // [..., d] torch::Tensor& input) // [..., 2 * d] { diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu index d073dd6d2dee..be147d6fbf9a 100644 --- a/csrc/layernorm_kernels.cu +++ b/csrc/layernorm_kernels.cu @@ -12,17 +12,68 @@ namespace vllm { -// TODO(woosuk): Further optimize this kernel. -template -__global__ void rms_norm_kernel( - scalar_t* __restrict__ out, // [..., hidden_size] - const scalar_t* __restrict__ input, // [..., hidden_size] - const scalar_t* __restrict__ weight, // [hidden_size] - const float epsilon, const int num_tokens, const int hidden_size) { +// This kernel uses the _f16Vec to represent vectorized data. +// A conversion to/from float should exist +template +__global__ std::enable_if_t<(width > 0) && _typeConvert::exists> +rms_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, + const size_t hidden_size, const size_t vec_hidden_size) { + __shared__ float s_variance; + float v8_variance_sum = 0.0f; + + const int64_t tx = threadIdx.x; + const int64_t bx = blockIdx.x; + const int64_t num_threads = blockDim.x; + + auto* __restrict__ out_v = reinterpret_cast<_f16Vec*>(out); + auto* __restrict__ input_v = + reinterpret_cast*>( + input + bx * static_cast(hidden_size)); + auto* __restrict__ weight_v = + reinterpret_cast*>(weight); + + // Compute variance. Be careful, hidden_size should multiple of 4. 
+ for (size_t idx = tx; idx < vec_hidden_size; idx += num_threads) { + _f16Vec temp = input_v[idx]; + v8_variance_sum += temp.sum_squares(); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage reduceStore; + + float variance = + BlockReduce(reduceStore).Reduce(v8_variance_sum, cub::Sum{}, num_threads); + + if (threadIdx.x == 0) { + s_variance = rsqrtf(variance / hidden_size + epsilon); + } + __syncthreads(); + + variance = s_variance; + + for (size_t idx = tx; idx < vec_hidden_size; idx += num_threads) { + _f16Vec temp = input_v[idx]; + temp *= variance; + temp *= weight_v[idx]; + out_v[bx * static_cast(vec_hidden_size) + idx] = temp; + } +} + +// Non vectorized kernel for unusual shapes/types without conversion +template +__global__ std::enable_if_t<(width == 0) || !_typeConvert::exists> +rms_norm_kernel(scalar_t* __restrict__ out, // [..., hidden_size] + const scalar_t* __restrict__ input, // [..., hidden_size] + const scalar_t* __restrict__ weight, // [hidden_size] + const float epsilon, const int num_tokens, + const size_t hidden_size, const size_t) { __shared__ float s_variance; float variance = 0.0f; - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + for (size_t idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { const float x = (float)input[blockIdx.x * hidden_size + idx]; variance += x * x; } @@ -36,7 +87,7 @@ __global__ void rms_norm_kernel( } __syncthreads(); - for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { + for (size_t idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { float x = (float)input[blockIdx.x * hidden_size + idx]; out[blockIdx.x * hidden_size + idx] = ((scalar_t)(x * s_variance)) * weight[idx]; @@ -136,6 +187,14 @@ fused_add_rms_norm_kernel( } // namespace vllm +#define LAUNCH_RMS_NORM(width) \ + VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { \ + vllm::rms_norm_kernel<<>>( \ + out.data_ptr(), input.data_ptr(), \ + weight.data_ptr(), epsilon, num_tokens, hidden_size, \ + vec_hidden_size); \ + }); + void rms_norm(torch::Tensor& out, // [..., hidden_size] torch::Tensor& input, // [..., hidden_size] torch::Tensor& weight, // [hidden_size] @@ -146,16 +205,20 @@ void rms_norm(torch::Tensor& out, // [..., hidden_size] int hidden_size = input.size(-1); int num_tokens = input.numel() / hidden_size; + int vec_size = 16 / input.element_size(); + int vec_hidden_size = hidden_size / vec_size; + bool can_run_vectorize = (hidden_size % vec_size) == 0; dim3 grid(num_tokens); - dim3 block(std::min(hidden_size, 1024)); const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "rms_norm_kernel", [&] { - vllm::rms_norm_kernel<<>>( - out.data_ptr(), input.data_ptr(), - weight.data_ptr(), epsilon, num_tokens, hidden_size); - }); + if (vec_size % 8 == 0 && can_run_vectorize) { + dim3 block(std::min(vec_hidden_size, 1024)); + LAUNCH_RMS_NORM(8); + } else { + dim3 block(std::min(hidden_size, 1024)); + LAUNCH_RMS_NORM(0); + } } #define LAUNCH_FUSED_ADD_RMS_NORM(width) \ diff --git a/csrc/ops.h b/csrc/ops.h index f02f5083ac19..b123da92f512 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -132,6 +132,9 @@ void silu_and_mul_quant(torch::Tensor& out, torch::Tensor& input, void mul_and_silu(torch::Tensor& out, torch::Tensor& input); +void scaled_silu_and_mul(torch::Tensor& out, torch::Tensor& input, + torch::Tensor& scale); + void gelu_and_mul(torch::Tensor& 
out, torch::Tensor& input); void gelu_tanh_and_mul(torch::Tensor& out, torch::Tensor& input); diff --git a/csrc/rocm/custom.cu b/csrc/rocm/custom.cu new file mode 100644 index 000000000000..d237ea4ffdfe --- /dev/null +++ b/csrc/rocm/custom.cu @@ -0,0 +1,16 @@ +#include +#include +#include + +// declare templates for front (cpp) and back (cuda) sides of function: +// template + +void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K, + cudaStream_t stream, const int rows_per_block); +void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + const int64_t rows_per_block) { + auto M = in_a.size(0); + auto K = in_a.size(1); + LLGemm_Silu(in_a.data_ptr(), in_b.data_ptr(), out_c.data_ptr(), M, K, + at::cuda::getCurrentCUDAStream(), rows_per_block); +} diff --git a/csrc/rocm/fused_kernels.cu b/csrc/rocm/fused_kernels.cu new file mode 100644 index 000000000000..4f3eea456294 --- /dev/null +++ b/csrc/rocm/fused_kernels.cu @@ -0,0 +1,195 @@ +#include +#include +#include +#include + +constexpr int WARP_SIZE = 64; + +template +__device__ __forceinline__ T silu(const T& x) { + // x * sigmoid(x) + return (T)(((float)x) / (1.0f + expf((float)-x))); +} + +template +__device__ __forceinline__ T loadnt(T* addr) { + return __builtin_nontemporal_load(addr); +} + +__device__ __forceinline__ float4 load_ntmprl(const float4* addr) { + auto addr_alias = reinterpret_cast(addr); + auto dat0 = loadnt(addr_alias); + auto dat1 = loadnt(addr_alias + 1); + auto dat2 = loadnt(addr_alias + 2); + auto dat3 = loadnt(addr_alias + 3); + // auto dat0 = *(addr_alias); + // auto dat1 = *(addr_alias+1); + // auto dat2 = *(addr_alias+2); + // auto dat3 = *(addr_alias+3); + return make_float4(dat0, dat1, dat2, dat3); +} + +// TBlock fetches entire rows of A, and entire col of B (K dimension); assume +// N=1 for time being grid is M/A_NUM_ROWS blocks +template +__global__ void LLGemm_Silu_kernel(float4* af4, __half2* bf4, _Float16* c, + const int d) { + __shared__ float red_smem[NUM_A_ROWS_PER_BLOCK][WARP_SIZE]; + const int row_addr = blockIdx.x * NUM_A_ROWS_PER_BLOCK / 2 * blockDim.x; + const int row_addr_d = row_addr + d * blockDim.x; + // int row_addr_1 = row_addr + CUDA_NUM_THREADS; + // int row_addr_2 = row_addr_1 + CUDA_NUM_THREADS; + // int row_addr_3 = row_addr_2 + CUDA_NUM_THREADS; + const int threadid = threadIdx.x; + const int warp = threadIdx.x / WARP_SIZE; + const int lane = threadIdx.x % WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + const int qwarpid = threadid / 16; + const int qthreadid = threadid % 16; + float4 rowA_elem4[NUM_A_ROWS_PER_BLOCK]; + // float4 colB_elem4; + __half2 colB_elem4x, colB_elem4y, colB_elem4z, colB_elem4w; + float4 sum4; //[NUM_A_ROWS_PER_BLOCK]; + float acc[NUM_A_ROWS_PER_BLOCK]; //= 0.0; + __half2 acch2; + __half2 oval; + + // rowA_elem4 = af4[row_addr + threadid]; + //__syncthreads(); + // rowA_elem4_1 = af4[row_addr_1 + threadid]; + // rowA_elem4_2 = af4[row_addr_2 + threadid]; + // rowA_elem4_3 = af4[row_addr_3 + threadid]; +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK / 2; i++) { + rowA_elem4[2 * i] = load_ntmprl(&af4[row_addr + i * blockDim.x + threadid]); + rowA_elem4[2 * i + 1] = + load_ntmprl(&af4[row_addr_d + i * blockDim.x + threadid]); + // rowA_elem4[i] = af4[row_addr + i*blockDim.x + threadid]; + //__syncthreads(); + } + colB_elem4x = bf4[threadid * 4 + 0]; + colB_elem4y = bf4[threadid * 4 + 1]; + colB_elem4z = bf4[threadid * 4 + 2]; + colB_elem4w = bf4[threadid * 4 + 3]; + + // __syncthreads(); + __half2 Af2; + 
__half2 Bf2; + float2 S; + // auto Bh2ptr = reinterpret_cast<__half2 *>(&colB_elem4); + // auto Bf2x = *Bh2ptr; + // auto Bf2y = *(Bh2ptr+1); + // auto Bf2z = *(Bh2ptr+2); + // auto Bf2w = *(Bh2ptr+3); + auto Ah2ptr = reinterpret_cast<__half2*>(&rowA_elem4); + __half2* ah2lptr; +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + ah2lptr = Ah2ptr + i * 4; + Af2 = *(ah2lptr); + acch2 = __hmul2(Af2, colB_elem4x); + Af2 = *(ah2lptr + 1); + acch2 = __hfma2(Af2, colB_elem4y, acch2); + Af2 = *(ah2lptr + 2); + acch2 = __hfma2(Af2, colB_elem4z, acch2); + Af2 = *(ah2lptr + 3); + acch2 = __hfma2(Af2, colB_elem4w, acch2); + S = __half22float2(acch2); + acc[i] = S.x + S.y; + } + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { +#pragma unroll + for (int i = 0; i < NUM_A_ROWS_PER_BLOCK; i++) { + acc[i] += __shfl_xor(acc[i], mask); + } + } + + // Warp leaders store the data to shared memory. + // if (lane == 0) { + // #pragma unroll + // for (int i=0; i= 1; mask /= 2) { + // #pragma unroll + // for (int i=0; i +void LLGemm_Silu(void* in_a, void* in_b, void* out_c, const int M, const int K, + cudaStream_t stream, const int rows_per_block = 4) { + float4* af4 = reinterpret_cast(in_a); + auto* bf4 = reinterpret_cast<__half2*>(in_b); + auto* c = reinterpret_cast<_Float16*>(out_c); + const int d = M / 2; + const int NUM_THREADS = K * 2 / 16; + int NUM_BLOCKS = M / rows_per_block; + if (rows_per_block == 2) { + LLGemm_Silu_kernel<2> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 4) { + LLGemm_Silu_kernel<4> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 8) { + LLGemm_Silu_kernel<8> + <<>>(af4, bf4, c, d); + } else if (rows_per_block == 16) { + LLGemm_Silu_kernel<16> + <<>>(af4, bf4, c, d); + } else { + NUM_BLOCKS = M / 4; + LLGemm_Silu_kernel<4> + <<>>(af4, bf4, c, d); + } + + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) + throw std::runtime_error("CUDA kernel failed : " + std::to_string(err)); +} diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index e538197dbcb0..36f4e7a48ae1 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -11,6 +11,9 @@ torch::Tensor wvSplitK(at::Tensor& in_a, at::Tensor& in_b, void wvSplitKQ(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, at::Tensor& scale_a, at::Tensor& scale_b, const int64_t CuCount); +void LLMM_Silu(at::Tensor& in_a, at::Tensor& in_b, at::Tensor& out_c, + const int64_t rows_per_block); + void paged_attention( torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits, torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, diff --git a/csrc/rocm/torch_bindings.cpp b/csrc/rocm/torch_bindings.cpp index 34575477bcc9..361e9585f81a 100644 --- a/csrc/rocm/torch_bindings.cpp +++ b/csrc/rocm/torch_bindings.cpp @@ -13,6 +13,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) { // vLLM custom ops for rocm + rocm_ops.def( + "LLMM_Silu(Tensor in_a, Tensor in_b, Tensor! out_c, int rows_per_block) " + "-> ()"); + rocm_ops.impl("LLMM_Silu", torch::kCUDA, &LLMM_Silu); // Custom gemm op for matrix-vector multiplication rocm_ops.def( diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp index 1a1896b4c1ee..40a14678ad87 100644 --- a/csrc/torch_bindings.cpp +++ b/csrc/torch_bindings.cpp @@ -107,6 +107,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()"); ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); + // Activation function used in SwiGLU. + ops.def("scaled_silu_and_mul(Tensor! 
out, Tensor input, Tensor scale) -> ()"); + ops.impl("scaled_silu_and_mul", torch::kCUDA, &scaled_silu_and_mul); + ops.def( "silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()"); ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant); diff --git a/csrc/type_convert.cuh b/csrc/type_convert.cuh index 21b9d0ae515d..47b3a767d355 100644 --- a/csrc/type_convert.cuh +++ b/csrc/type_convert.cuh @@ -49,7 +49,7 @@ struct _typeConvert { } }; - #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + #if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) // CUDA_ARCH < 800 does not have BF16 support // TODO: Add in ROCm support once public headers handle bf16 maturely template <> @@ -162,4 +162,4 @@ struct alignas(16) _f16Vec { return result; } }; -} // namespace vllm \ No newline at end of file +} // namespace vllm diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 4f40f32a39f2..921975027568 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -24,7 +24,7 @@ WORKDIR ${COMMON_WORKDIR} FROM base AS fetch_vllm_0 ONBUILD COPY ./ vllm/ FROM base AS fetch_vllm_1 -ARG VLLM_REPO="https://github.com/vllm-project/vllm.git" +ARG VLLM_REPO="https://github.com/ROCm/vllm.git" ARG VLLM_BRANCH="main" ONBUILD RUN git clone ${VLLM_REPO} \ && cd vllm \ diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index 45efcbde698b..ba660b5b817a 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -1,18 +1,16 @@ -ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.3.1-complete -ARG HIPBLASLT_BRANCH="db8e93b4" -ARG HIPBLAS_COMMON_BRANCH="7c1566b" +ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:6.4.1-complete +ARG HIPBLASLT_BRANCH="aa0bda7b" +ARG HIPBLAS_COMMON_BRANCH="9b80ba8e" ARG LEGACY_HIPBLASLT_OPTION= -ARG RCCL_BRANCH="648a58d" -ARG RCCL_REPO="https://github.com/ROCm/rccl" ARG TRITON_BRANCH="e5be006" ARG TRITON_REPO="https://github.com/triton-lang/triton.git" -ARG PYTORCH_BRANCH="295f2ed4" +ARG PYTORCH_BRANCH="f717b2af" ARG PYTORCH_VISION_BRANCH="v0.21.0" -ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" +ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG FA_BRANCH="1a7f4dfa" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" -ARG AITER_BRANCH="c1debd8" +ARG AITER_BRANCH="64876494" ARG AITER_REPO="https://github.com/ROCm/aiter.git" FROM ${BASE_IMAGE} AS base @@ -45,7 +43,7 @@ RUN apt-get update -y \ && curl -sS https://bootstrap.pypa.io/get-pip.py | python${PYTHON_VERSION} \ && python3 --version && python3 -m pip --version -RUN pip install -U packaging 'cmake<4' ninja wheel setuptools pybind11 Cython +RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython FROM base AS build_hipblaslt ARG HIPBLASLT_BRANCH @@ -53,6 +51,7 @@ ARG HIPBLAS_COMMON_BRANCH # Set to "--legacy_hipblas_direct" for ROCm<=6.2 ARG LEGACY_HIPBLASLT_OPTION RUN git clone https://github.com/ROCm/hipBLAS-common.git +RUN apt-get remove -y hipblaslt && apt-get autoremove -y && apt-get autoclean -y RUN cd hipBLAS-common \ && git checkout ${HIPBLAS_COMMON_BRANCH} \ && mkdir build \ @@ -69,15 +68,6 @@ RUN cd hipBLASLt \ && make package RUN mkdir -p /app/install && cp /app/hipBLASLt/build/release/*.deb /app/hipBLAS-common/build/*.deb /app/install -FROM base AS build_rccl -ARG RCCL_BRANCH -ARG RCCL_REPO -RUN git clone ${RCCL_REPO} -RUN cd rccl \ - && git checkout ${RCCL_BRANCH} \ - && ./install.sh -p --amdgpu_targets ${PYTORCH_ROCM_ARCH} -RUN mkdir -p 
/app/install && cp /app/rccl/build/release/*.deb /app/install - FROM base AS build_triton ARG TRITON_BRANCH ARG TRITON_REPO @@ -132,15 +122,25 @@ RUN cd aiter \ RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install +FROM base AS debs +RUN mkdir /app/debs +RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ + cp /install/*.deb /app/debs +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs + FROM base AS final RUN --mount=type=bind,from=build_hipblaslt,src=/app/install/,target=/install \ dpkg -i /install/*deb \ - && sed -i 's/, hipblaslt-dev \(.*\), hipcub-dev/, hipcub-dev/g' /var/lib/dpkg/status \ - && sed -i 's/, hipblaslt \(.*\), hipfft/, hipfft/g' /var/lib/dpkg/status -RUN --mount=type=bind,from=build_rccl,src=/app/install/,target=/install \ - dpkg -i /install/*deb \ - && sed -i 's/, rccl-dev \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status \ - && sed -i 's/, rccl \(.*\), rocalution/, rocalution/g' /var/lib/dpkg/status + && perl -p -i -e 's/, hipblas-common-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt-dev \([^)]*?\), /, /g' /var/lib/dpkg/status \ + && perl -p -i -e 's/, hipblaslt \([^)]*?\), /, /g' /var/lib/dpkg/status RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ pip install /install/*.whl RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ @@ -154,8 +154,6 @@ ARG BASE_IMAGE ARG HIPBLAS_COMMON_BRANCH ARG HIPBLASLT_BRANCH ARG LEGACY_HIPBLASLT_OPTION -ARG RCCL_BRANCH -ARG RCCL_REPO ARG TRITON_BRANCH ARG TRITON_REPO ARG PYTORCH_BRANCH @@ -170,8 +168,6 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ && echo "HIPBLAS_COMMON_BRANCH: ${HIPBLAS_COMMON_BRANCH}" >> /app/versions.txt \ && echo "HIPBLASLT_BRANCH: ${HIPBLASLT_BRANCH}" >> /app/versions.txt \ && echo "LEGACY_HIPBLASLT_OPTION: ${LEGACY_HIPBLASLT_OPTION}" >> /app/versions.txt \ - && echo "RCCL_BRANCH: ${RCCL_BRANCH}" >> /app/versions.txt \ - && echo "RCCL_REPO: ${RCCL_REPO}" >> /app/versions.txt \ && echo "TRITON_BRANCH: ${TRITON_BRANCH}" >> /app/versions.txt \ && echo "TRITON_REPO: ${TRITON_REPO}" >> /app/versions.txt \ && echo "PYTORCH_BRANCH: ${PYTORCH_BRANCH}" >> /app/versions.txt \ diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md new file mode 100644 index 000000000000..7c9e04ee684d --- /dev/null +++ b/docs/dev-docker/README.md @@ -0,0 +1,553 @@ +# vllm FP8 Latency and Throughput benchmarks with vLLM on the AMD Instinct™ MI300X accelerator + +Documentation for Inferencing with vLLM on AMD Instinct™ MI300X platforms. + +## Overview + +vLLM is a toolkit and library for large language model (LLM) inference and serving. It deploys the PagedAttention algorithm, which reduces memory consumption and increases throughput by leveraging dynamic key and value allocation in GPU memory. 
vLLM also incorporates many recent LLM acceleration and quantization algorithms, such as fp8 GeMM, fp8 KV cache, continuous batching, flash attention, hip graph, tensor parallel, GPTQ, AWQ, and token speculation. In addition, AMD implements high-performance custom kernels and modules in vLLM to enhance performance further. + +This documentation includes information for running the popular Llama 3.1 series models from Meta using a pre-built AMD vLLM docker image optimized for an AMD Instinct™ MI300X or MI325X accelerator. The container is publicly available at [AMD Infinity Hub](https://www.amd.com/en/developer/resources/infinity-hub.html) + +The pre-built image includes: + +- ROCm™ 6.4.1 +- HipblasLT 0.15 +- vLLM 0.9.0.1 +- PyTorch 2.7 + +## Pull latest Docker Image + +Pull the most recent validated docker image with `docker pull rocm/vllm-dev:main` + +## What is New + +- Updated to ROCm 6.4.1 and vLLM v0.9.0.1 +- AITER MHA +- IBM 3d kernel for unified attention +- Full graph capture for split attention + +## Known Issues and Workarounds + +- No AITER MoE. Do not use VLLM_ROCM_USE_AITER for Mixtral or DeepSeek models. + +## Performance Results + +The data in the following tables is a reference point to help users validate observed performance. It should not be considered as the peak performance that can be delivered by AMD Instinct™ MI300X accelerator with vLLM. See the MLPerf section in this document for information about MLPerf 4.1 inference results. The performance numbers above were collected using the steps below. +*Note Benchmarks were run with benchmark scripts from [v0.6.5](https://github.com/vllm-project/vllm/tree/v0.6.5/benchmarks)* + +### Throughput Measurements + +The table below shows performance data where a local inference client is fed requests at an infinite rate and shows the throughput client-server scenario under maximum load. + +| Model | Precision | TP Size | Input | Output | Num Prompts | Max Num Seqs | Throughput (tokens/s) | +|-------|-----------|---------|-------|--------|-------------|--------------|-----------------------| +| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 3200 | 3200 | 16581.5 | +| | | | 128 | 4096 | 1500 | 1500 | 13667.3 | +| | | | 500 | 2000 | 2000 | 2000 | 13367.1 | +| | | | 2048 | 2048 | 1500 | 1500 | 8352.6 | +| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 128 | 2048 | 1500 | 1500 | 4275.0 | +| | | | 128 | 4096 | 1500 | 1500 | 3356.7 | +| | | | 500 | 2000 | 2000 | 2000 | 3201.4 | +| | | | 2048 | 2048 | 500 | 500 | 2179.7 | + +*TP stands for Tensor Parallelism.* + +### Latency Measurements + +The table below shows latency measurement, which typically involves assessing the time from when the system receives an input to when the model produces a result. 
+ +| Model | Precision | TP Size | Batch Size | Input | Output | MI300X Latency (sec) | +|-------|-----------|----------|------------|--------|---------|-------------------| +| Llama 3.1 70B (amd/Llama-3.1-70B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 15.566 | +| | | | 2 | 128 | 2048 | 16.858 | +| | | | 4 | 128 | 2048 | 17.518 | +| | | | 8 | 128 | 2048 | 18.898 | +| | | | 16 | 128 | 2048 | 21.023 | +| | | | 32 | 128 | 2048 | 23.896 | +| | | | 64 | 128 | 2048 | 30.753 | +| | | | 128 | 128 | 2048 | 43.767 | +| | | | 1 | 2048 | 2048 | 15.496 | +| | | | 2 | 2048 | 2048 | 17.380 | +| | | | 4 | 2048 | 2048 | 17.983 | +| | | | 8 | 2048 | 2048 | 19.771 | +| | | | 16 | 2048 | 2048 | 22.702 | +| | | | 32 | 2048 | 2048 | 27.392 | +| | | | 64 | 2048 | 2048 | 36.879 | +| | | | 128 | 2048 | 2048 | 57.003 | +| Llama 3.1 405B (amd/Llama-3.1-405B-Instruct-FP8-KV) | FP8 | 8 | 1 | 128 | 2048 | 45.828 | +| | | | 2 | 128 | 2048 | 46.757 | +| | | | 4 | 128 | 2048 | 48.322 | +| | | | 8 | 128 | 2048 | 51.479 | +| | | | 16 | 128 | 2048 | 54.861 | +| | | | 32 | 128 | 2048 | 63.119 | +| | | | 64 | 128 | 2048 | 82.362 | +| | | | 128 | 128 | 2048 | 109.698 | +| | | | 1 | 2048 | 2048 | 46.514 | +| | | | 2 | 2048 | 2048 | 47.271 | +| | | | 4 | 2048 | 2048 | 49.679 | +| | | | 8 | 2048 | 2048 | 54.366 | +| | | | 16 | 2048 | 2048 | 60.390 | +| | | | 32 | 2048 | 2048 | 74.209 | +| | | | 64 | 2048 | 2048 | 104.728 | +| | | | 128 | 2048 | 2048 | 154.041 | + +*TP stands for Tensor Parallelism.* + +Supermicro AS-8125GS-TNMR2 with 2x AMD EPYC 9575F Processors, 2.25 TiB RAM, 8x AMD Instinct MI300X (192GiB, 750W) GPUs, Ubuntu 22.04, and amdgpu driver 6.8.5 + +## Reproducing Benchmarked Results + +### Preparation - Obtaining access to models + +The vllm-dev docker image should work with any model supported by vLLM. When running with FP8, AMD has quantized models available for a variety of popular models, or you can quantize models yourself using Quark. If needed, the vLLM benchmark scripts will automatically download models and then store them in a Hugging Face cache directory for reuse in future tests. Alternatively, you can choose to download the model to the cache (or to another directory on the system) in advance. + +Many HuggingFace models, including Llama-3.1, have gated access. You will need to set up an account at (https://huggingface.co), search for the model of interest, and request access if necessary. You will also need to create a token for accessing these models from vLLM: open your user profile (https://huggingface.co/settings/profile), select "Access Tokens", press "+ Create New Token", and create a new Read token. + +### System optimization + +Before running performance tests you should ensure the system is optimized according to the [ROCm Documentation](https://rocm.docs.amd.com/en/latest/how-to/system-optimization/mi300x.html). In particular, it is important to ensure that NUMA auto-balancing is disabled. + +*Note: Check that NUMA balancing is properly set by inspecting the output of the command below, which should have a value of 0, with, `cat /proc/sys/kernel/numa_balancing`* + +### Launch AMD vLLM Docker + +Download and launch the docker. 
The HF_TOKEN is required to be set (either here or after launching the container) if you want to allow vLLM to download gated models automatically; use your HuggingFace token in place of `` in the command below:
+
+```bash
+docker run -it --rm --ipc=host --network=host --group-add render \
+    --privileged --security-opt seccomp=unconfined \
+    --cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
+    --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
+    -e HF_HOME=/data \
+    -e HF_TOKEN= \
+    -v /data:/data \
+    rocm/vllm-dev:main
+```
+
+Note: The instructions in this document use `/data` to store the models. If you choose a different directory, you will also need to make that change to the host volume mount when launching the docker container. For example, `-v /home/username/models:/data` in place of `-v /data:/data` would store the models in /home/username/models on the host. Some models can be quite large; please ensure that you have sufficient disk space prior to downloading the model. Since the model download may take a long time, you can use `tmux` or `screen` to avoid getting disconnected.
+
+### Downloading models with huggingface-cli
+
+If you would like to download models directly (instead of allowing vLLM to download them automatically), you can use the huggingface-cli inside the running docker container. Login using the token that you created earlier. (Note: it is not necessary to save it as a git credential.)
+
+```bash
+huggingface-cli login
+```
+
+You can download a model to the huggingface-cache directory using a command similar to the following (substituting the name of the model you wish to download):
+
+```bash
+sudo mkdir -p /data/huggingface-cache
+sudo chmod -R a+w /data/huggingface-cache
+HF_HOME=/data/huggingface-cache huggingface-cli download meta-llama/Llama-3.1-405B-Instruct --exclude "original/*"
+```
+
+Alternatively, you may wish to download the model to a specific directory, e.g. so you can quantize the model with Quark:
+
+```bash
+sudo mkdir -p /data/llama-3.1
+sudo chmod -R a+w /data/llama-3.1
+huggingface-cli download meta-llama/Llama-3.1-405B-Instruct --exclude "original/*" --local-dir /data/llama-3.1/Llama-3.1-405B-Instruct
+```
+
+In the benchmark commands provided later in this document, replace the model name (e.g. `amd/Llama-3.1-405B-Instruct-FP8-KV`) with the path to the model (e.g. `/data/llama-3.1/Llama-3.1-405B-Instruct`).
+
+### Use pre-quantized models
+
+AMD has provided [FP8-quantized versions](https://huggingface.co/collections/amd/quark-quantized-ocp-fp8-models-66db7936d18fcbaf95d4405c) of several models in order to make them easier to run on MI300X / MI325X, including:
+
+-
+-
+-
+
+Some models may be private to those who are members of .
+
+These FP8 quantized checkpoints were generated with AMD’s Quark Quantizer. For more information about Quark, please refer to
+
+### Quantize your own models
+
+This is an optional step if you would like to quantize your own model instead of using AMD's pre-quantized models. These instructions use Llama-3.1-405B as an example, but the commands are similar for other models.
+
+First download the model from Hugging Face to the /data/llama-3.1 directory as described above.
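+
+If you prefer to script this step instead of using huggingface-cli, the snippet below is a minimal sketch using the `huggingface_hub` Python API (it assumes the `huggingface_hub` package is available inside the container and that you have already logged in with `huggingface-cli login`):
+
+```python
+from huggingface_hub import snapshot_download
+
+# Download the model repository, skipping the original/ consolidated checkpoints,
+# to the same directory used by the Quark example below.
+snapshot_download(
+    repo_id="meta-llama/Llama-3.1-405B-Instruct",
+    local_dir="/data/llama-3.1/Llama-3.1-405B-Instruct",
+    ignore_patterns=["original/*"],
+)
+```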
+ +[Download and install Quark](https://quark.docs.amd.com/latest/install.html) + +Run the quantization script in the example folder using the following command line: + +```bash +# path to quark quantization script +export QUARK_DIR=/data/quark-0.6.0+dba9ca364/examples/torch/language_modeling/llm_ptq/quantize_quark.py +# path to Model +export MODEL_DIR=/data/llama-3.1/Llama-3.1-405B-Instruct +python3 $QUARK_DIR \ +--model_dir $MODEL_DIR \ +--output_dir Llama-3.1-405B-Instruct-FP8-KV \ +--kv_cache_dtype fp8 \ +--quant_scheme w_fp8_a_fp8 \ +--num_calib_data 128 \ +--model_export quark_safetensors \ +--no_weight_matrix_merge \ +--multi_gpu +``` + +Note: the `--multi_gpu` parameter can be omitted for small models that fit on a single GPU. + +## Performance testing with AMD vLLM Docker + +### Performance environment variables + +Some environment variables enhance the performance of the vLLM kernels on the MI300X / MI325X accelerator. See the AMD Instinct MI300X workload optimization guide for more information. + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +``` + +### vLLM engine performance settings + +vLLM provides a number of engine options which can be changed to improve performance. Refer to the [vLLM Engine Args](https://docs.vllm.ai/en/stable/usage/engine_args.html) documentation for the complete list of vLLM engine options. + +Below is a list of a few of the key vLLM engine arguments for performance; these can be passed to the vLLM benchmark scripts: +- **--max-model-len** : Maximum context length supported by the model instance. Can be set to a lower value than model configuration value to improve performance and gpu memory utilization. +- **--max-num-batched-tokens** : The maximum prefill size, i.e., how many prompt tokens can be packed together in a single prefill. Set to a higher value to improve prefill performance at the cost of higher gpu memory utilization. 65536 works well for LLama models. +- **--max-num-seqs** : The maximum decode batch size (default 256). Using larger values will allow more prompts to be processed concurrently, resulting in increased throughput (possibly at the expense of higher latency). If the value is too large, there may not be enough GPU memory for the KV cache, resulting in requests getting preempted. The optimal value will depend on the GPU memory, model size, and maximum context length. +- **--max-seq-len-to-capture** : Maximum sequence length for which Hip-graphs are captured and utilized. It's recommended to use Hip-graphs for the best decode performance. The default value of this parameter is 8K, which is lower than the large context lengths supported by recent models such as LLama. Set this parameter to max-model-len or maximum context length supported by the model for best performance. +- **--gpu-memory-utilization** : The ratio of GPU memory reserved by a vLLM instance. Default value is 0.9. Increasing the value (potentially as high as 0.99) will increase the amount of memory available for KV cache. When running in graph mode (i.e. not using `--enforce-eager`), it may be necessary to use a slightly smaller value of 0.92 - 0.95 to ensure adequate memory is available for the HIP graph. + +### Latency Benchmark + +vLLM's benchmark_latency.py script measures end-to-end latency for a specified model, input/output length, and batch size. 
+
+You can run latency tests for FP8 models with:
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+MODEL=amd/Llama-3.1-405B-Instruct-FP8-KV
+BS=1
+IN=128
+OUT=2048
+TP=8
+
+python3 /app/vllm/benchmarks/benchmark_latency.py \
+    --distributed-executor-backend mp \
+    --quantization fp8 \
+    --kv-cache-dtype fp8 \
+    --dtype float16 \
+    --gpu-memory-utilization 0.9 \
+    --trust-remote-code \
+    --model $MODEL \
+    --batch-size $BS \
+    --input-len $IN \
+    --output-len $OUT \
+    --tensor-parallel-size $TP \
+    --num-iters-warmup 3 \
+    --num-iters 5 \
+    --output-json output.json
+```
+
+For FP16 models, remove `--quantization fp8 --kv-cache-dtype fp8`.
+
+When measuring models with long context lengths, performance may improve by setting `--max-model-len` to a smaller value. It is important, however, to ensure that the `--max-model-len` is at least as large as the IN + OUT token counts.
+
+To estimate Time To First Token (TTFT) with the benchmark_latency.py script, set OUT to 1 token. It is also recommended to use `--enforce-eager` to get a more accurate measurement of the time that it actually takes to generate the first token. (For a more comprehensive measurement of TTFT, use the Online Serving Benchmark.)
+
+For additional information about the available parameters run:
+
+```bash
+/app/vllm/benchmarks/benchmark_latency.py -h
+```
+
+### Throughput Benchmark
+
+vLLM's benchmark_throughput.py script measures offline throughput. It can either use an input dataset or random prompts with fixed input/output lengths.
+
+You can run throughput tests for FP8 models with:
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+MODEL=amd/Llama-3.1-405B-Instruct-FP8-KV
+IN=128
+OUT=2048
+TP=8
+PROMPTS=1500
+MAX_NUM_SEQS=1500
+
+python3 /app/vllm/benchmarks/benchmark_throughput.py \
+    --distributed-executor-backend mp \
+    --quantization fp8 \
+    --kv-cache-dtype fp8 \
+    --dtype float16 \
+    --gpu-memory-utilization 0.9 \
+    --trust-remote-code \
+    --num-scheduler-steps 10 \
+    --enable-chunked-prefill False \
+    --model $MODEL \
+    --max-model-len 8192 \
+    --max-num-batched-tokens 131072 \
+    --max-seq-len-to-capture 131072 \
+    --input-len $IN \
+    --output-len $OUT \
+    --tensor-parallel-size $TP \
+    --num-prompts $PROMPTS \
+    --max-num-seqs $MAX_NUM_SEQS \
+    --output-json output.json
+```
+
+For FP16 models, remove `--quantization fp8 --kv-cache-dtype fp8`.
+
+When measuring models with long context lengths, performance may improve by setting `--max-model-len` to a smaller value (8192 in this example). It is important, however, to ensure that the `--max-model-len` is at least as large as the IN + OUT token counts.
+
+It is important to tune vLLM’s `--max-num-seqs` value appropriately for the model and input/output lengths. Larger values will allow vLLM to leverage more of the GPU memory for KV Cache and process more prompts concurrently. But if the value is too large, the KV cache will reach its capacity and vLLM will have to cancel and re-process some prompts. Suggested values for various models and configurations are listed in the throughput table above.
+
+For models that fit on a single GPU, it is usually best to run with `--tensor-parallel-size 1`. Requests can be distributed across multiple copies of vLLM running on different GPUs. This will be more efficient than running a single copy of the model with `--tensor-parallel-size 8`. 
(Note: the benchmark_throughput.py script does not include direct support for using multiple copies of vLLM) + +For optimal performance, the PROMPTS value should be a multiple of the MAX_NUM_SEQS value -- for example, if MAX_NUM_SEQS=1500 then the PROMPTS value could be 1500, 3000, etc. If PROMPTS is smaller than MAX_NUM_SEQS then there won’t be enough prompts for vLLM to maximize concurrency. + +For additional information about the available parameters run: + +```bash +python3 /app/vllm/benchmarks/benchmark_throughput.py -h +``` + +### Online Serving Benchmark + +Benchmark Llama-3.1-70B with input 4096 tokens, output 512 tokens and tensor parallelism 8 as an example, + +```bash +export VLLM_USE_TRITON_FLASH_ATTN=0 +vllm serve amd/Llama-3.1-70B-Instruct-FP8-KV \ + --swap-space 16 \ + --disable-log-requests \ + --quantization fp8 \ + --kv-cache-dtype fp8 \ + --dtype float16 \ + --max-model-len 8192 \ + --tensor-parallel-size 8 \ + --max-num-batched-tokens 65536 \ + --gpu-memory-utilization 0.99 \ + --num_scheduler-steps 10 +``` + +Change port (for example --port 8005) if port=8000 is currently being used by other processes. + +Run client in a separate terminal. Use port_id from previous step else port-id=8000. + +```bash +python /app/vllm/benchmarks/benchmark_serving.py \ + --port 8000 \ + --model amd/Llama-3.1-70B-Instruct-FP8-KV \ + --dataset-name random \ + --random-input-len 4096 \ + --random-output-len 512 \ + --request-rate 1 \ + --ignore-eos \ + --num-prompts 500 \ + --percentile-metrics ttft,tpot,itl,e2el +``` + +Once all prompts are processed, terminate the server gracefully (ctrl+c). + +### Running DeepSeek-V3 and DeepSeek-R1 + +We have experimental support for running both DeepSeek-V3 and DeepSeek-R1 models. +*Note there are currently limitations and `--max-model-len` cannot be greater than 32768* + +```bash +docker run -it --rm --ipc=host --network=host --group-add render \ + --privileged --security-opt seccomp=unconfined \ + --cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \ + --device=/dev/kfd --device=/dev/dri --device=/dev/mem \ + -e VLLM_USE_TRITON_FLASH_ATTN=1 \ + -e VLLM_USE_AITER=1 \ + -e VLLM_MLA_DISABLE=0 \ + rocm/vllm-dev:main + +# Online serving +vllm serve deepseek-ai/DeepSeek-V3 \ + --disable-log-requests \ + --tensor-parallel-size 8 \ + --trust-remote-code \ + --max-model-len 131072 \ + --block-size=1 + +python3 /app/vllm/benchmarks/benchmark_serving.py \ + --backend vllm \ + --model deepseek-ai/DeepSeek-V3 \ + --max-concurrency 256\ + --dataset-name random \ + --random-input-len 128 \ + --random-output-len 128 \ + --num-prompts 1000 + +# Offline throughput +python3 /app/vllm/benchmarks/benchmark_throughput.py --model deepseek-ai/DeepSeek-V3 \ + --input-len <> --output-len <> --tensor-parallel-size 8 \ + --quantization fp8 --kv-cache-dtype fp8 --dtype float16 \ + --max-model-len 32768 --block-size=1 --trust-remote-code + +# Offline Latency +python /app/vllm/benchmarks/benchmark_latency.py --model deepseek-ai/DeepSeek-V3 \ +--tensor-parallel-size 8 --trust-remote-code --max-model-len 32768 --block-size=1 \ +--batch-size <> --input-len <> --output-len <> +``` + +### CPX mode + +Currently only CPX-NPS1 mode is supported. So ONLY tp=1 is supported in CPX mode. +But multiple instances can be started simultaneously (if needed) in CPX-NPS1 mode. + +Set GPUs in CPX mode with: + +```bash +rocm-smi --setcomputepartition cpx +``` + +Example of running Llama3.1-8B on 1 CPX-NPS1 GPU with input 4096 and output 512. As mentioned above, tp=1. 
```bash
+HIP_VISIBLE_DEVICES=0 \
+python3 /app/vllm/benchmarks/benchmark_throughput.py \
+    --max-model-len 4608 \
+    --num-scheduler-steps 10 \
+    --num-prompts 100 \
+    --model amd/Llama-3.1-8B-Instruct-FP8-KV \
+    --input-len 4096 \
+    --output-len 512 \
+    --dtype float16 \
+    --tensor-parallel-size 1 \
+    --output-json \
+    --quantization fp8 \
+    --gpu-memory-utilization 0.99
+```
+
+Set the GPUs back to SPX mode:
+
+```bash
+rocm-smi --setcomputepartition spx
+```
+
+### Speculative Decoding
+
+Speculative decoding is one of the key features in vLLM and is supported on MI300X. Below is an example of benchmarking performance with and without speculative decoding for Llama 3.1 405B, using Llama 3.1 8B as the draft model.
+
+Without Speculative Decoding:
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+python /app/vllm/benchmarks/benchmark_latency.py --model amd/Llama-3.1-405B-Instruct-FP8-KV --max-model-len 26720 -tp 8 --batch-size 1 --input-len 1024 --output-len 128
+```
+
+With Speculative Decoding:
+
+```bash
+export VLLM_USE_TRITON_FLASH_ATTN=0
+python /app/vllm/benchmarks/benchmark_latency.py --model amd/Llama-3.1-405B-Instruct-FP8-KV --max-model-len 26720 -tp 8 --batch-size 1 --input-len 1024 --output-len 128 --speculative-model amd/Llama-3.1-8B-Instruct-FP8-KV --num-speculative-tokens 5
+```
+
+You should see an improvement in the end-to-end latency.
+
+### AITER use cases
+
+The `rocm/vllm-dev:main` image has experimental [AITER](https://github.com/ROCm/aiter) support, which can yield a significant performance increase for some model/input/output/batch-size configurations. To enable the feature, make sure the environment variable `VLLM_USE_AITER=1` is set (the default value is `0`). When building your own image, follow the [Docker build steps](#Docker-manifest) using the [aiter_integration_final](https://github.com/ROCm/vllm/tree/aiter_integration_final) branch.
+
+Some use cases include:
+- amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV
+- amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV
+
+```bash
+export VLLM_USE_AITER=1
+python3 /app/vllm/benchmarks/benchmark_latency.py --model amd/Mixtral-8x22B-Instruct-v0.1-FP8-KV -tp 8 --batch-size 256 --input-len 128 --output-len 2048
+```
+
+## MMLU_PRO_Biology Accuracy Evaluation
+
+### FP16
+
+vllm (pretrained=models--meta-llama--Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 64
+
+| Tasks |Version| Filter |n-shot| Metric | |Value | |Stderr|
+|-------|------:|--------------|-----:|-----------|---|-----:|---|-----:|
+|biology| 0|custom-extract| 5|exact_match|↑ |0.8466|± |0.0135|
+
+### FP8
+
+vllm (pretrained=models--meta-llama--Llama-3.1-405B-Instruct/snapshots/069992c75aed59df00ec06c17177e76c63296a26,dtype=float16,quantization=fp8,quantized_weights_path=/llama.safetensors,tensor_parallel_size=8), gen_kwargs: (None), limit: None, num_fewshot: None, batch_size: 32
+
+| Tasks |Version| Filter |n-shot| Metric | |Value| |Stderr|
+|-------|------:|--------------|-----:|-----------|---|----:|---|-----:|
+|biology| 0|custom-extract| 5|exact_match|↑ |0.848|± |0.0134|
+
+## Performance
+
+### MLPerf Performance Results
+
+#### Llama-2-70B
+
+Please refer to the [Benchmarking Machine Learning using ROCm and AMD GPUs: Reproducing Our MLPerf Inference Submission — ROCm Blogs](https://rocm.blogs.amd.com/artificial-intelligence/mlperf-inf-4-1/README.html) for information on reproducing MLPerf 4.1 Inference results. 
Note that due to changes in vLLM, it is not possible to use these instructions with the current rocm/vllm-dev docker image. Due to recent changes in vLLM, the instructions for MLPerf 4.1 submission do not apply to the current rocm/vllm-dev docker image. + +## Docker Manifest + +To reproduce the release docker: + +```bash + git clone https://github.com/ROCm/vllm.git + cd vllm + git checkout 71faa188073d427c57862c45bf17745f3b54b1b1 + docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 . +``` + +### Building AITER Image + +Use AITER release candidate branch instead: + +```bash + git clone https://github.com/ROCm/vllm.git + cd vllm + git checkout aiter_integration_final + docker build -f docker/Dockerfile.rocm -t --build-arg USE_CYTHON=1 . +``` + +## Changelog + +20250605_aiter: +- Updated to ROCm 6.4.1 and vLLM v0.9.0.1 +- AITER MHA +- IBM 3d kernel for unified attention +- Full graph capture for split attention + +20250521_aiter: +- AITER V1 engine performance improvement + +20250513_aiter: +- Out of memory bug fix +- PyTorch fixes +- Tunable ops fixes + +20250410_aiter: +- 2-stage MoE +- MLA from AITER + +20250325_aiter: +- Improved DeepSeek-V3/R1 performance +- Initial Gemma-3 enablement +- Detokenizer disablement +- Torch.compile support + +20250305_aiter: +- AITER improvements +- Support for FP8 skinny GEMM + +20250207_aiter: +- More performant AITER +- Bug fixes + +20250205_aiter: +- [AITER](https://github.com/ROCm/aiter) support +- Performance improvement for custom paged attention +- Reduced memory overhead bug fix + +20250124: +- Fix accuracy issue with 405B FP8 Triton FA +- Fixed accuracy issue with TP8 + +20250117: +- [Experimental DeepSeek-V3 and DeepSeek-R1 support](#running-deepseek-v3-and-deepseek-r1) diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py index fe5b386c4d25..f14273070bcf 100644 --- a/vllm/_custom_ops.py +++ b/vllm/_custom_ops.py @@ -281,6 +281,19 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor, torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon) +def scaled_rms_norm(out: torch.Tensor, input: torch.Tensor, + weight: torch.Tensor, scale: torch.Tensor, + epsilon: float) -> None: + torch.ops._C.rms_norm_static_fp8_quant(out, input, weight, scale, epsilon) + + +def scaled_fused_add_rms_norm(out: torch.Tensor, input: torch.Tensor, + residual: torch.Tensor, weight: torch.Tensor, + scale: torch.Tensor, epsilon: float) -> None: + torch.ops._C.fused_add_rms_norm_static_fp8_quant(out, input, residual, + weight, scale, epsilon) + + def apply_repetition_penalties_torch( logits: torch.Tensor, prompt_mask: torch.Tensor, output_mask: torch.Tensor, repetition_penalties: torch.Tensor) -> None: @@ -1752,6 +1765,11 @@ def free_shared_buffer(ptr: int) -> None: torch.ops._C_custom_ar.free_shared_buffer(ptr) +def LLMM_Silu(a: torch.Tensor, b: torch.Tensor, out: torch.Tensor, + rows_per_block: int) -> None: + torch.ops._rocm_C.LLMM_Silu(a, b, out, rows_per_block) + + def get_flash_mla_metadata( cache_seqlens: torch.Tensor, num_heads_per_head_k: int, diff --git a/vllm/attention/backends/abstract.py b/vllm/attention/backends/abstract.py index 0ba5a5bf94c9..15cd27b13632 100644 --- a/vllm/attention/backends/abstract.py +++ b/vllm/attention/backends/abstract.py @@ -243,6 +243,7 @@ class AttentionLayer(Protocol): _k_scale_float: float _v_scale_float: float _prob_scale: torch.Tensor + _out_scale: torch.Tensor def forward( self, diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py index 
7887ebf65f44..fda02c314eb7 100644 --- a/vllm/attention/backends/rocm_flash_attn.py +++ b/vllm/attention/backends/rocm_flash_attn.py @@ -799,6 +799,7 @@ def forward( attn_masks[0][None] if attn_masks is not None else None, full_scales, + layer._out_scale, ) elif self.use_naive_attn: if self.num_kv_heads != self.num_heads: @@ -825,6 +826,7 @@ def forward( self.num_heads, self.head_size, self.scale, + causal_mask, attn_masks, ) else: @@ -887,7 +889,7 @@ def forward( assert _PARTITION_SIZE_ROCM % block_size == 0 tmp_output = torch.empty( size=(num_seqs, num_heads, max_num_partitions, head_size), - dtype=output.dtype, + dtype=query.dtype, device=output.device, ) exp_sums = torch.empty( @@ -921,6 +923,7 @@ def forward( self.kv_cache_dtype, layer._k_scale, layer._v_scale, + layer._out_scale, ) else: output[num_prefill_tokens:] = paged_attn.forward_decode( @@ -958,6 +961,7 @@ def _sdpa_attention( num_heads: int, head_size: int, scale: float, + is_causal: bool, attn_masks: Optional[List[torch.Tensor]] = None, ) -> torch.Tensor: start = 0 @@ -974,7 +978,7 @@ def _sdpa_attention( key[:, start:end, :], value[:, start:end, :], dropout_p=0.0, - is_causal=attn_masks is None, + is_causal=is_causal, attn_mask=attn_masks[i] if attn_masks else None, scale=scale).movedim(query.dim() - 2, 0) output[start:end, :, :] = sub_out diff --git a/vllm/attention/backends/utils.py b/vllm/attention/backends/utils.py index 34e059067d84..daa353cd31b2 100644 --- a/vllm/attention/backends/utils.py +++ b/vllm/attention/backends/utils.py @@ -237,9 +237,18 @@ def build(self, seq_lens: List[int], query_lens: List[int], # The shape of graph_block_tables is # [max batch size, max context len // block size]. input_block_tables = self.runner.graph_block_tables[:batch_size] + max_blocks = input_block_tables.shape[1] for i, block_table in enumerate(self.block_tables): if block_table: - input_block_tables[i, :len(block_table)] = block_table + num_blocks = len(block_table) + if num_blocks <= max_blocks: + input_block_tables[i, :num_blocks] = block_table + else: + # It may be possible to have more blocks allocated due + # to lookahead slots of multi-step, however, they are + # not used anyway, so can be safely ignored. + input_block_tables[ + i, :max_blocks] = block_table[:max_blocks] block_tables = torch.from_numpy(input_block_tables).to( device, non_blocking=True) else: diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index a5fbd1a1c016..1aa87abeb2b9 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -94,6 +94,7 @@ def __init__( # but requires q to be quantized as well. 
self._q_scale = torch.tensor(1.0, dtype=torch.float32) self._prob_scale = torch.tensor(1.0, dtype=torch.float32) + self._out_scale = None # We also keep the float32 versions of k/v_scale for attention # backends that don't support tensors (Flashinfer) @@ -206,8 +207,10 @@ def forward( if self.use_output: output_shape = (output_shape if output_shape is not None else query.shape) + output_dtype = (query.dtype if self._out_scale is None else + current_platform.fp8_dtype()) output = torch.empty(output_shape, - dtype=query.dtype, + dtype=output_dtype, device=query.device) hidden_size = output_shape[-1] # We skip reshaping query, key and value tensors for the MLA diff --git a/vllm/attention/ops/chunked_prefill_paged_decode.py b/vllm/attention/ops/chunked_prefill_paged_decode.py index 4f839348e522..4d4183493ebe 100644 --- a/vllm/attention/ops/chunked_prefill_paged_decode.py +++ b/vllm/attention/ops/chunked_prefill_paged_decode.py @@ -34,6 +34,7 @@ def kernel_paged_attention_2d( scale, # float32 k_scale, # float32 v_scale, # float32 + out_scale, num_query_heads: tl.constexpr, # int num_queries_per_kv: tl.constexpr, # int num_queries_per_kv_padded: tl.constexpr, # int @@ -59,7 +60,7 @@ def kernel_paged_attention_2d( stride_v_cache_3: tl.int64, # int filter_by_query_len: tl.constexpr, # bool query_start_len_ptr, # [num_seqs+1] -): + USE_FP8: tl.constexpr): seq_idx = tl.program_id(0) kv_head_idx = tl.program_id(1) @@ -193,6 +194,8 @@ def kernel_paged_attention_2d( # epilogue acc = acc / L[:, None] + if USE_FP8: + acc = acc / tl.load(out_scale) output_offset = (cur_batch_in_all_start_index * output_stride_0 + query_head_idx * output_stride_1) @@ -223,6 +226,7 @@ def chunked_prefill_paged_decode( alibi_slopes=None, sliding_window=None, sm_scale=None, + fp8_out_scale=None, ): if sm_scale is None: @@ -253,6 +257,7 @@ def chunked_prefill_paged_decode( sliding_window=sliding_window, sm_scale=sm_scale, skip_decode=True, + fp8_out_scale=fp8_out_scale, ) block_size = value_cache.shape[3] @@ -295,7 +300,7 @@ def chunked_prefill_paged_decode( tmp_output = torch.empty( size=(total_num_seq, num_query_heads, max_num_partitions, head_size), - dtype=output.dtype, + dtype=query.dtype, device=output.device, ) exp_sums = torch.empty( @@ -324,6 +329,7 @@ def chunked_prefill_paged_decode( kv_cache_dtype=kv_cache_dtype, k_scale=k_scale, v_scale=v_scale, + fp8_out_scale=fp8_out_scale, ) else: kernel_paged_attention_2d[( @@ -340,6 +346,7 @@ def chunked_prefill_paged_decode( scale=sm_scale, k_scale=k_scale, v_scale=v_scale, + out_scale=fp8_out_scale, num_query_heads=num_query_heads, num_queries_per_kv=num_queries_per_kv, num_queries_per_kv_padded=num_queries_per_kv_padded, @@ -365,4 +372,5 @@ def chunked_prefill_paged_decode( stride_v_cache_3=value_cache.stride(3), filter_by_query_len=True, query_start_len_ptr=query_start_loc, + USE_FP8=fp8_out_scale is not None, ) diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py index 13bef96722d2..fd25119738ee 100644 --- a/vllm/attention/ops/prefix_prefill.py +++ b/vllm/attention/ops/prefix_prefill.py @@ -15,6 +15,7 @@ # To check compatibility IS_TURING = current_platform.get_device_capability() == (7, 5) +float8_info = torch.finfo(current_platform.fp8_dtype()) # Here's an example autotuner config for this kernel. 
This config does provide @@ -42,6 +43,7 @@ def _fwd_kernel(Q, sm_scale, k_scale, v_scale, + out_scale, B_Start_Loc, B_Seqlen, x: tl.constexpr, @@ -80,8 +82,11 @@ def _fwd_kernel(Q, num_unroll_cache: tl.constexpr, num_unroll_request: tl.constexpr, SKIP_DECODE: tl.constexpr, + USE_FP8: tl.constexpr, MAX_Q_LEN: tl.constexpr = 0, - MAX_CTX_LEN: tl.constexpr = 0): + MAX_CTX_LEN: tl.constexpr = 0, + FP8_MIN: tl.constexpr = float8_info.min, + FP8_MAX: tl.constexpr = float8_info.max): cur_batch = tl.program_id(0) cur_head = tl.program_id(1) @@ -274,6 +279,9 @@ def _fwd_kernel(Q, off_o = ((cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + cur_head * stride_oh + offs_d[None, :] * stride_od) out_ptrs = Out + off_o + if USE_FP8: + acc = acc / tl.load(out_scale) + acc = tl.clamp(acc, FP8_MIN, FP8_MAX) tl.store(out_ptrs, acc, mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len)) @@ -732,7 +740,8 @@ def context_attention_fwd(q, alibi_slopes=None, sliding_window=None, sm_scale=None, - skip_decode=False): + skip_decode=False, + fp8_out_scale=None): q_dtype_is_f32 = q.dtype is torch.float32 @@ -857,6 +866,7 @@ def context_attention_fwd(q, sm_scale, k_scale, v_scale, + fp8_out_scale, b_start_loc, b_seq_len, k_cache.shape[4], @@ -892,6 +902,7 @@ def context_attention_fwd(q, BLOCK_DMODEL_PADDED=Lk_padded, SLIDING_WINDOW=sliding_window, SKIP_DECODE=skip_decode, + USE_FP8=fp8_out_scale is not None, BLOCK_M=128, BLOCK_N=64, num_unroll_cache=4, diff --git a/vllm/config.py b/vllm/config.py index 5da44988bc5f..2640fddee3cf 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -4448,6 +4448,11 @@ def __post_init__(self): # FIXME(rob): Add function to set all of these. if not self.compilation_config.custom_ops: self.compilation_config.custom_ops = ["none"] + if current_platform.is_rocm(): + if "none" in self.compilation_config.custom_ops: + self.compilation_config.custom_ops.remove("none") + self.compilation_config.custom_ops.append("+rms_norm") + self.compilation_config.custom_ops.append("+silu_and_mul") self.compilation_config.cudagraph_num_of_warmups = 1 self.compilation_config.pass_config.enable_fusion = False self.compilation_config.pass_config.enable_noop = False diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py index 7dd104a4fcc4..afb81f93292e 100644 --- a/vllm/distributed/device_communicators/custom_all_reduce.py +++ b/vllm/distributed/device_communicators/custom_all_reduce.py @@ -54,7 +54,7 @@ class CustomAllreduce: def __init__(self, group: ProcessGroup, device: Union[int, str, torch.device], - max_size=8192 * 1024) -> None: + max_size=2 * 8192 * 1024) -> None: """ Args: group: the process group to work on. 
If None, it will use the diff --git a/vllm/envs.py b/vllm/envs.py index 80c5f289bba9..3590cb3448ae 100644 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -15,7 +15,8 @@ VLLM_RINGBUFFER_WARNING_INTERVAL: int = 60 VLLM_NCCL_SO_PATH: Optional[str] = None LD_LIBRARY_PATH: Optional[str] = None - VLLM_USE_TRITON_FLASH_ATTN: bool = False + VLLM_USE_TRITON_FLASH_ATTN: bool = True + VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT: bool = True VLLM_V1_USE_PREFILL_DECODE_ATTENTION: bool = False VLLM_FLASH_ATTN_VERSION: Optional[int] = None LOCAL_RANK: int = 0 @@ -312,6 +313,12 @@ def get_vllm_port() -> Optional[int]: lambda: bool( os.environ.get("VLLM_TEST_DYNAMO_FULLGRAPH_CAPTURE", "1") != "0"), + # have custom paged attention implemented for MI3* cards write out fp8 + "VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT": + lambda: + (os.getenv("VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT", "True").lower() in + ("true", "1")), + # Feature flag to enable/disable Inductor standalone compile. # In torch <= 2.7 we ignore this flag; in torch >= 2.8 this is # enabled by default. @@ -530,7 +537,7 @@ def get_vllm_port() -> Optional[int]: "VLLM_XLA_USE_SPMD": lambda: bool(int(os.getenv("VLLM_XLA_USE_SPMD", "0"))), "VLLM_FUSED_MOE_CHUNK_SIZE": - lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "32768")), + lambda: int(os.getenv("VLLM_FUSED_MOE_CHUNK_SIZE", "65536")), # If set, vllm will skip the deprecation warnings. "VLLM_NO_DEPRECATION_WARNING": diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py index cc9c8d445ab6..22eadc7290f3 100644 --- a/vllm/model_executor/layers/activation.py +++ b/vllm/model_executor/layers/activation.py @@ -71,16 +71,28 @@ def __init__(self): from vllm._ipex_ops import ipex_ops self.op = ipex_ops.silu_and_mul - def forward_native(self, x: torch.Tensor) -> torch.Tensor: + def forward_native(self, + x: torch.Tensor, + scale: Optional[torch.Tensor] = None) -> torch.Tensor: """PyTorch-native implementation equivalent to forward().""" d = x.shape[-1] // 2 return F.silu(x[..., :d]) * x[..., d:] - def forward_cuda(self, x: torch.Tensor) -> torch.Tensor: + def forward_cuda(self, + x: torch.Tensor, + scale: Optional[torch.Tensor] = None) -> torch.Tensor: + d = x.shape[-1] // 2 output_shape = (x.shape[:-1] + (d, )) - out = torch.empty(output_shape, dtype=x.dtype, device=x.device) - self.op(out, x) + if scale is None: + out = torch.empty(output_shape, dtype=x.dtype, device=x.device) + self.op(out, x) + else: + # for scaled fp8 output + out = torch.empty(output_shape, + dtype=torch.float8_e4m3fnuz, + device=x.device) + torch.ops._C.scaled_silu_and_mul(out, x, scale) return out def forward_xpu(self, x: torch.Tensor) -> torch.Tensor: diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index b3c65e34178a..3ef48071a40c 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -120,6 +120,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype @@ -158,10 +159,23 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]: if self.variance_size_override is not None: return self.forward_native(x, residual) + from vllm import _custom_ops as ops + + if scale is not 
None: + out = torch.empty_like(x, dtype=torch.float8_e4m3fnuz) + if residual is not None: + ops.scaled_fused_add_rms_norm(out, x, residual, + self.weight.data, scale, + self.variance_epsilon) + return out, residual + ops.scaled_rms_norm(out, x, self.weight.data, scale, + self.variance_epsilon) + return out add_residual = residual is not None norm_func = dispatch_cuda_rmsnorm_func(add_residual) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index b3042bfaed3d..145a2e96417c 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -343,9 +343,13 @@ def process_weights_after_loading(self, layer: Module) -> None: # If checkpoint is fp8, handle that there are N scales for N # shards in a fused module else: + layer.weight_scale.data[layer.weight_scale.data == torch.finfo( + torch.float32).min] = 1 layer.weight_scale = torch.nn.Parameter(layer.weight_scale.data, requires_grad=False) if self.quant_config.activation_scheme == "static": + layer.input_scale.data[layer.input_scale.data == torch.finfo( + torch.float32).min] = 1 layer.input_scale = torch.nn.Parameter(layer.input_scale.data, requires_grad=False) diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index e5604670fb4c..77f975b37f45 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -52,7 +52,7 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: # regardless whether the kv-scale is available in the checkpoint. # No need to process kv scales after loading if we are going to # calculate them on the fly. - if layer.kv_cache_dtype != "auto" and not layer.calculate_kv_scales: + if not layer.calculate_kv_scales: if layer.k_scale > 0.0 and layer.v_scale > 0.0: # We prefer to use separate k_scale and v_scale if present k_scale = layer.k_scale.to("cpu").tolist() diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index 6ae5f5c9ad46..693b3dfcfa59 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -163,6 +163,19 @@ def _check_scheme_supported(self, else: return False + def is_fp8_w8a8(self) -> bool: + # Returns True if all quantized layers in model are fp8 w8a8 + global_quant_config = cast( + dict[str, Any], self.quant_config.get("global_quant_config")) + layer_quant_configs = cast(dict[str, Any], + self.quant_config.get("layer_quant_config")) + for config in (global_quant_config, *layer_quant_configs.values()): + weight_config = cast(dict[str, Any], config.get("weight")) + input_config = cast(dict[str, Any], config.get("input_tensors")) + if not self._is_fp8_w8a8(weight_config, input_config): + return False + return True + def _is_fp8_w8a8(self, weight_quant: Optional[dict[str, Any]], input_quant: Optional[dict[str, Any]]) -> bool: # Confirm weights and input quantized. 
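For reference, the `scaled_rms_norm` and `scaled_fused_add_rms_norm` wrappers introduced in `_custom_ops.py` and wired into `RMSNorm.forward_cuda` above fuse RMSNorm with static FP8 quantization. The unfused PyTorch sketch below approximates what they compute; the `float8_e4m3fnuz` dtype and the exact numerics are assumptions for illustration, not the kernels' literal implementation.

```python
from typing import Optional, Tuple, Union

import torch

FP8_DTYPE = torch.float8_e4m3fnuz  # assumed ROCm/MI300 fp8 dtype


def scaled_rms_norm_ref(
    x: torch.Tensor,
    weight: torch.Tensor,
    scale: torch.Tensor,
    epsilon: float,
    residual: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
    """Unfused reference: (optional residual add) -> RMSNorm -> static FP8 quant."""
    if residual is not None:
        x = x + residual
        residual = x  # the fused_add_* variant also returns the updated residual
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    normed = (x.float() * torch.rsqrt(variance + epsilon)).to(x.dtype) * weight
    fp8_info = torch.finfo(FP8_DTYPE)
    quantized = (normed.float() / scale).clamp(fp8_info.min, fp8_info.max)
    out = quantized.to(FP8_DTYPE)
    return out if residual is None else (out, residual)


# Example usage with a static per-tensor scale.
hidden = torch.randn(4, 128, dtype=torch.float16)
gamma = torch.ones(128, dtype=torch.float16)
fp8_out = scaled_rms_norm_ref(hidden, gamma, torch.tensor(0.5), epsilon=1e-6)
```

Quantizing with a static per-tensor scale at the norm lets the following FP8 GEMM consume the activations directly, which is why the Llama decoder layer passes the next projection's `input_scale` (e.g. `qkv_proj.input_scale` or `gate_up_proj.input_scale`) into the layernorm call.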
diff --git a/vllm/model_executor/layers/sampler.py b/vllm/model_executor/layers/sampler.py index 08840fc40cf6..0998fb287a31 100644 --- a/vllm/model_executor/layers/sampler.py +++ b/vllm/model_executor/layers/sampler.py @@ -71,6 +71,7 @@ class SampleResultArgsType: multinomial_samples: MultinomialSamplesType sample_results_dict: SampleResultsDictType sampling_metadata: SamplingMetadata + forced_samples: Optional[torch.Tensor] greedy_samples: Optional[torch.Tensor] @@ -476,6 +477,39 @@ def _greedy_sample( return results +def _forced_sample( + selected_seq_groups: list[SequenceGroupToSample], + samples: torch.Tensor, +) -> list[tuple[list[int], list[int]]]: + """Run forced sampling on a given samples. + Args: + selected_seq_groups: A list of sequence groups batched. + samples: (num_selected_samples,) A tensor of samples. The length of + samples could be smaller than selected_seq_groups if + seq_group.do_sample is False. + Returns: + Tuple of (next_token_ids, parent_ids). The length of returned list is + same as the length of selected_seq_groups. If the corresponding + seq_group has do_sample=False, tuple contains ([], []) + + The next_token_ids is guided (forced) by the id containing in the + sampling_parameters.future_context property. + """ + samples = samples.tolist() + sample_idx = 0 + results = [] + for seq_group in selected_seq_groups: + seq_ids = seq_group.seq_ids + num_parent_seqs = len(seq_ids) + assert num_parent_seqs == 1, ( + "Deterministic sampling should have only one seq.") + parent_ids = list(range(num_parent_seqs)) + next_token_ids = [samples[sample_idx]] + results.append((next_token_ids, parent_ids)) + sample_idx += num_parent_seqs + return results + + def _random_sample( selected_seq_groups: list[SequenceGroupToSample], random_samples: torch.Tensor, @@ -583,12 +617,14 @@ def get_pythonized_sample_results( ( sample_metadata, sampling_metadata, + forced_samples, greedy_samples, multinomial_samples, sample_results_dict, ) = ( sample_result_args.sample_metadata, sample_result_args.sampling_metadata, + sample_result_args.forced_samples, sample_result_args.greedy_samples, sample_result_args.multinomial_samples, sample_result_args.sample_results_dict, @@ -603,6 +639,8 @@ def get_pythonized_sample_results( elif sampling_type in (SamplingType.RANDOM, SamplingType.RANDOM_SEED): sample_results = _random_sample(seq_groups, multinomial_samples[sampling_type]) + elif sampling_type == SamplingType.FORCED: + sample_results = _forced_sample(seq_groups, forced_samples) sample_results_dict.update(zip(seq_group_id, sample_results)) return [ @@ -644,6 +682,7 @@ def _sample_with_torch( sample_results_dict: SampleResultsDictType = {} sample_metadata: SampleMetadataType = {} multinomial_samples: MultinomialSamplesType = {} + forced_samples: Optional[torch.Tensor] = None greedy_samples: Optional[torch.Tensor] = None # Create output tensor for sampled token ids. @@ -707,6 +746,23 @@ def _sample_with_torch( # Store sampled tokens in output tensor. sampled_token_ids_tensor[long_sample_indices] = \ multinomial_samples[sampling_type].to(torch.long) + elif sampling_type == SamplingType.FORCED: + forced_samples = torch.tensor([], dtype=torch.int32) + for sgidx in range(len(seq_groups)): + if (seq_groups[sgidx].sampling_params.future_context + is not None): + forced_sample = torch.tensor([ + seq_groups[sgidx].sampling_params.future_context[sgidx] + [min( + len(sampling_metadata.seq_groups[sgidx].seq_data[ + sampling_params.cntr[sgidx]].output_token_ids), + len(seq_groups[sgidx].sampling_params. 
+ future_context[sgidx]) - 1)] + ]) + else: + forced_sample = torch.argmax(logprobs[long_sample_indices], + dim=-1) + forced_samples = torch.cat([forced_samples, forced_sample]) else: raise ValueError(f"Unsupported sampling type: {sampling_type}") @@ -716,6 +772,7 @@ def _sample_with_torch( maybe_deferred_args = SampleResultArgsType( sampling_metadata=sampling_metadata, sample_metadata=sample_metadata, + forced_samples=forced_samples, multinomial_samples=multinomial_samples, greedy_samples=greedy_samples, sample_results_dict=sample_results_dict) diff --git a/vllm/model_executor/model_loader/utils.py b/vllm/model_executor/model_loader/utils.py index e6eaade09027..c470f68568ce 100644 --- a/vllm/model_executor/model_loader/utils.py +++ b/vllm/model_executor/model_loader/utils.py @@ -118,6 +118,10 @@ def process_weights_after_loading(model: nn.Module, model_config: ModelConfig, # of process_weights_after_loading module.process_weights_after_loading(model_config.dtype) + if hasattr(model, "process_weights_after_loading"): + with device_loading_context(model, target_device): + model.process_weights_after_loading() + @contextmanager def device_loading_context(module: torch.nn.Module, diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index 817c6bb9a7f9..654c344c9bd3 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -47,7 +47,6 @@ row_parallel_weight_loader) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.model_executor.utils import set_weight_attrs -from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP, SupportsQuant @@ -57,7 +56,6 @@ maybe_prefix) -@torch.compile(backend=current_platform.simple_compile_backend) def layer_norm_func(hidden_states, weight, variance_epsilon): input_dtype = hidden_states.dtype hidden_states = hidden_states.to(torch.float32) diff --git a/vllm/model_executor/models/eagle.py b/vllm/model_executor/models/eagle.py index d219b5228ac3..2af543996ec8 100644 --- a/vllm/model_executor/models/eagle.py +++ b/vllm/model_executor/models/eagle.py @@ -30,7 +30,7 @@ def __init__(self, weight=None, bias=None): self.weight = nn.Parameter(weight) if weight is not None else None self.bias = nn.Parameter(bias) if bias is not None else None - def forward(self, x): + def forward(self, x, residual=None, scale=None): return x diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 5d5080479e51..939f16ec5c7c 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -30,6 +30,8 @@ from torch import nn from transformers import LlamaConfig +import vllm.envs as envs +from vllm import _custom_ops as ops from vllm.attention import Attention, AttentionType from vllm.compilation.decorators import support_torch_compile from vllm.config import CacheConfig, VllmConfig @@ -41,12 +43,15 @@ RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.quantization.fp8 import Fp8Config +from vllm.model_executor.layers.quantization.quark.quark import QuarkConfig from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.vocab_parallel_embedding import ( DEFAULT_VOCAB_PADDING_SIZE, ParallelLMHead, VocabParallelEmbedding) from 
vllm.model_executor.model_loader.weight_utils import ( default_weight_loader, maybe_remap_kv_scale_name) from vllm.model_executor.sampling_metadata import SamplingMetadata +from vllm.platforms import current_platform from vllm.sequence import IntermediateTensors from .interfaces import SupportsLoRA, SupportsPP @@ -84,14 +89,28 @@ def __init__( reduce_results=reduce_results, prefix=f"{prefix}.down_proj", ) + self.use_fp8 = (isinstance(quant_config, Fp8Config) or + (isinstance(quant_config, QuarkConfig) + and quant_config.is_fp8_w8a8()) + if current_platform.is_fp8_fnuz() else False) if hidden_act != "silu": raise ValueError(f"Unsupported activation: {hidden_act}. " "Only silu is supported for now.") self.act_fn = SiluAndMul() def forward(self, x): - x, _ = self.gate_up_proj(x) - x = self.act_fn(x) + if current_platform.is_rocm() and x.shape[0] == 1 and x.shape[1] == 1: + out = torch.empty(x.shape[0], + self.gate_up_proj.weight.shape[0] // 2, + dtype=x.dtype, + device=x.device) + ops.LLMM_Silu(self.gate_up_proj.weight, x.view(-1, x.size(-1)), + out, 8) + x = out.view(x.shape[0], x.shape[1], out.shape[1]) + else: + x, _ = self.gate_up_proj(x) + x = self.act_fn( + x, self.down_proj.input_scale if self.use_fp8 else None) x, _ = self.down_proj(x) return x @@ -180,6 +199,16 @@ def __init__( else: sliding_window = None + # For CUDA devices and Navi4x, attn_fp8 will be set to false. + use_fp8 = isinstance( + quant_config, Fp8Config) or (isinstance(quant_config, QuarkConfig) + and quant_config.is_fp8_w8a8()) + self.attn_fp8_out = (envs.VLLM_USE_ROCM_CUSTOM_PAGED_ATTN_FP8_OUT + and envs.VLLM_USE_TRITON_FLASH_ATTN + and current_platform.is_fp8_fnuz() and use_fp8) + if envs.VLLM_USE_V1 and not envs.VLLM_V1_USE_PREFILL_DECODE_ATTENTION: + self.attn_fp8_out = False + self.attn = Attention( self.num_heads, self.head_dim, @@ -234,6 +263,10 @@ def __init__( ) -> None: super().__init__() self.hidden_size = config.hidden_size + self.use_fp8 = (isinstance(quant_config, Fp8Config) or + (isinstance(quant_config, QuarkConfig) + and quant_config.is_fp8_w8a8()) + if current_platform.is_fp8_fnuz() else False) rope_theta = getattr(config, "rope_theta", 10000) rope_scaling = getattr(config, "rope_scaling", None) if rope_scaling is not None and getattr( @@ -296,18 +329,21 @@ def forward( residual: Optional[torch.Tensor], ) -> tuple[torch.Tensor, torch.Tensor]: # Self Attention + scale = None if not self.use_fp8 else \ + self.self_attn.qkv_proj.input_scale if residual is None: residual = hidden_states - hidden_states = self.input_layernorm(hidden_states) + hidden_states = self.input_layernorm(hidden_states, None, scale) else: hidden_states, residual = self.input_layernorm( - hidden_states, residual) + hidden_states, residual, scale) hidden_states = self.self_attn(positions=positions, hidden_states=hidden_states) # Fully Connected + scale = None if not self.use_fp8 else self.mlp.gate_up_proj.input_scale hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) + hidden_states, residual, scale) hidden_states = self.mlp(hidden_states) return hidden_states, residual @@ -642,3 +678,10 @@ def permute(w: torch.Tensor, n_heads: int): name = name.replace(item, mapping[item]) return name, loaded_weight + + def process_weights_after_loading(self) -> None: + for layer in self.model.layers: + assert isinstance(layer, LlamaDecoderLayer) + if layer.self_attn.attn_fp8_out: + layer.self_attn.attn._out_scale = \ + layer.self_attn.o_proj.input_scale diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py 
index a929366db49c..a3404c6eafe5 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -141,7 +141,8 @@ def use_rocm_custom_paged_attention( and (head_size == 64 or head_size == 128) and (block_size == 16 or block_size == 32) and (gqa_ratio >= 1 and gqa_ratio <= 16) - and max_seq_len <= 32768 and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) + and max_seq_len <= 128 * 1024 + and (envs.VLLM_ROCM_CUSTOM_PAGED_ATTN) and not (envs.VLLM_ROCM_USE_AITER_PAGED_ATTN and envs.VLLM_ROCM_USE_AITER)) diff --git a/vllm/sampling_params.py b/vllm/sampling_params.py index 7abdcecca474..d918ac34707a 100644 --- a/vllm/sampling_params.py +++ b/vllm/sampling_params.py @@ -25,6 +25,7 @@ class SamplingType(IntEnum): GREEDY = 0 RANDOM = 1 RANDOM_SEED = 2 + FORCED = 3 # maybe make msgspec? @@ -154,6 +155,8 @@ class SamplingParams( min_p: Float that represents the minimum probability for a token to be considered, relative to the probability of the most likely token. Must be in [0, 1]. Set to 0 to disable this. + ppl_measurement: Measure perplexity towards the deterministic string + instead of probabilistic regressing. seed: Random seed to use for the generation. stop: list of strings that stop the generation when they are generated. The returned output will not contain the stop strings. @@ -212,6 +215,9 @@ class SamplingParams( top_p: float = 1.0 top_k: int = 0 min_p: float = 0.0 + ppl_measurement: bool = False + future_context: Optional[list[int]] = None + cntr: Optional[list[int]] = None seed: Optional[int] = None stop: Optional[Union[str, list[str]]] = None stop_token_ids: Optional[list[int]] = None @@ -259,6 +265,9 @@ def from_optional( top_p: Optional[float] = 1.0, top_k: int = 0, min_p: float = 0.0, + ppl_measurement: bool = False, + future_context: Optional[list[int]] = None, + cntr: Optional[int] = None, seed: Optional[int] = None, stop: Optional[Union[str, list[str]]] = None, stop_token_ids: Optional[list[int]] = None, @@ -302,6 +311,9 @@ def from_optional( top_p=1.0 if top_p is None else top_p, top_k=top_k, min_p=min_p, + ppl_measurement=ppl_measurement, + future_context=future_context, + cntr=cntr, seed=seed, stop=stop, stop_token_ids=stop_token_ids, @@ -530,6 +542,8 @@ def update_from_tokenizer(self, tokenizer: AnyTokenizer) -> None: @cached_property def sampling_type(self) -> SamplingType: + if self.ppl_measurement: + return SamplingType.FORCED if self.temperature < _SAMPLING_EPS: return SamplingType.GREEDY if self.seed is not None: @@ -570,6 +584,7 @@ def __repr__(self) -> str: f"top_p={self.top_p}, " f"top_k={self.top_k}, " f"min_p={self.min_p}, " + f"ppl_measurement={self.ppl_measurement}, " f"seed={self.seed}, " f"stop={self.stop}, " f"stop_token_ids={self.stop_token_ids}, " diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 52a7a903cd8e..2dd508232244 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -360,6 +360,10 @@ def get_config( raise RuntimeError(err_msg) from e else: raise e + if config.model_type in _CONFIG_REGISTRY: + config_class = _CONFIG_REGISTRY[config.model_type] + config = config_class.from_pretrained( + model, revision=revision, code_revision=code_revision) elif config_format == ConfigFormat.MISTRAL: config = load_params_config(model, revision, **kwargs) diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index 5db592b15010..0900ae155bf5 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -257,7 +257,8 
@@ def forward( v_scale=layer._v_scale, alibi_slopes=self.alibi_slopes, sliding_window=self.sliding_window[0], - sm_scale=self.scale) + sm_scale=self.scale, + fp8_out_scale=layer._out_scale) else: descale_shape = (cu_seqlens_q.shape[0] - 1, key.shape[1])