Commit d7509a6

[Intel HPU] Support intel hpu platform

1 parent 2745f37 · commit d7509a6

36 files changed: +2985 −23 lines

README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -60,6 +60,7 @@ FastDeploy supports inference deployment on **NVIDIA GPUs**, **Kunlunxin XPUs**,
 - [Enflame GCU](./docs/get_started/installation/Enflame_gcu.md)
 - [Hygon DCU](./docs/get_started/installation/hygon_dcu.md)
 - [MetaX GPU](./docs/get_started/installation/metax_gpu.md)
+- [Intel HPU](./docs/get_started/installation/intel_hpu.md)

 **Note:** We are actively working on expanding hardware support. Additional hardware platforms including Ascend NPU are currently under development and testing. Stay tuned for updates!
```

README_CN.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -58,6 +58,7 @@ FastDeploy supports deployment on **NVIDIA GPU**, **Kunlunxin XPU**
 - [Enflame S60](./docs/zh/get_started/installation/Enflame_gcu.md)
 - [Hygon DCU](./docs/zh/get_started/installation/hygon_dcu.md)
 - [MetaX GPU](./docs/zh/get_started/installation/metax_gpu.md)
+- [Intel HPU](./docs/zh/get_started/installation/intel_hpu.md)

 **Note:** We are actively expanding hardware support. Other hardware platforms, including the Ascend NPU, are currently under development and testing. Stay tuned for updates!
```

build.sh

Lines changed: 9 additions & 1 deletion

```diff
@@ -128,6 +128,12 @@ function copy_ops(){
         echo -e "MACA ops have been copy to fastdeploy"
         return
     fi
+    is_intel_hpu=`$python -c "import paddle; print(paddle.is_compiled_with_custom_device('intel_hpu'))"`
+    if [ "$is_intel_hpu" = "True" ]; then
+        DEVICE_TYPE="intel-hpu"
+        echo -e "intel_hpu ops have been copy to fastdeploy"
+        return
+    fi

     DEVICE_TYPE="cpu"
     cd ../../../../
@@ -159,7 +165,9 @@ function build_and_install_ops() {
     else
         FD_BUILDING_ARCS=${FD_BUILDING_ARCS} ${python} setup_ops.py install --install-lib ${OPS_TMP_DIR}
     fi
-    find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    if [ -d "${OPS_TMP_DIR}" ]; then
+        find ${OPS_TMP_DIR} -type f -name "*.o" -exec rm -f {} \;
+    fi
 else
     echo "Error: Invalid parameter '$FD_CPU_USE_BF16'. Please use true or false."
     exit 1
```
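The `copy_ops()` change above probes whether the installed Paddle was compiled with the `intel_hpu` custom device and, on a hit, tags the ops directory `intel-hpu` before falling through to the `cpu` default. The same probe can be sketched in Python; this is an illustrative model of the shell logic, not FastDeploy code, and it degrades gracefully when `paddle` is not installed:

```python
def probe_intel_hpu() -> bool:
    """The same check build.sh runs via `$python -c ...`: ask paddle
    whether it was compiled with the intel_hpu custom device."""
    try:
        import paddle  # may be absent outside a FastDeploy build environment
        return bool(paddle.is_compiled_with_custom_device("intel_hpu"))
    except ImportError:
        return False

# Mirrors the shell cascade: tag the ops "intel-hpu" on a hit, otherwise
# fall through to the existing "cpu" default.
DEVICE_TYPE = "intel-hpu" if probe_intel_hpu() else "cpu"
print(DEVICE_TYPE)
```

As in the script, the check must run before the `cpu` fallback, since each platform branch returns early on the first match.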

custom_ops/setup_ops.py

Lines changed: 2 additions & 0 deletions

```diff
@@ -621,6 +621,8 @@ def find_end_files(directory, end_str):
         ],
     ),
 )
+elif paddle.is_compiled_with_custom_device('intel_hpu'):
+    pass
 else:
     use_bf16 = envs.FD_CPU_USE_BF16 == "True"
```

docs/get_started/installation/README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -7,3 +7,4 @@ FastDeploy currently supports installation on the following hardware platforms:
 - [Enflame S60 GCU Installation](Enflame_gcu.md)
 - [Iluvatar GPU Installation](iluvatar_gpu.md)
 - [Hygon DCU Installation](hygon_dcu.md)
+- [Intel HPU Installation](intel_hpu.md)
```
docs/get_started/installation/intel_hpu.md

Lines changed: 75 additions & 0 deletions

# Intel HPU Installation for Running ERNIE 4.5 Series Models

The following installation steps apply when your environment meets these requirements:

- Python 3.10
- Intel Gaudi 2
- Intel Gaudi software version 1.22.0
- Linux x86_64

### 1. Run the Docker Container

Use the following commands to run a Docker container. Make sure the versions you use match those listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):

```{.console}
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
```

### 2. Install PaddlePaddle

```bash
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
```

### 3. Install PaddleCustomDevice

```shell
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
cd PaddleCustomDevice/backends/intel_hpu/
mkdir -p build
cd build
cmake ..
make -j
pip install --force-reinstall dist/paddle_intel_hpu*.whl
cd ../custom_ops
python setup.py install
```

### 4. Install FastDeploy

```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```

## Prepare the inference demo

### 1. Start the inference service

```shell
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0

HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
```

### 2. Send a request

```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is AI?"}
    ], "max_tokens": 24
  }'
```

### 3. Example of a successful response

```json
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
```
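The request above can also be issued from Python. The following is a minimal client sketch using only the standard library; the host and port come from the `curl` example, the server from step 1 is assumed to be running, and the function names (`build_chat_payload`, `chat`) are illustrative, not part of FastDeploy:

```python
import json
from urllib import request

def build_chat_payload(prompt: str, max_tokens: int = 24) -> dict:
    # Same body as the curl example: one user message plus a token cap.
    return {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
    }

def chat(prompt: str, base_url: str = "http://0.0.0.0:8188") -> str:
    req = request.Request(
        f"{base_url}/v1/chat/completions",
        data=json.dumps(build_chat_payload(prompt)).encode("utf-8"),
        headers={"Content-Type": "application/json"},
    )
    with request.urlopen(req) as resp:
        body = json.load(resp)
    # The answer lives under choices[0].message.content, as in the
    # example response shown above.
    return body["choices"][0]["message"]["content"]

if __name__ == "__main__":
    print(chat("What is AI?"))
```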

docs/zh/get_started/installation/README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -7,3 +7,4 @@ FastDeploy supports the following hardware platforms:
 - [Enflame S60 GCU Installation](Enflame_gcu.md)
 - [Iluvatar GPU Installation](iluvatar_gpu.md)
 - [Hygon DCU Installation](hygon_dcu.md)
+- [Intel HPU Installation](intel_hpu.md)
```
docs/zh/get_started/installation/intel_hpu.md

Lines changed: 75 additions & 0 deletions

# Running ERNIE 4.5 Series Models on Intel HPU

The following steps apply when your environment meets these requirements:

- Python 3.10
- Intel Gaudi 2
- Intel Gaudi software version 1.22.0
- Linux x86_64

### 1. Run the Docker Container

Use the following commands to run a Docker container. Make sure the versions you use match those listed in the [Support Matrix](https://docs.habana.ai/en/latest/Support_Matrix/Support_Matrix.html):

```{.console}
$ docker pull vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
$ docker run -it --runtime=habana -e HABANA_VISIBLE_DEVICES=all -e OMPI_MCA_btl_vader_single_copy_mechanism=none --cap-add=sys_nice --net=host --ipc=host vault.habana.ai/gaudi-docker/1.22.0/ubuntu22.04/habanalabs/pytorch-installer-2.7.1:latest
```

### 2. Install PaddlePaddle

```bash
python -m pip install paddlepaddle==3.1.1 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
```

### 3. Install PaddleCustomDevice

```shell
git clone https://github.com/PaddlePaddle/PaddleCustomDevice
cd PaddleCustomDevice/backends/intel_hpu/
mkdir -p build
cd build
cmake ..
make -j
pip install --force-reinstall dist/paddle_intel_hpu*.whl
cd ../custom_ops
python setup.py install
```

### 4. Install FastDeploy

```shell
git clone https://github.com/PaddlePaddle/FastDeploy
cd FastDeploy
bash build.sh
```

## Prepare the inference demo

### 1. Start the inference service

```shell
export GC_KERNEL_PATH=/usr/lib/habanalabs/libtpc_kernels.so
export GC_KERNEL_PATH=/usr/local/lib/python3.10/dist-packages/paddle_custom_device/intel_hpu/libcustom_tpc_perf_lib.so:$GC_KERNEL_PATH
export INTEL_HPU_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export PADDLE_DISTRI_BACKEND=xccl
export PADDLE_XCCL_BACKEND=intel_hpu
export HABANA_PROFILE=0
export HPU_VISIBLE_DEVICES=0

HPU_WARMUP_BUCKET=1 HPU_WARMUP_MODEL_LEN=4096 FD_ATTENTION_BACKEND=HPU_ATTN python -m fastdeploy.entrypoints.openai.api_server --model ERNIE-4.5-21B-A3B-Paddle --tensor-parallel-size 1 --max-model-len 32768 --max-num-seqs 128
```

### 2. Send a request

```bash
curl -X POST "http://0.0.0.0:8188/v1/chat/completions" \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user", "content": "What is AI?"}
    ], "max_tokens": 24
  }'
```

### 3. Example of a successful response

```json
{"id":"chatcmpl-3bd98ae2-fafe-46ae-a552-d653a8526503","object":"chat.completion","created":1757653575,"model":"ERNIE-4.5-21B-A3B-Paddle","choices":[{"index":0,"message":{"role":"assistant","content":"**AI (Artificial Intelligence)** refers to the development of computer systems that can perform tasks typically requiring human intelligence.","multimodal_content":null,"reasoning_content":null,"tool_calls":null,"prompt_token_ids":null,"completion_token_ids":null,"text_after_process":null,"raw_prediction":null,"prompt_tokens":null,"completion_tokens":null},"logprobs":null,"finish_reason":"length"}],"usage":{"prompt_tokens":11,"total_tokens":35,"completion_tokens":24,"prompt_tokens_details":{"cached_tokens":0}}}
```

fastdeploy/config.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -267,6 +267,7 @@ def __init__(
         self.expert_parallel_size = 1  # EP degree
         self.data_parallel_size = 1  # DP degree
         self.enable_expert_parallel = False
+        self.enable_tensor_or_expert_parallel = False
         self.local_data_parallel_id = 0
         # The embedding weight distributed on your gpu cards is divided by row or column.
         # Defaults to False means divide by row. When vocab_size can not be divided by world_size
@@ -1219,6 +1220,8 @@ def __init__(
         self.device_ids = os.getenv("CUDA_VISIBLE_DEVICES", self.device_ids)
         if current_platform.is_xpu():
             self.device_ids = os.getenv("XPU_VISIBLE_DEVICES", self.device_ids)
+        if current_platform.is_intel_hpu():
+            self.device_ids = os.getenv("HPU_VISIBLE_DEVICES", self.device_ids)

         self.read_from_config()
         self.postprocess()
```
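The `device_ids` change above follows a per-platform environment-variable override pattern: each backend exposes its visible-device list under its own variable, and the previously computed default survives when that variable is unset. A standalone sketch of the pattern; the variable names come from the diff, while `resolve_device_ids` and the platform-string keys are illustrative, not FastDeploy API:

```python
import os

# Each platform publishes its visible devices under a different variable.
_DEVICE_ENV_VARS = {
    "cuda": "CUDA_VISIBLE_DEVICES",
    "xpu": "XPU_VISIBLE_DEVICES",
    "intel_hpu": "HPU_VISIBLE_DEVICES",
}

def resolve_device_ids(platform: str, default_ids: str) -> str:
    """Return the platform's visible-device list, falling back to the
    previously computed default when the variable is unset."""
    env_var = _DEVICE_ENV_VARS.get(platform)
    if env_var is None:
        return default_ids
    return os.getenv(env_var, default_ids)

os.environ["HPU_VISIBLE_DEVICES"] = "0,1"
print(resolve_device_ids("intel_hpu", "0"))  # 0,1
```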

fastdeploy/distributed/communication.py

Lines changed: 23 additions & 0 deletions

```diff
@@ -66,3 +66,26 @@ def tensor_model_parallel_all_reduce(
 except:
     tensor_model_parallel_all_reduce = None
+
+from paddle.distributed.communication import stream
+from paddle.distributed.communication.reduce import ReduceOp
+
+def all_reduce(
+    tensor,
+    op,
+    group,
+    sync_op: bool = True,
+):
+    return stream.all_reduce(
+        tensor, op=op, group=group, sync_op=sync_op, use_calc_stream=True
+    )
+
+@paddle.jit.marker.unified
+def tensor_model_parallel_all_reduce_custom(input_: paddle.Tensor) -> paddle.Tensor:
+    """All-reduce the input tensor across model parallel group on calc stream."""
+    if paddle.in_dynamic_mode():
+        hcg = dist.fleet.get_hybrid_communicate_group()
+        mp_group = hcg.get_model_parallel_group()
+        all_reduce(input_, op=ReduceOp.SUM, group=mp_group)
+    else:
+        dist.all_reduce(input_)
```