diff --git a/.precommit/check_imports.py b/.precommit/check_imports.py index 2e736cb201..94ff9c64be 100644 --- a/.precommit/check_imports.py +++ b/.precommit/check_imports.py @@ -24,13 +24,13 @@ from stdlib_list import stdlib_list sys.path.append(str(pathlib.Path(__file__).parent.parent)) -from setup import DEP_SPECS, REQUIRED_DEPS +from setup import REQUIRED_DEPS # NOTE: We do not use `importlib.metadata.packages_distributions` here because # 1. It is supported only in Python 3.10+. # 2. It requires the packages to be installed, but we are doing a static check. MOD_TO_DEP = { - "aistudio_sdk": "aistudio_sdk", + "aistudio-sdk": "aistudio_sdk", "aiohttp": "aiohttp", "baidubce": "bce-python-sdk", "bs4": "beautifulsoup4", @@ -43,9 +43,10 @@ "fastapi": "fastapi", "filelock": "filelock", "filetype": "filetype", + "flash_attn": "flash-attn", "ftfy": "ftfy", "GPUtil": "GPUtil", - "huggingface_hub": "huggingface_hub", + "huggingface_hub": "huggingface-hub", "imagesize": "imagesize", "jinja2": "Jinja2", "joblib": "joblib", @@ -61,6 +62,7 @@ "cv2": "opencv-contrib-python", "openpyxl": "openpyxl", "packaging": "packaging", + "paddle2onnx": "paddle2onnx", "pandas": "pandas", "PIL": "pillow", "premailer": "premailer", @@ -76,20 +78,24 @@ "ruamel.yaml": "ruamel.yaml", "skimage": "scikit-image", "sklearn": "scikit-learn", + "sentencepiece": "sentencepiece", + "sglang": "sglang", "shapely": "shapely", "soundfile": "soundfile", "starlette": "starlette", "tiktoken": "tiktoken", "tokenizers": "tokenizers", + "torch": "torch", "tqdm": "tqdm", + "transformers": "transformers", "typing_extensions": "typing-extensions", "ujson": "ujson", "uvicorn": "uvicorn", + "uvloop": "uvloop", + "vllm": "vllm", + "xformers": "xformers", "yarl": "yarl", } -assert ( - set(MOD_TO_DEP.values()) == DEP_SPECS.keys() -), f"`MOD_TO_DEP` should be updated to match `DEP_SPECS`. Symmetric difference: {set(MOD_TO_DEP.values()) ^ DEP_SPECS.keys()}" MOD_PATTERN = re.compile( rf"^(?:{'|'.join([re.escape(mod) for mod in MOD_TO_DEP])})(?=\.|$)" ) @@ -107,7 +113,11 @@ "paddle3d", "paddlevideo", } -MANUALLY_MANAGED_OPTIONAL_HEAVY_MODS = {"paddle_custom_device", "ultra_infer"} +MANUALLY_MANAGED_OPTIONAL_HEAVY_MODS = { + "paddle_custom_device", + "ultra_infer", + "fastdeploy", +} def check(file_path): diff --git a/THIRD_PARTY_LICENSES/sglang/LICENSE b/THIRD_PARTY_LICENSES/sglang/LICENSE new file mode 100644 index 0000000000..9c422689c8 --- /dev/null +++ b/THIRD_PARTY_LICENSES/sglang/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2023-2024 SGLang Team + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/THIRD_PARTY_LICENSES/transformers/LICENSE b/THIRD_PARTY_LICENSES/transformers/LICENSE new file mode 100644 index 0000000000..68b7d66c97 --- /dev/null +++ b/THIRD_PARTY_LICENSES/transformers/LICENSE @@ -0,0 +1,203 @@ +Copyright 2018- The Hugging Face team. All rights reserved. 
+ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. 
Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/docs/installation/installation.en.md b/docs/installation/installation.en.md index b861147bb0..9a2a3510a3 100644 --- a/docs/installation/installation.en.md +++ b/docs/installation/installation.en.md @@ -4,6 +4,9 @@ comments: true # PaddleX Local Installation Tutorial > ❗Before installing PaddleX, please ensure you have a basic Python environment (Note: Currently supports Python 3.8 to Python 3.12, with more Python versions being adapted). + +> ❗In most cases, you need to first install the PaddlePaddle framework by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md) before proceeding with PaddleX's installation steps. [4 PaddleX's Dependency on PaddlePaddle Framework](#4-paddlexs-dependency-on-paddlepaddle-framework) lists scenarios where installing the PaddlePaddle framework is not required. + ## 1. Quick Installation Welcome to PaddleX, Baidu's low-code development tool for AI. Before we dive into the local installation process, please clarify your development needs and choose the appropriate installation mode. @@ -12,16 +15,14 @@ PaddleX offers two installation modes: Wheel Package Installation and ### 1.1 Wheel Package Installation Mode If your use case for PaddleX involves model inference and integration, we recommend the more convenient and lightweight Wheel package installation mode. -After installing PaddlePaddle (refer to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md)), you can quickly install the PaddleX Wheel package by executing the following commands: - -> ❗ Note: Please ensure that PaddlePaddle is successfully installed before proceeding to the next step. +You can quickly install the PaddleX Wheel package by executing the following commands: ```bash # Only install the required dependencies (optional dependencies can be installed later as needed) pip install paddlex ``` -You can install the optional dependencies as needed using the following method (For more details, please refer to [2.3 Selective Installation of Dependencies](#23-selective-installation-of-dependencies)): +You can install the optional dependencies as needed using the following method (For more details, please refer to [3 Selective Installation of Dependencies](#3-selective-installation-of-dependencies)): Install all dependencies required for PaddleX "basic features": @@ -121,7 +122,7 @@ The model training related plugins supported by PaddleX are listed below. 
Please -If the plugin you need to install is `PaddleXXX`, after installing PaddlePaddle (refer to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md)), you can quickly install the corresponding PaddleX plugin by executing the following commands: +If the plugin you need to install is `PaddleXXX`, you can quickly install the corresponding PaddleX plugin by executing the following commands: ```bash git clone https://github.com/PaddlePaddle/PaddleX.git @@ -139,7 +140,7 @@ When installing PaddleX on Linux, we strongly recommend using the official Pa When using the official Docker image, PaddlePaddle, PaddleX (including the wheel package and all plugins), and the corresponding CUDA environment are already pre-installed. You can simply obtain the Docker image and start the container to begin using it. Please note that the official Docker image of PaddleX is different from the official Docker image of the PaddlePaddle framework, as the latter does not come with PaddleX pre-installed. -When using custom installation methods, you need to first install the PaddlePaddle framework, then obtain the PaddleX source code, and finally choose the PaddleX installation mode. +When using custom installation methods, you need to first install the PaddlePaddle framework (except for [a few cases](#4-paddlexs-dependency-on-paddlepaddle-framework)), then obtain the PaddleX source code, and finally choose the PaddleX installation mode. ### 2.1 Get PaddleX based on Docker Using the PaddleX official Docker image, create a container called 'paddlex' and map the current working directory to the '/paddle' directory inside the container by following the command. @@ -177,7 +178,6 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -i * If you want to delve deeper into the principles or usage of Docker, please refer to the [Docker Official Website](https://www.docker.com/) or the [Docker Official Tutorial](https://docs.docker.com/get-started/). ### 2.2 Custom Installation of PaddleX -Before installation, please ensure you have completed the local installation of PaddlePaddle by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md). #### 2.2.1 Obtain PaddleX Source Code Next, use the following command to obtain the latest PaddleX source code from GitHub: @@ -248,7 +248,7 @@ All packages are installed. For PaddleX installation on more hardware environments, please refer to the [PaddleX Multi-hardware Usage Guide](../other_devices_support/multi_devices_use_guide.en.md) -### 2.3 Selective Installation of Dependencies +## 3 Selective Installation of Dependencies PaddleX offers a wide range of features, and different features require different dependencies. The features in PaddleX that can be used without installing plugins are categorized as "basic features." The official PaddleX Docker images have all dependencies required for these basic features preinstalled. Similarly, using the installation method introduced earlier—`pip install "...[base]"`—will install all dependencies needed for the basic features. @@ -279,8 +279,17 @@ PaddleX currently provides the following dependency groups: | `ts` | Basic features of time series pipelines. | | `video` | Basic features of video pipelines. | | `trans` | Basic features of translation pipelines. | +| `genai-client` | The generative AI client feature. Installing this group is equivalent to installing the generative AI client plugin; the plugin can also be installed via the PaddleX CLI. 
| +| `genai-sglang-server` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. | +| `genai-vllm-server` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. | | `serving` | The serving feature. Installing this group is equivalent to installing the PaddleX serving plugin; the plugin can also be installed via the PaddleX CLI. | -| `plugins` | All plugin-provided features that support installation via dependency groups. | -| `all` | All basic features of PaddleX, as well as all plugin-provided features installable via dependency groups. | +| `paddle2onnx` | The Paddle2ONNX feature. Installing this group is equivalent to installing the PaddleX Paddle2ONNX plugin; the plugin can also be installed via the PaddleX CLI. | Each pipeline belongs to exactly one dependency group. You can refer to the tutorial of each pipeline to find out which dependency group it belongs to. For modules, you can access the related basic features by installing any dependency group that includes the module. + +## 4 PaddleX's Dependency on PaddlePaddle Framework + +The vast majority of PaddleX's functionalities rely on the PaddlePaddle framework. Therefore, in most cases, you need to install the PaddlePaddle framework before using PaddleX by referring to the [PaddlePaddle Local Installation Tutorial](paddlepaddle_install.en.md). However, for the following scenarios, you can use the corresponding features without installing the PaddlePaddle framework: + +- Using the capabilities provided by PaddleX's `genai-vllm-server` or `genai-sglang-server` plugins to deploy model inference services. +- Using PaddleX's `genai-client` plugin to call generative AI inference services. diff --git a/docs/installation/installation.md b/docs/installation/installation.md index 73e8db0449..46fbaab074 100644 --- a/docs/installation/installation.md +++ b/docs/installation/installation.md @@ -5,6 +5,8 @@ comments: true # PaddleX本地安装教程 > ❗安装 PaddleX 前请先确保您有基础的 Python 运行环境(注:当前支持Python 3.8 ~ Python 3.12下运行)。 +> ❗在大多数情况下,您需要先参考 [飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md) 安装飞桨框架,再执行 PaddleX 的安装步骤。[4 PaddleX 对飞桨框架的依赖](#4-paddlex-对飞桨框架的依赖) 中列举了不需要安装飞桨框架的情形。 + ## 1. 
快速安装 欢迎您使用飞桨低代码开发工具PaddleX,在我们正式开始本地安装之前,请首先明确您的开发需求,并根据您的需求选择合适的安装模式。 PaddleX为您提供了两种安装模式:Wheel包安装插件安装,下面分别对其应用场景进行介绍: @@ -14,16 +16,14 @@ PaddleX为您提供了两种安装模式:Wheel包安装插件安 快速安装轻量级的Wheel包之后,您即可基于PaddleX支持的所有模型进行推理,并能直接集成进您的项目中。 -参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)安装飞桨后,您可直接执行如下指令快速安装PaddleX的Wheel包: - -> ❗ 注:请务必保证 PaddlePaddle 安装成功,安装成功后,方可进行下一步。 +您可直接执行如下指令快速安装PaddleX的Wheel包: ```bash # 仅安装必须依赖(可以在之后按需安装可选依赖) pip install paddlex ``` -通过如下方式可以安装所需的可选依赖(更多说明请参考 [2.3 选择性安装依赖](#23-选择性安装依赖)): +通过如下方式可以安装所需的可选依赖(更多说明请参考 [3 选择性安装依赖](#3-选择性安装依赖)): 安装 PaddleX “基础功能”需要的全部依赖: @@ -124,7 +124,7 @@ PaddleX支持的模型训练相关插件如下,请您根据开发需求,确 -若您需要安装的插件为`PaddleXXX`,在参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)安装飞桨后,您可以直接执行如下指令快速安装PaddleX的对应插件: +若您需要安装的插件为`PaddleXXX`,可以直接执行如下指令快速安装PaddleX的对应插件: ```bash git clone https://github.com/PaddlePaddle/PaddleX.git @@ -144,7 +144,7 @@ paddlex --install PaddleXXX # 例如PaddleOCR 当您使用官方 Docker 镜像安装时,其中已经内置了 PaddlePaddle、PaddleX(包括wheel包和所有插件),并配置好了相应的CUDA环境,您获取 Docker 镜像并启动容器即可开始使用请注意,PaddleX 官方 Docker 镜像与飞桨框架官方 Docker 镜像不同,后者并没有预装 PaddleX。 -当您使用自定义方式安装时,需要先安装飞桨 PaddlePaddle 框架,随后获取 PaddleX 源码,最后选择PaddleX的安装模式。 +当您使用自定义方式安装时,需要先安装飞桨 PaddlePaddle 框架(除 [少数情形](#4-paddlex-对飞桨框架的依赖) 外),随后获取 PaddleX 源码,最后选择PaddleX的安装模式。 > ❗ 无需关注物理机上的 CUDA 版本,只需关注显卡驱动程序版本。 @@ -187,7 +187,6 @@ nvidia-docker run --name paddlex -v $PWD:/paddle --shm-size=8g --network=host -i * 若您想更深入了解 Docker 的原理或使用方式,请参考 [Docker官方网站](https://www.docker.com/) 或 [Docker官方教程](https://docs.docker.com/get-started/)。 ### 2.2 自定义方式安装PaddleX -在安装之前,请确保您已经参考[飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md)完成飞桨的本地安装。 #### 2.2.1 获取 PaddleX 源码 接下来,请使用以下命令从 GitHub 获取 PaddleX 最新源码: @@ -251,7 +250,7 @@ All packages are installed. ``` 更多硬件环境的PaddleX安装请参考[PaddleX多硬件使用指南](../other_devices_support/multi_devices_use_guide.md) -### 2.3 选择性安装依赖 +## 3 选择性安装依赖 PaddleX 的功能丰富,而不同的功能需要的依赖也不尽相同。将 PaddleX 中不需要安装插件即可使用的功能归类为“基础功能”。PaddleX 官方 Docker 镜像预置了基础功能所需的全部依赖;使用上文介绍的 `pip install "...[base]"` 的安装方式也将安装基础功能需要的所有依赖。如果您只专注于 PaddleX 的某一项功能,且希望保持安装的依赖的体积尽可能小,可以通过指定“依赖组”的方式,选择性地安装依赖: @@ -280,9 +279,17 @@ PaddleX 目前提供如下依赖组: | `ts` | 时序产线的基础功能。 | | `video` | 视频产线的基础功能。 | | `trans` | 翻译产线的基础功能。 | +| `genai-client` | 生成式 AI 客户端功能。安装此依赖组等效于安装 PaddleX 生成式 AI 客户端插件;也可以通过 PaddleX CLI 安装生成式 AI 客户端插件。 | +| `genai-sglang-server` | SGLang 服务器功能。安装此依赖组等效于安装 PaddleX SGLang 服务器插件;也可以通过 PaddleX CLI 安装SGLang 服务器插件。 | +| `genai-vllm-server` | vLLM 服务器功能。安装此依赖组等效于安装 PaddleX vLLM 服务器插件;也可以通过 PaddleX CLI 安装 vLLM 服务器插件。 | | `serving` | 服务化部署功能。安装此依赖组等效于安装 PaddleX 服务化部署插件;也可以通过 PaddleX CLI 安装服务化部署插件。 | -| `plugins` | 所有支持通过指定依赖组安装的插件提供的功能。 | -| `all` | PaddleX 的所有基础功能,以及所有支持通过指定依赖组安装的插件提供的功能。 | - +| `paddle2onnx` | Paddle2ONNX 功能。安装此依赖组等效于安装 PaddleX Paddle2ONNX 插件;也可以通过 PaddleX CLI 安装 Paddle2ONNX 插件。 | 每一条产线属于且仅属于一个依赖组;在各产线的使用文档中可以了解产线属于哪一依赖组。对于单功能模块,安装任意包含该模块的产线对应的依赖组后即可使用相关的基础功能。 + +## 4 PaddleX 对飞桨框架的依赖 + +PaddleX 的绝大部分功能依赖飞桨框架,因此,在大多数情况下,您需要在使用 PaddleX 前参考 [飞桨PaddlePaddle本地安装教程](paddlepaddle_install.md) 安装飞桨框架。不过,对于以下几种情形,不必安装飞桨框架也可以使用相应的功能: + +- 使用 PaddleX `genai-vllm-server` 或 `genai-sglang-server` 插件提供的能力部署模型推理服务。 +- 使用 PaddleX `genai-client` 插件调用生成式 AI 推理服务。 diff --git a/docs/module_usage/instructions/distributed_training.en.md b/docs/module_usage/instructions/distributed_training.en.md index 850d3c6418..7bd3ba41e4 100644 --- a/docs/module_usage/instructions/distributed_training.en.md +++ b/docs/module_usage/instructions/distributed_training.en.md @@ -23,4 +23,4 @@ python main.py -c paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yam - The IP 
addresses of different machines should be separated by commas and can be checked using `ifconfig` or `ipconfig`. - Passwordless SSH should be set up between different machines, and they should be able to ping each other directly; otherwise, communication cannot be completed. -- The code, data, and execution commands or scripts must be consistent across all machines, and the training command or script must be run on all machines. Finally, the first device of the first machine in the `Train.dist_ips` list will be trainer0, and so on. \ No newline at end of file +- The code, data, and execution commands or scripts must be consistent across all machines, and the training command or script must be run on all machines. Finally, the first device of the first machine in the `Train.dist_ips` list will be trainer0, and so on. diff --git a/docs/module_usage/instructions/distributed_training.md b/docs/module_usage/instructions/distributed_training.md index ca09d55044..5cf29a0038 100644 --- a/docs/module_usage/instructions/distributed_training.md +++ b/docs/module_usage/instructions/distributed_training.md @@ -23,4 +23,4 @@ python main.py -c paddlex/configs/modules/image_classification/PP-LCNet_x1_0.yam - 不同机器的ip信息需要用逗号隔开,可以通过 `ifconfig` 或者 `ipconfig` 查看。 - 不同机器之间需要做免密设置,且可以直接ping通,否则无法完成通信。 -- 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终 `Train.dist_ips` 中的第一台机器的第一块设备是trainer0,以此类推。 \ No newline at end of file +- 不同机器之间的代码、数据与运行命令或脚本需要保持一致,且所有的机器上都需要运行设置好的训练命令或者脚本。最终 `Train.dist_ips` 中的第一台机器的第一块设备是trainer0,以此类推。 diff --git a/docs/module_usage/instructions/model_python_API.en.md b/docs/module_usage/instructions/model_python_API.en.md index 3d026b8479..aaebaac0d1 100644 --- a/docs/module_usage/instructions/model_python_API.en.md +++ b/docs/module_usage/instructions/model_python_API.en.md @@ -32,12 +32,13 @@ In short, just three steps: * `create_model`: Instantiate the prediction model object; * Parameters: * `model_name`: `str` type, model name, such as "PP-LCNet_x1_0", "/path/to/PP-LCNet_x1_0_infer/"; - * `model_dir`: `str` type, local path to directory of inference model files ,such as "/path/to/PP-LCNet_x1_0_infer/", default to `None`, means that use the official model specified by `model_name`; + * `model_dir`: `str | None` type, local path to directory of inference model files ,such as "/path/to/PP-LCNet_x1_0_infer/", default to `None`, means that use the official model specified by `model_name`; * `batch_size`: `int` type, default to `1`; * `device`: `str` type, used to set the inference device, such as "cpu", "gpu:2" for GPU settings. By default, using 0 id GPU if available, otherwise CPU; * `pp_option`: `PaddlePredictorOption` type, used to change inference settings (e.g. the operating mode). Please refer to [4-Inference Configuration](#4-inference-configuration) for more details; - * `use_hpip`:`bool` type, whether to enable the high-performance inference plugin; - * `hpi_config`:`dict | None` type, high-performance inference configuration; + * `use_hpip`: `bool` type, whether to enable the high-performance inference plugin; + * `hpi_config`: `dict | None` type, high-performance inference configuration; + * `genai_config`: `dict | None` type, generative AI configuration; * _`inference hyperparameters`_: used to set common inference hyperparameters. Please refer to specific model description document for details. ### 2. 
Perform Inference Prediction by Calling the `predict()` Method of the Prediction Model Object diff --git a/docs/module_usage/instructions/model_python_API.md b/docs/module_usage/instructions/model_python_API.md index 2ec8352a87..47c923c6e8 100644 --- a/docs/module_usage/instructions/model_python_API.md +++ b/docs/module_usage/instructions/model_python_API.md @@ -33,12 +33,13 @@ for res in output: * `create_model`:实例化预测模型对象; * 参数: * `model_name`:`str` 类型,模型名,如“PP-LCNet_x1_0”; - * `model_dir`:`str` 类型,本地 inference 模型文件目录路径,如“/path/to/PP-LCNet_x1_0_infer/”,默认为 `None`,表示使用`model_name`指定的官方推理模型; + * `model_dir`:`str | None` 类型,本地 inference 模型文件目录路径,如“/path/to/PP-LCNet_x1_0_infer/”,默认为 `None`,表示使用`model_name`指定的官方推理模型或不使用本地模型; * `batch_size`:`int` 类型,默认为 `1`; * `device`:`str` 类型,用于设置模型推理设备,如为GPU设置则可以指定卡号,如“cpu”、“gpu:2”,默认情况下,如GPU可用,则使用GPU 0,否则使用CPU; * `pp_option`:`PaddlePredictorOption` 类型,用于改变运行模式等配置项,关于推理配置的详细说明,请参考下文[4-推理配置](#4-推理配置); * `use_hpip`:`bool` 类型,是否启用高性能推理插件; * `hpi_config`:`dict | None` 类型,高性能推理配置; + * `genai_config`:`dict | None` 类型,生成式 AI 配置; * _`推理超参数`_:支持常见推理超参数的修改,具体参数说明详见具体模型文档; ### 2. 调用预测模型对象的`predict()`方法进行推理预测 diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md index 73ad164049..96818cee3e 100644 --- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md +++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.en.md @@ -1582,7 +1582,7 @@ In the above Python script, the following steps are executed:
  • bool: True or False;
- • None: If set to None, the default value initialized in the pipeline will be used, initialized as True;
+ • None: If set to None, the default value initialized in the pipeline will be used, initialized as False;
None
diff --git a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
index 27c51bb067..630fc0de37 100644
--- a/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
+++ b/docs/pipeline_usage/tutorials/ocr_pipelines/PP-StructureV3.md
@@ -1540,7 +1540,7 @@ for item in markdown_images:
  • bool:True 或者 False;
- • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为True;
+ • None:如果设置为None, 将默认使用产线初始化的该参数值,初始化为False;
None diff --git a/paddlex/inference/genai/__init__.py b/paddlex/inference/genai/__init__.py new file mode 100644 index 0000000000..3fb06b39fe --- /dev/null +++ b/paddlex/inference/genai/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ...utils.deps import require_genai_engine_plugin + +require_genai_engine_plugin() diff --git a/paddlex/inference/genai/backends/__init__.py b/paddlex/inference/genai/backends/__init__.py new file mode 100644 index 0000000000..b64cf01fdc --- /dev/null +++ b/paddlex/inference/genai/backends/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlex/inference/genai/backends/fastdeploy.py b/paddlex/inference/genai/backends/fastdeploy.py new file mode 100644 index 0000000000..e7f2d85f86 --- /dev/null +++ b/paddlex/inference/genai/backends/fastdeploy.py @@ -0,0 +1,47 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys + +from ....utils.deps import require_genai_engine_plugin +from ..configs.utils import ( + backend_config_to_args, + set_config_defaults, + update_backend_config, +) + + +def run_fastdeploy_server( + host, port, model_name, model_dir, config, chat_template_path +): + require_genai_engine_plugin("fastdeploy-server") + + if chat_template_path: + set_config_defaults(config, {"chat-template": str(chat_template_path)}) + + update_backend_config( + config, + { + "model": model_dir, + "host": host, + "port": port, + }, + ) + + args = backend_config_to_args(config) + sys.argv[1:] = args + + from fastdeploy.entrypoints.openai.api_server import main as run + + run() diff --git a/paddlex/inference/genai/backends/sglang.py b/paddlex/inference/genai/backends/sglang.py new file mode 100644 index 0000000000..5b28b7831a --- /dev/null +++ b/paddlex/inference/genai/backends/sglang.py @@ -0,0 +1,108 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import subprocess +import sys +import tempfile +import textwrap + +from ....utils.deps import require_genai_engine_plugin + + +def run_sglang_server(host, port, model_name, model_dir, config, chat_template_path): + require_genai_engine_plugin("sglang-server") + + data = json.dumps( + { + "host": host, + "port": port, + "model_name": model_name, + "model_dir": model_dir, + "config": config, + "chat_template_path": str(chat_template_path), + } + ) + + # HACK + code = textwrap.dedent( + f""" + import json + import os + + from paddlex.inference.genai.configs.utils import ( + backend_config_to_args, + set_config_defaults, + update_backend_config, + ) + from paddlex.inference.genai.models import get_model_components + from sglang.srt.configs.model_config import multimodal_model_archs + from sglang.srt.entrypoints.http_server import launch_server + from sglang.srt.managers.multimodal_processor import PROCESSOR_MAPPING + from sglang.srt.models.registry import ModelRegistry + from sglang.srt.server_args import prepare_server_args + from sglang.srt.utils import kill_process_tree + + data = json.loads({repr(data)}) + + host = data["host"] + port = data["port"] + model_name = data["model_name"] + model_dir = data["model_dir"] + config = data["config"] + chat_template_path = data["chat_template_path"] + + network_class, processor_class = get_model_components(model_name, "sglang") + + ModelRegistry.models[network_class.__name__] = network_class + multimodal_model_archs.append(network_class.__name__) + PROCESSOR_MAPPING[network_class] = processor_class + + set_config_defaults(config, {{"served-model-name": model_name}}) + + if chat_template_path: + set_config_defaults(config, {{"chat-template": chat_template_path}}) + + set_config_defaults(config, {{"enable-metrics": True}}) + + update_backend_config( + config, + {{ + "model-path": model_dir, + "host": host, + "port": port, + }}, + ) + + if __name__ == "__main__": + args = backend_config_to_args(config) + + server_args = 
prepare_server_args(args) + + try: + launch_server(server_args) + finally: + kill_process_tree(os.getpid(), include_parent=False) + """ + ) + + with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f: + f.write(code) + script_path = f.name + + try: + subprocess.check_call([sys.executable, script_path]) + finally: + os.unlink(script_path) diff --git a/paddlex/inference/genai/backends/vllm.py b/paddlex/inference/genai/backends/vllm.py new file mode 100644 index 0000000000..35a1b77eaa --- /dev/null +++ b/paddlex/inference/genai/backends/vllm.py @@ -0,0 +1,68 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....utils.deps import is_genai_engine_plugin_available, require_genai_engine_plugin +from ..configs.utils import ( + backend_config_to_args, + set_config_defaults, + update_backend_config, +) +from ..models import ALL_MODEL_NAMES, get_model_components + + +def register_models(): + from vllm import ModelRegistry + + if is_genai_engine_plugin_available("vllm-server"): + for model_name in ALL_MODEL_NAMES: + if model_name not in ModelRegistry.get_supported_archs(): + net_cls, _ = get_model_components(model_name, "vllm") + ModelRegistry.register_model(net_cls.__name__, net_cls) + + +def run_vllm_server(host, port, model_name, model_dir, config, chat_template_path): + require_genai_engine_plugin("vllm-server") + + import uvloop + from vllm.entrypoints.openai.api_server import ( + FlexibleArgumentParser, + cli_env_setup, + make_arg_parser, + run_server, + validate_parsed_serve_args, + ) + + cli_env_setup() + parser = FlexibleArgumentParser() + parser = make_arg_parser(parser) + + set_config_defaults(config, {"served-model-name": model_name}) + + if chat_template_path: + set_config_defaults(config, {"chat-template": str(chat_template_path)}) + + update_backend_config( + config, + { + "model": model_dir, + "host": host, + "port": port, + }, + ) + + args = backend_config_to_args(config) + args = parser.parse_args(args) + validate_parsed_serve_args(args) + + uvloop.run(run_server(args)) diff --git a/paddlex/inference/genai/chat_templates/PaddleOCR-VL.jinja b/paddlex/inference/genai/chat_templates/PaddleOCR-VL.jinja new file mode 100644 index 0000000000..11a4f373f5 --- /dev/null +++ b/paddlex/inference/genai/chat_templates/PaddleOCR-VL.jinja @@ -0,0 +1,45 @@ +{%- if not add_generation_prompt is defined -%} + {%- set add_generation_prompt = true -%} +{%- endif -%} +{%- if not cls_token is defined -%} + {%- set cls_token = "<|begin_of_sentence|>" -%} +{%- endif -%} +{%- if not sep_token is defined -%} + {%- set sep_token = "<|end_of_sentence|>" -%} +{%- endif -%} +{%- if not image_token is defined -%} + {%- set image_token = "<|vision_start|><|image_pad|><|vision_end|>" -%} +{%- endif -%} +{{- cls_token -}} +{%- for message in messages -%} + {%- if message["role"] == "user" -%} + {{- "User: " -}} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ content["text"] }} + {%- elif 
content["type"] == "image" -%} + {{ image_token }} + {%- endif -%} + {%- endfor -%} + {{ "\n" -}} + {%- elif message["role"] == "assistant" -%} + {{- "Assistant: " -}} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ content["text"] }} + {%- elif content["type"] == "image" -%} + {{ image_token }} + {%- endif -%} + {%- endfor -%} + {{ sep_token -}} + {%- elif message["role"] == "system" -%} + {%- for content in message["content"] -%} + {%- if content["type"] == "text" -%} + {{ content["text"] + "\n" }} + {%- endif -%} + {%- endfor -%} + {%- endif -%} +{%- endfor -%} +{%- if add_generation_prompt -%} + {{- "Assistant: " -}} +{%- endif -%} diff --git a/paddlex/inference/genai/chat_templates/__init__.py b/paddlex/inference/genai/chat_templates/__init__.py new file mode 100644 index 0000000000..b64cf01fdc --- /dev/null +++ b/paddlex/inference/genai/chat_templates/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlex/inference/genai/configs/__init__.py b/paddlex/inference/genai/configs/__init__.py new file mode 100644 index 0000000000..b64cf01fdc --- /dev/null +++ b/paddlex/inference/genai/configs/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlex/inference/genai/configs/paddleocr_vl.py b/paddlex/inference/genai/configs/paddleocr_vl.py new file mode 100644 index 0000000000..a3f80a65dd --- /dev/null +++ b/paddlex/inference/genai/configs/paddleocr_vl.py @@ -0,0 +1,38 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +def get_config(backend): + if backend == "fastdeploy": + return { + "gpu-memory-utilization": 0.3, + "max-model-len": 16384, + "max-num-batched-tokens": 16384, + } + elif backend == "vllm": + return { + "trust-remote-code": True, + "gpu-memory-utilization": 0.3, + "max-model-len": 16384, + "max-num-batched-tokens": 16384, + } + elif backend == "sglang": + return { + "trust-remote-code": True, + "mem-fraction-static": 0.3, + "context-length": 16384, + "max-prefill-tokens": 16384, + } + else: + raise ValueError(f"Unsupported backend: {backend}") diff --git a/paddlex/inference/genai/configs/utils.py b/paddlex/inference/genai/configs/utils.py new file mode 100644 index 0000000000..9df90aa91a --- /dev/null +++ b/paddlex/inference/genai/configs/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import yaml + + +def load_backend_config(config_path): + with open(config_path, "r", encoding="utf-8") as f: + config = yaml.safe_load(f) + return config + + +def update_backend_config(config, overrides): + for k, v in overrides.items(): + config[k] = v + + +def set_config_defaults(config, defaults): + for k, v in defaults.items(): + if k not in config: + config[k] = v + + +def backend_config_to_args(config): + # Limited support + args = [] + for k, v in config.items(): + opt = "--" + k + args.append(opt) + if not isinstance(v, bool): + args.append(str(v)) + return args diff --git a/paddlex/inference/genai/constants.py b/paddlex/inference/genai/constants.py new file mode 100644 index 0000000000..ef8bfc87b0 --- /dev/null +++ b/paddlex/inference/genai/constants.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +SUPPORTED_BACKENDS = ("fastdeploy", "vllm", "sglang") +DEFAULT_BACKEND = "fastdeploy" diff --git a/paddlex/inference/genai/models/__init__.py b/paddlex/inference/genai/models/__init__.py new file mode 100644 index 0000000000..b252caf80d --- /dev/null +++ b/paddlex/inference/genai/models/__init__.py @@ -0,0 +1,130 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import importlib +from pathlib import Path +from typing import Any, Dict, Optional, Type + +from pydantic import BaseModel + +from ....utils import logging +from ...utils.official_models import official_models +from ..utils import check_backend, model_name_to_module_name + +NETWORK_CLASS_GETTER_KEY = "get_network_class" +PROCESSOR_CLASS_GETTER_KEY = "get_processor_class" +CONFIG_GETTER_KEY = "get_config" +CHAT_TEMPLATE_PATH_GETTER_KEY = "get_chat_template_path" +DEFAULT_CHAT_TEMPLATE_FILENAME = "chat_template.jinja" + +ALL_MODEL_NAMES = {"PaddleOCR-VL"} + + +def _check_model_name_and_backend(model_name, backend): + if model_name not in ALL_MODEL_NAMES: + raise ValueError(f"Unknown model: {model_name}") + + check_backend(backend) + + +def get_model_dir(model_name, backend): + _check_model_name_and_backend(model_name, backend) + + if backend in ("vllm", "sglang"): + suffix = "_paddle" + else: + suffix = "_torch" + try: + model_dir = official_models[model_name + suffix] + except Exception as e: + raise RuntimeError( + f"Could not prepare the official model for the {repr(model_name)} model with the {repr(backend)} backend." + ) from e + + return model_dir + + +def get_model_components(model_name, backend): + def _get_component(getter_key): + if not hasattr(model_module, getter_key): + raise RuntimeError(f"`{model_module}` does not have `{getter_key}`") + getter = getattr(model_module, getter_key) + comp = getter(backend) + return comp + + _check_model_name_and_backend(model_name, backend) + + mod_name = model_name_to_module_name(model_name) + + try: + model_module = importlib.import_module(f".{mod_name}", package=__package__) + except ModuleNotFoundError as e: + raise ValueError(f"Unknown model: {model_name}") from e + + network_class = _get_component(NETWORK_CLASS_GETTER_KEY) + + if backend == "sglang": + processor_class = _get_component(PROCESSOR_CLASS_GETTER_KEY) + else: + processor_class = None + + return network_class, processor_class + + +def get_default_config(model_name, backend): + _check_model_name_and_backend(model_name, backend) + + mod_name = model_name_to_module_name(model_name) + + try: + config_module = importlib.import_module( + f"..configs.{mod_name}", package=__package__ + ) + except ModuleNotFoundError: + logging.debug("No default configs were found for the model '%s'", model_name) + default_config = {} + else: + if not hasattr(config_module, CONFIG_GETTER_KEY): + raise RuntimeError(f"`{config_module}` does not have `{CONFIG_GETTER_KEY}`") + config_getter = getattr(config_module, CONFIG_GETTER_KEY) + default_config = config_getter(backend) + + return default_config + + +@contextlib.contextmanager +def get_chat_template_path(model_name, backend, model_dir): + _check_model_name_and_backend(model_name, backend) + + with importlib.resources.path( + "paddlex.inference.genai.chat_templates", f"{model_name}.jinja" + ) as chat_template_path: + if not chat_template_path.exists(): + default_chat_template_path = Path(model_dir, DEFAULT_CHAT_TEMPLATE_FILENAME) + if ( + default_chat_template_path.exists() + and default_chat_template_path.is_file() + ): + # 
TODO: Support symbolic links + yield default_chat_template_path + else: + logging.debug( + "No chat template was found for the model '%s' with the backend '%s'", + model_name, + backend, + ) + yield None + else: + yield chat_template_path diff --git a/paddlex/inference/genai/models/paddleocr_vl/__init__.py b/paddlex/inference/genai/models/paddleocr_vl/__init__.py new file mode 100644 index 0000000000..57bc8fd3e9 --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl/__init__.py @@ -0,0 +1,35 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +def get_network_class(backend): + if backend == "vllm": + from ._vllm import PPOCRVLForConditionalGeneration + + return PPOCRVLForConditionalGeneration + elif backend == "sglang": + from ._sglang import PPOCRVLForConditionalGeneration + + return PPOCRVLForConditionalGeneration + else: + raise ValueError(f"Unsupported backend: {backend}") + + +def get_processor_class(backend): + if backend == "sglang": + from ._sglang import PPOCRVLImageProcessor + + return PPOCRVLImageProcessor + else: + raise ValueError(f"Unsupported backend: {backend}") diff --git a/paddlex/inference/genai/models/paddleocr_vl/_sglang/__init__.py b/paddlex/inference/genai/models/paddleocr_vl/_sglang/__init__.py new file mode 100644 index 0000000000..2c51a91d5d --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl/_sglang/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .network import PPOCRVLForConditionalGeneration +from .processor import PPOCRVLImageProcessor diff --git a/paddlex/inference/genai/models/paddleocr_vl/_sglang/network.py b/paddlex/inference/genai/models/paddleocr_vl/_sglang/network.py new file mode 100644 index 0000000000..53e94179b7 --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl/_sglang/network.py @@ -0,0 +1,755 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + + +from collections.abc import Iterable +from typing import List, Optional, Tuple, Union + +import numpy as np + +from ......utils.deps import is_dep_available + +if all( + map(is_dep_available, ("einops", "torch", "transformers", "sglang", "flash-attn")) +): + import torch + import torch.nn as nn + from einops import rearrange + from sglang.srt.layers.activation import get_act_fn + from sglang.srt.layers.attention.vision import VisionAttention + from sglang.srt.layers.linear import ColumnParallelLinear, RowParallelLinear + from sglang.srt.layers.quantization.base_config import QuantizationConfig + from sglang.srt.managers.mm_utils import ( + MultiModalityDataPaddingPatternMultimodalTokens, + general_mm_embed_routine, + ) + from sglang.srt.managers.schedule_batch import MultimodalDataItem, MultimodalInputs + from sglang.srt.model_executor.forward_batch_info import ForwardBatch + from sglang.srt.model_loader.weight_utils import default_weight_loader + from sglang.srt.models.ernie4 import Ernie4_5_ForCausalLM + from transformers.activations import GELUActivation + from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ) + from transformers.utils import torch_int + + class Projector(nn.Module): + + def __init__( + self, + text_config, + vision_config, + prefix: str = "", + ): + super().__init__() + self.text_config = text_config + self.vision_config = vision_config + self.merge_kernel_size = (2, 2) + + self.hidden_size = ( + self.vision_config.hidden_size + * self.merge_kernel_size[0] + * self.merge_kernel_size[1] + ) + + self.pre_norm = torch.nn.LayerNorm( + self.vision_config.hidden_size, eps=1e-05 + ) + self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + self.act = GELUActivation() + self.linear_2 = nn.Linear( + self.hidden_size, self.text_config.hidden_size, bias=True + ) + + def forward( + self, + image_features: torch.Tensor, + image_grid_thw: List[Tuple[int, int, int]], + ) -> torch.Tensor: + m1, m2 = self.merge_kernel_size + if isinstance(image_features, (list, tuple)): + processed_features = list() + for image_feature, image_grid in zip(image_features, image_grid_thw): + image_feature = self.pre_norm(image_feature) + t, h, w = image_grid + + image_feature = rearrange( + image_feature, + "(t h p1 w p2) d -> (t h w) (p1 p2 d)", + t=t, + h=h // m1, + p1=m1, + w=w // m2, + p2=m2, + ) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + processed_features.append(hidden_states) + + return processed_features + + dims = image_features.shape[:-1] + dim = image_features.shape[-1] + image_features = image_features.view(np.prod(dims), dim) + hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states.view(*dims, -1) + + class SiglipVisionEmbeddings(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches = (self.image_size // 
self.patch_size) ** 2 + self.num_positions = self.num_patches + self.cache_position_embedding = dict() + self.cache_position_count = dict() + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.packing_position_embedding = nn.Embedding(32768, self.embed_dim) + + self.register_buffer( + "position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False, + ) + + def interpolate_pos_encoding( + self, + embeddings: torch.Tensor, + height: int, + width: int, + is_after_patchify: bool = False, + ) -> torch.Tensor: + + num_positions = self.position_embedding.weight.shape[0] + + patch_pos_embed = self.position_embedding.weight.unsqueeze(0) + + dim = embeddings.shape[-1] + + if is_after_patchify: + new_height = height + new_width = width + else: + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape( + 1, sqrt_num_positions, sqrt_num_positions, dim + ) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bilinear", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return patch_pos_embed + + def fetch_position_embedding_lfu_cache( + self, embeddings, h, w, max_cache: int = 20 + ): + grid = (h, w) + if grid in self.cache_position_embedding: + self.cache_position_count[grid] += 1 + return self.cache_position_embedding[grid] + + if len(self.cache_position_embedding) >= max_cache: + min_hit_grid = min( + self.cache_position_count, + key=self.cache_position_count.get, + ) + self.cache_position_count.pop(min_hit_grid) + self.cache_position_embedding.pop(min_hit_grid) + + position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True) + self.cache_position_count[grid] = 1 + self.cache_position_embedding[grid] = position_embedding + return position_embedding + + def forward( + self, + pixel_values: torch.FloatTensor, + position_ids: Optional[torch.Tensor] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + interpolate_pos_encoding=False, + ) -> torch.Tensor: + if pixel_values.dim() == 4: + pixel_values = pixel_values.unsqueeze(0) + if pixel_values.dim() == 5: + if position_ids is None: + raise ValueError( + "position_ids cannot be None when pixel_values.dim() is 5." 
+ ) + ( + batch_size, + squence_len, + channel, + height, + width, + ) = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w") + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) + embeddings = patch_embeds.flatten(-2).squeeze(-1) + + if interpolate_pos_encoding and image_grid_thw is not None: + start = 0 + tmp_embeddings = list() + for image_grid in image_grid_thw: + t, h, w = image_grid + end = start + t * h * w + image_embeddings = embeddings[start:end, :] + position_embedding = ( + self.interpolate_pos_encoding(image_embeddings, h, w, True) + .squeeze(0) + .repeat(t, 1) + ) + image_embeddings = image_embeddings + position_embedding + tmp_embeddings.append(image_embeddings) + start = end + embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0) + else: + embeddings = embeddings + self.packing_position_embedding( + position_ids + ) + return embeddings + else: + raise ValueError( + "Unsupported pixel_values dimension:" + f" {pixel_values.dim()}. Expected 4 or 5." + ) + + class SigLIPRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + self.rope_init() + + def rope_init(self): + inv_freq = 1.0 / ( + self.theta + ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange( + seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype, + ) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + class SiglipMLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]: + quantizable = True + else: + quantizable = ( + config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0 + ) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc2", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + class SiglipEncoderLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + attn_implementation: Optional[str] = None, + prefix: str = "", + ): + super().__init__() + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP( + config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + if attn_implementation is None: + softmax_in_single_precision = False + qkv_backend = None + elif attn_implementation == "sdpa": + softmax_in_single_precision = False + qkv_backend = "sdpa" + elif attn_implementation == "flash_attention_2": + softmax_in_single_precision = False + qkv_backend = 
"triton_attn" + elif attn_implementation == "eager": + softmax_in_single_precision = True + qkv_backend = "sdpa" + elif attn_implementation == "flash_attention_3": + softmax_in_single_precision = False + qkv_backend = "fa3" + + self.self_attn = VisionAttention( + embed_dim=self.embed_dim, + num_heads=self.num_heads, + projection_size=self.embed_dim, + use_qkv_parallel=True, + qkv_backend=qkv_backend, + softmax_in_single_precision=softmax_in_single_precision, + flatten_batch=True, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: Optional[List[torch.Tensor]] = None, + rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.FloatTensor]: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states, + cu_seqlens=cu_seqlens, + position_embeddings=rope_emb, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + class SiglipEncoder(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2) + + @staticmethod + def flatten_list(image_grid_thw): + tmp_image_grid_thw = list() + for image_grid in image_grid_thw: + if isinstance(image_grid, list): + tmp_image_grid_thw.extend(image_grid) + else: + tmp_image_grid_thw.append(image_grid) + return tmp_image_grid_thw + + def forward( + self, + inputs_embeds, + cu_seqlens: Optional[List[torch.Tensor]] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + height_position_ids: Optional[torch.Tensor] = None, + width_position_ids: Optional[torch.Tensor] = None, + ) -> BaseModelOutput: + device = inputs_embeds.device + hidden_states = inputs_embeds + flatten_image_grid_thw = self.flatten_list(image_grid_thw) + + if width_position_ids is None or height_position_ids is None: + split_hids = list() + split_wids = list() + for t, h, w in flatten_image_grid_thw: + image_pids = torch.arange(t * h * w, device=device) % (h * w) + sample_hids = image_pids // w + sample_wids = image_pids % w + split_hids.append(sample_hids) + split_wids.append(sample_wids) + width_position_ids = torch.concat(split_wids, dim=0) + height_position_ids = torch.concat(split_hids, dim=0) + + pids = torch.stack( + [height_position_ids, width_position_ids], + dim=-1, + ) + max_grid_size = pids.max() + 1 + rope_emb_max_grid = self.rotary_pos_emb(max_grid_size) + rope_emb = rope_emb_max_grid[pids].flatten(1) + rope_emb = rope_emb.repeat(1, 2) + rope_emb = (rope_emb.cos(), rope_emb.sin()) + + attn_cu_seqlens = cu_seqlens + hidden_states = inputs_embeds + + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + cu_seqlens=attn_cu_seqlens, + rope_emb=rope_emb, + ) + return hidden_states + + class SiglipVisionTransformer(nn.Module): + + def __init__( + self, + config, 
+ quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.encoder", + ) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values, + interpolate_pos_encoding: Optional[bool] = False, + position_ids: Optional[torch.Tensor] = None, + height_position_ids: Optional[torch.Tensor] = None, + width_position_ids: Optional[torch.Tensor] = None, + cu_seqlens: Optional[List[torch.Tensor]] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + ) -> BaseModelOutputWithPooling: + + hidden_states = self.embeddings( + pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + ) + + last_hidden_state = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + image_grid_thw=image_grid_thw, + height_position_ids=height_position_ids, + width_position_ids=width_position_ids, + ) + + last_hidden_state = self.post_layernorm(last_hidden_state) + + sample_hidden_state = list() + if cu_seqlens is None: + raise ValueError( + "cu_seqlens cannot be None for " + "SiglipVisionTransformer output processing." + ) + for i in range(cu_seqlens.shape[0] - 1): + start = cu_seqlens[i] + end = cu_seqlens[i + 1] + tensor = last_hidden_state[:, start:end, :].squeeze(0) + sample_hidden_state.append(tensor) + + return sample_hidden_state + + class SiglipVisionModel(nn.Module): + config_class = "PPOCRVisionConfig" + main_input_name = "pixel_values" + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.vision_model = SiglipVisionTransformer( + config, + quant_config=quant_config, + prefix=f"{prefix}.vision_model", + ) + self.quant_config = quant_config + + @property + def dtype(self) -> torch.dtype: + return self.vision_model.embeddings.patch_embedding.weight.dtype + + @property + def device(self) -> torch.device: + return self.vision_model.embeddings.patch_embedding.weight.device + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + interpolate_pos_encoding: bool = False, + position_ids: Optional[torch.Tensor] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + cu_seqlens: Optional[List[torch.Tensor]] = None, + ) -> BaseModelOutputWithPooling: + + return self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + cu_seqlens=cu_seqlens, + ) + + class PPOCRVLForConditionalGeneration(Ernie4_5_ForCausalLM): + + def __init__(self, *, config, quant_config=None, prefix: str = ""): + super().__init__(config=config, prefix=prefix) + config = self.config + + self.mlp_AR = Projector(config, config.vision_config) + self.visual = SiglipVisionModel(config=config.vision_config) + if not hasattr(self.model, "get_input_embeddings"): + import types + + self.model.get_input_embeddings = types.MethodType( + get_input_embeddings, self.model + ) + self.is_mrope_enabled = "mrope_section" in self.config.rope_scaling + + def pad_input_ids(self, 
input_ids: List[int], mm_inputs: MultimodalInputs): + pattern = MultiModalityDataPaddingPatternMultimodalTokens() + return pattern.pad_input_tokens(input_ids, mm_inputs) + + def get_input_embeddings(self): + return self.model.embed_tokens + + def encode_image(self, pixel_values, image_grid_thw): + pixel_values = pixel_values.type(self.visual.dtype) + siglip_position_ids = list() + image_grid_hws = list() + cu_seqlens = [0] + + for idx, thw in enumerate(image_grid_thw): + thw_tuple = tuple(thw.detach().cpu().numpy().tolist()) + numel = np.prod(thw_tuple) + image_grid_hws.append(thw_tuple) + image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(image_position_ids) + cu_seqlens.append(cu_seqlens[-1] + numel) + + siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to( + pixel_values.device + ) + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( + pixel_values.device + ) + vision_outputs = self.visual( + pixel_values=pixel_values, + image_grid_thw=image_grid_hws, + position_ids=siglip_position_ids, + interpolate_pos_encoding=True, + cu_seqlens=cu_seqlens, + ) + image_embeds = self.mlp_AR(vision_outputs, image_grid_thw) + image_embeds = torch.stack(image_embeds, dim=0) + + return image_embeds + + def get_image_feature(self, items: List[MultimodalDataItem]) -> torch.Tensor: + pixel_values = torch.cat([item.feature for item in items], dim=0).type( + self.visual.dtype + ) + image_grid_thw = torch.concat( + [item.image_grid_thw for item in items], dim=0 + ) + image_embeds = self.encode_image(pixel_values, image_grid_thw) + + return image_embeds + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + forward_batch: ForwardBatch, + get_embedding: bool = False, + ): + if self.is_mrope_enabled: + positions = forward_batch.mrope_positions + if not ( + forward_batch.forward_mode.is_decode() + or not forward_batch.contains_image_inputs() + ): + if self.is_mrope_enabled: + assert positions.ndim == 2 and positions.size(0) == 3, ( + "multimodal section rotary embedding requires " + f"(3, seq_len) positions, but got {positions.size()}" + ) + + hidden_states = general_mm_embed_routine( + input_ids=input_ids, + forward_batch=forward_batch, + language_model=self.model, + multimodal_model=self, + positions=positions, + ) + + return self.logits_processor( + input_ids, hidden_states, self.lm_head, forward_batch + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, weight_name, shard_id) + (".qkv_proj", ".q_proj", "q"), + (".qkv_proj", ".k_proj", "k"), + (".qkv_proj", ".v_proj", "v"), + (".gate_up_proj", ".gate_proj", 0), + (".gate_up_proj", ".up_proj", 1), + ] + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "head.attention" in name or "head.layernorm" in name: + continue + if "head.mlp" in name or "head.probe" in name: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if "visual" in name: + # adapt to VisionAttention + name = name.replace(r"self_attn.qkv.", r"self_attn.qkv_proj.") + name = name.replace(r"self_attn.out_proj.", r"self_attn.proj.") + + try: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + except KeyError: + print(params_dict.keys()) + raise + + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + + # monkey patch + def get_input_embeddings(self) -> nn.Embedding: + return self.embed_tokens diff --git a/paddlex/inference/genai/models/paddleocr_vl/_sglang/processor.py b/paddlex/inference/genai/models/paddleocr_vl/_sglang/processor.py new file mode 100644 index 0000000000..7f08d07e19 --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl/_sglang/processor.py @@ -0,0 +1,226 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Tuple, Union + +from ......utils.deps import is_dep_available + +if is_dep_available("sglang"): + import torch + from sglang.srt.multimodal.processors.base_processor import ( + BaseMultimodalProcessor, + MultimodalSpecialTokens, + ) + + class PPOCRVLImageProcessor(BaseMultimodalProcessor): + + def __init__(self, hf_config, server_args, _processor, *args, **kwargs): + super().__init__(hf_config, server_args, _processor, *args, **kwargs) + + self.vision_start_token_id = hf_config.vision_start_token_id + self.mm_tokens = MultimodalSpecialTokens( + image_token="<|vision_start|><|image_pad|><|vision_end|>", + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + ).build(_processor) + + async def process_mm_data_async( + self, + image_data: List[Union[str, bytes]], + input_text, + request_obj, + *args, + **kwargs, + ): + base_out = self.load_mm_data( + prompt=input_text, + image_data=image_data, + multimodal_tokens=self.mm_tokens, + ) + + mm_items, input_ids, ret = self.process_and_combine_mm_data( + base_out, self.mm_tokens + ) + + input_ids = input_ids.flatten() + mrope_positions, mrope_position_delta = self.get_rope_index( + spatial_merge_size=self.hf_config.vision_config.spatial_merge_size, + image_token_id=self.mm_tokens.image_token_id, + video_token_id=self.mm_tokens.video_token_id, + vision_start_token_id=self.vision_start_token_id, + model_type=self.hf_config.model_type, + tokens_per_second=getattr( + self.hf_config.vision_config, "tokens_per_second", None + ), + input_ids=input_ids.unsqueeze(0), + image_grid_thw=getattr(ret, "image_grid_thw", None), + ) + mrope_positions = mrope_positions.squeeze(1) + + return { + "mm_items": mm_items, + "input_ids": input_ids.tolist(), + "im_token_id": self.mm_tokens.image_token_id, + 
"mrope_positions": mrope_positions, + "mrope_position_delta": mrope_position_delta, + } + + @staticmethod + def get_rope_index( + spatial_merge_size: int, + image_token_id: int, + video_token_id: int, + vision_start_token_id: int, + model_type: str, + tokens_per_second: Optional[int] = None, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + **kwargs, + ) -> Tuple[torch.Tensor, torch.Tensor]: + mrope_position_deltas = [] + if input_ids is not None and ( + image_grid_thw is not None or video_grid_thw is not None + ): + total_input_ids = input_ids + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + for i, input_ids in enumerate(total_input_ids): + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere( + input_ids == vision_start_token_id + ).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + second_per_grid_t = 0 + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + if second_per_grid_ts is not None: + second_per_grid_t = second_per_grid_ts[video_index] + else: + second_per_grid_t = 1.0 + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if len(llm_pos_ids_list) > 0 + else 0 + ) + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + range_tensor = torch.arange(llm_grid_t).view(-1, 1) + expanded_range = range_tensor.expand( + -1, llm_grid_h * llm_grid_w + ) + + time_tensor = ( + expanded_range * second_per_grid_t * tokens_per_second + ) + + time_tensor_long = time_tensor.long() + t_index = time_tensor_long.flatten() + + h_index = ( + torch.arange(llm_grid_h) + .view(1, -1, 1) + .expand(llm_grid_t, -1, llm_grid_w) + .flatten() + ) + w_index = ( + torch.arange(llm_grid_w) + .view(1, 1, -1) + .expand(llm_grid_t, llm_grid_h, -1) + .flatten() + ) + llm_pos_ids_list.append( + torch.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if len(llm_pos_ids_list) > 0 + else 0 + ) + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx + ) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + 
position_ids[..., i, :] = llm_positions.to(position_ids.device) + mrope_position_deltas.append( + llm_positions.max() + 1 - len(total_input_ids[i]) + ) + mrope_position_deltas = torch.tensor( + mrope_position_deltas, device=input_ids.device + ).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + s = input_ids.shape[1] + position_ids = torch.arange(s) + position_ids = ( + position_ids.unsqueeze(0).expand(3, -1, -1).to(input_ids.device) + ) + max_position_ids = position_ids.max(0, keepdim=False)[0].max( + -1, keepdim=True + )[0] + mrope_position_deltas = max_position_ids + 1 - s + return position_ids, mrope_position_deltas diff --git a/paddlex/inference/genai/models/paddleocr_vl/_vllm.py b/paddlex/inference/genai/models/paddleocr_vl/_vllm.py new file mode 100644 index 0000000000..9c54ae740c --- /dev/null +++ b/paddlex/inference/genai/models/paddleocr_vl/_vllm.py @@ -0,0 +1,1210 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections.abc import Iterable, Mapping, Sequence +from functools import partial +from typing import List, Optional, Tuple, Union + +import numpy as np + +from .....utils.deps import is_dep_available + +if all( + map(is_dep_available, ("einops", "torch", "transformers", "vllm", "flash-attn")) +): + import torch + import torch.nn as nn + from einops import rearrange, repeat + from transformers import BatchFeature + from transformers.activations import GELUActivation + from transformers.modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + ) + from transformers.utils import torch_int + from vllm.compilation.decorators import support_torch_compile + from vllm.config import VllmConfig + from vllm.distributed import get_tensor_model_parallel_world_size + from vllm.model_executor.layers.activation import get_act_fn + from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, + ) + from vllm.model_executor.layers.logits_processor import LogitsProcessor + from vllm.model_executor.layers.quantization import QuantizationConfig + from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead + from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, + ) + from vllm.model_executor.models.vision import get_vit_attn_backend + from vllm.platforms import _Backend, current_platform + + try: + from vllm.model_executor.models.ernie45 import Ernie4_5_ForCausalLM + except ImportError: + from vllm.model_executor.models.ernie45 import ( + Ernie4_5ForCausalLM as Ernie4_5_ForCausalLM, + ) + from vllm.model_executor.models.interfaces import SupportsMultiModal + from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + PPMissingLayer, + is_pp_missing_parameter, + merge_multimodal_embeddings, + ) + from vllm.multimodal import MULTIMODAL_REGISTRY + from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + 
MultiModalKwargs, + NestedTensors, + ) + from vllm.multimodal.parse import ( + ImageProcessorItems, + ImageSize, + MultiModalDataItems, + ) + from vllm.multimodal.processing import ( + BaseMultiModalProcessor, + BaseProcessingInfo, + PromptReplacement, + PromptUpdate, + ) + from vllm.multimodal.profiling import BaseDummyInputsBuilder + from vllm.sequence import IntermediateTensors + + def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, + ): + """Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + + 3. The aspect ratio of the image is maintained as closely as possible. + + """ + # if height < factor or width < factor: + # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") + # if int(height < factor//4) + int(width < factor//4): + # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}") + + if height < factor: + print( + f"smart_resize: height={height} < factor={factor}, reset height=factor" + ) + width = round((width * factor) / height) + height = factor + + if width < factor: + print(f"smart_resize: width={width} < factor={factor}, reset width=factor") + height = round((height * factor) / width) + width = factor + + if max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = math.floor(height / beta / factor) * factor + w_bar = math.floor(width / beta / factor) * factor + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + return h_bar, w_bar + + class PPOCRVLProcessingInfo(BaseProcessingInfo): + + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs: object): + return self.ctx.get_hf_processor(**kwargs) + + def get_image_processor(self, **kwargs: object): + return self.get_hf_processor(**kwargs).image_processor + + def get_supported_mm_limits(self): + return {"image": None} + + def get_num_image_tokens( + self, + *, + image_width: int, + image_height: int, + image_processor, + ) -> int: + if image_processor is None: + image_processor = self.get_image_processor() + + do_resize = True + hf_config = self.get_hf_config() + vision_config = hf_config.vision_config + patch_size = vision_config.patch_size + merge_size = vision_config.spatial_merge_size + + if do_resize: + resized_height, resized_width = smart_resize( + height=image_height, + width=image_width, + factor=patch_size * merge_size, + min_pixels=image_processor.min_pixels, + max_pixels=image_processor.max_pixels, + ) + preprocessed_size = ImageSize( + width=resized_width, height=resized_height + ) + else: + preprocessed_size = ImageSize(width=image_width, height=image_height) + + grid_t = 1 + grid_h = preprocessed_size.height // patch_size + grid_w = preprocessed_size.width // patch_size + + num_patches = grid_t * grid_h * grid_w + num_image_tokens = num_patches // (merge_size**2) + + return num_image_tokens + + def 
get_image_size_with_most_features(self) -> ImageSize: + hf_config = self.get_hf_config() + image_size = hf_config.vision_config.image_size + return ImageSize(height=image_size, width=image_size) + + class PPOCRVLDummyInputsBuilder(BaseDummyInputsBuilder[PPOCRVLProcessingInfo]): + + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> str: + num_images = mm_counts.get("image", 0) + + processor = self.info.get_hf_processor() + image_token = processor.image_token + + return image_token * num_images + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + (target_width, target_height) = ( + self.info.get_image_size_with_most_features() + ) + + return { + "image": self._get_dummy_images( + width=target_width, height=target_height, num_images=num_images + ) + } + + class PPOCRVLMultiModalProcessor(BaseMultiModalProcessor[PPOCRVLProcessingInfo]): + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) + processed_outputs["pixel_values"] = processed_outputs[ + "pixel_values" + ].unsqueeze(0) + else: + tokenizer = self.info.get_tokenizer() + processed_outputs = tokenizer( + prompt, add_special_tokens=True, return_tensors="pt" + ) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict( + pixel_values=MultiModalFieldConfig.batched("image"), + image_grid_thw=MultiModalFieldConfig.batched("image"), + ) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargs, + ) -> Sequence[PromptUpdate]: + image_processor = self.info.get_image_processor(**hf_processor_mm_kwargs) + hf_config = self.info.get_hf_config() + image_token_id = hf_config.image_token_id + + def get_replacement(item_idx: int, image_processor): + images = mm_items.get_items("image", ImageProcessorItems) + + image_size = images.get_image_size(item_idx) + num_image_tokens = self.info.get_num_image_tokens( + image_width=image_size.width, + image_height=image_size.height, + image_processor=image_processor, + ) + + return [image_token_id] * num_image_tokens + + return [ + PromptReplacement( + modality="image", + target=[image_token_id], + replacement=partial( + get_replacement, image_processor=image_processor + ), + ), + ] + + class Projector(nn.Module): + + def __init__( + self, + text_config, + vision_config, + prefix: str = "", + ): + super().__init__() + self.text_config = text_config + self.vision_config = vision_config + self.merge_kernel_size = (2, 2) + + self.hidden_size = ( + self.vision_config.hidden_size + * self.merge_kernel_size[0] + * self.merge_kernel_size[1] + ) + + self.pre_norm = torch.nn.LayerNorm( + self.vision_config.hidden_size, eps=1e-05 + ) + self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) + self.act = GELUActivation() + self.linear_2 = nn.Linear( + self.hidden_size, self.text_config.hidden_size, bias=True + ) + + def forward( + self, + image_features: torch.Tensor, + image_grid_thw: List[Tuple[int, int, int]], + ) -> torch.Tensor: + m1, m2 = self.merge_kernel_size + 
if isinstance(image_features, (list, tuple)): + processed_features = list() + for image_feature, image_grid in zip(image_features, image_grid_thw): + image_feature = self.pre_norm(image_feature) + t, h, w = image_grid + + image_feature = rearrange( + image_feature, + "(t h p1 w p2) d -> (t h w) (p1 p2 d)", + t=t, + h=h // m1, + p1=m1, + w=w // m2, + p2=m2, + ) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + processed_features.append(hidden_states) + + return processed_features + + dims = image_features.shape[:-1] + dim = image_features.shape[-1] + image_features = image_features.view(np.prod(dims), dim) + hidden_states = self.pre_norm(image_features).view(-1, self.hidden_size) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + + return hidden_states.view(*dims, -1) + + class SiglipVisionEmbeddings(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.cache_position_embedding = dict() + self.cache_position_count = dict() + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.packing_position_embedding = nn.Embedding(32768, self.embed_dim) + + self.register_buffer( + "position_ids", + torch.arange(self.num_positions).expand((1, -1)), + persistent=False, + ) + + def interpolate_pos_encoding( + self, + embeddings: torch.Tensor, + height: int, + width: int, + is_after_patchify: bool = False, + ) -> torch.Tensor: + + num_positions = self.position_embedding.weight.shape[0] + + patch_pos_embed = self.position_embedding.weight.unsqueeze(0) + + dim = embeddings.shape[-1] + + if is_after_patchify: + new_height = height + new_width = width + else: + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape( + 1, sqrt_num_positions, sqrt_num_positions, dim + ) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bilinear", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + return patch_pos_embed + + def fetch_position_embedding_lfu_cache( + self, embeddings, h, w, max_cache: int = 20 + ): + grid = (h, w) + if grid in self.cache_position_embedding: + self.cache_position_count[grid] += 1 + return self.cache_position_embedding[grid] + + if len(self.cache_position_embedding) >= max_cache: + min_hit_grid = min( + self.cache_position_count, + key=self.cache_position_count.get, + ) + self.cache_position_count.pop(min_hit_grid) + self.cache_position_embedding.pop(min_hit_grid) + + position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True) + self.cache_position_count[grid] = 1 + self.cache_position_embedding[grid] = position_embedding + return position_embedding + + def forward( + self, + pixel_values: torch.FloatTensor, + position_ids: Optional[torch.Tensor] = None, + 
image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + interpolate_pos_encoding=False, + ) -> torch.Tensor: + if pixel_values.dim() == 4: + pixel_values = pixel_values.unsqueeze(0) + if pixel_values.dim() == 5: + if position_ids is None: + raise ValueError( + "position_ids cannot be None when pixel_values.dim() is 5." + ) + ( + batch_size, + squence_len, + channel, + height, + width, + ) = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w") + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) + embeddings = patch_embeds.flatten(-2).squeeze(-1) + + if interpolate_pos_encoding and image_grid_thw is not None: + start = 0 + tmp_embeddings = list() + for image_grid in image_grid_thw: + t, h, w = image_grid + end = start + t * h * w + image_embeddings = embeddings[start:end, :] + position_embedding = ( + self.interpolate_pos_encoding(image_embeddings, h, w, True) + .squeeze(0) + .repeat(t, 1) + ) + image_embeddings = image_embeddings + position_embedding + tmp_embeddings.append(image_embeddings) + start = end + embeddings = torch.concat(tmp_embeddings, dim=0).unsqueeze(0) + else: + embeddings = embeddings + self.packing_position_embedding( + position_ids + ) + return embeddings + else: + raise ValueError( + "Unsupported pixel_values dimension:" + f" {pixel_values.dim()}. Expected 4 or 5." + ) + + def rotate_half(x: torch.Tensor, interleaved: bool = False) -> torch.Tensor: + if not interleaved: + x1, x2 = x.chunk(2, dim=-1) + return torch.cat((-x2, x1), dim=-1) + else: + x1, x2 = x[..., ::2], x[..., 1::2] + return rearrange( + torch.stack((-x2, x1), dim=-1), "... d two -> ... (d two)", two=2 + ) + + def apply_rotary_emb_torch( + x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False + ) -> torch.Tensor: + """ + x: (batch_size, seqlen, nheads, headdim) + cos, sin: (seqlen, rotary_dim / 2) or (batch_size, seqlen, rotary_dim / 2) + """ + ro_dim = cos.shape[-1] * 2 + assert ro_dim <= x.shape[-1] + cos = repeat( + cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)" + ) + sin = repeat( + sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 
1 (d 2)" + ) + return torch.cat( + [ + x[..., :ro_dim] * cos + rotate_half(x[..., :ro_dim], interleaved) * sin, + x[..., ro_dim:], + ], + dim=-1, + ) + + def apply_rotary_pos_emb_flashatt( + q: torch.Tensor, + k: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + cos = cos.chunk(2, dim=-1)[0].contiguous() + sin = sin.chunk(2, dim=-1)[0].contiguous() + + apply_rotary_emb = apply_rotary_emb_torch + if current_platform.is_cuda(): + from vllm.vllm_flash_attn.layers.rotary import apply_rotary_emb + + q_embed = apply_rotary_emb(q.float(), cos.float(), sin.float()).type_as(q) + k_embed = apply_rotary_emb(k.float(), cos.float(), sin.float()).type_as(k) + return q_embed, k_embed + + class SiglipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You + Need' paper.""" + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + + hidden_size = config.hidden_size + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_attention_heads + if self.total_num_kv_heads >= tp_size: + assert self.total_num_kv_heads % tp_size == 0 + else: + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.hidden_size // self.total_num_heads + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scale = self.head_dim**-0.5 + + self.qkv_proj = QKVParallelLinear( + hidden_size, + self.head_dim, + self.total_num_heads, + self.total_num_kv_heads, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + self.out_proj = RowParallelLinear( + input_size=hidden_size, + output_size=hidden_size, + quant_config=quant_config, + prefix=f"{prefix}.out_proj", + ) + + # Detect attention implementation. + self.attn_backend: _Backend = get_vit_attn_backend(support_fa=True) + if self.attn_backend not in { + _Backend.FLASH_ATTN, + _Backend.TORCH_SDPA, + _Backend.XFORMERS, + }: + raise RuntimeError( + f"PaddleOCR-VL does not support {self.attn_backend} backend now." + ) + + def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: Optional[List[torch.Tensor]] = None, + rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> torch.Tensor: + batch_size, seq_length, embed_dim = hidden_states.shape + + qkv_states, _ = self.qkv_proj(hidden_states) + q, k, v = qkv_states.chunk(3, dim=-1) + + q = q.view(batch_size, seq_length, self.num_heads, self.head_dim) + k = k.view(batch_size, seq_length, self.num_heads, self.head_dim) + v = v.view(batch_size, seq_length, self.num_heads, self.head_dim) + + if rope_emb is not None: + cos, sin = rope_emb + q, k = apply_rotary_pos_emb_flashatt(q, k, cos, sin) + + if self.attn_backend == _Backend.FLASH_ATTN: + from flash_attn import flash_attn_varlen_func + + q, k, v = (rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]) + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() + output = flash_attn_varlen_func( + q, + k, + v, + cu_seqlens_q=cu_seqlens, + cu_seqlens_k=cu_seqlens, + max_seqlen_q=max_seqlen, + max_seqlen_k=max_seqlen, + ) + + context_layer = rearrange(output, "(b s) ... 
-> b s ...", b=batch_size) + elif self.attn_backend == _Backend.TORCH_SDPA: + # Execute attention entry by entry for speed & less VRAM. + import torch.nn.functional as F + + outputs = [] + for i in range(1, len(cu_seqlens)): + start_idx = cu_seqlens[i - 1] + end_idx = cu_seqlens[i] + q_i = q[:, start_idx:end_idx] + k_i = k[:, start_idx:end_idx] + v_i = v[:, start_idx:end_idx] + q_i, k_i, v_i = ( + rearrange(x, "b s h d -> b h s d") for x in [q_i, k_i, v_i] + ) + output_i = F.scaled_dot_product_attention( + q_i, k_i, v_i, dropout_p=0.0 + ) + output_i = rearrange(output_i, "b h s d -> b s h d ") + outputs.append(output_i) + context_layer = torch.cat(outputs, dim=1) + elif self.attn_backend == _Backend.XFORMERS: + from xformers import ops as xops + from xformers.ops.fmha.attn_bias import BlockDiagonalMask + + seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() + attn_bias = BlockDiagonalMask.from_seqlens( + q_seqlen=seqlens, kv_seqlen=None, device=q.device + ) + + context_layer = xops.memory_efficient_attention_forward( + q, k, v, attn_bias=attn_bias, p=0, scale=None + ) + + context_layer = rearrange( + context_layer, "b s h d -> b s (h d)" + ).contiguous() + + output, _ = self.out_proj(context_layer) + return output + + class SigLIPRotaryEmbedding(nn.Module): + + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + self.rope_init() + + def rope_init(self): + inv_freq = 1.0 / ( + self.theta + ** (torch.arange(0, self.dim, 2, dtype=torch.float) / self.dim) + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) + + def forward(self, seqlen: int) -> torch.Tensor: + seq = torch.arange( + seqlen, + device=self.inv_freq.device, + dtype=self.inv_freq.dtype, + ) + freqs = torch.outer(seq, self.inv_freq) + return freqs + + class SiglipMLP(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ) -> None: + super().__init__() + + self.config = config + self.activation_fn = get_act_fn(config.hidden_act) + # Special handling for BNB and torchao quantization + if quant_config and quant_config.get_name() in ["bitsandbytes", "torchao"]: + quantizable = True + else: + # For other quantization, we require the hidden size to be a + # multiple of 64 + quantizable = ( + config.hidden_size % 64 == 0 and config.intermediate_size % 64 == 0 + ) + self.fc1 = ColumnParallelLinear( + config.hidden_size, + config.intermediate_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + config.intermediate_size, + config.hidden_size, + quant_config=quant_config if quantizable else None, + prefix=f"{prefix}.fc2", + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states, _ = self.fc2(hidden_states) + return hidden_states + + class SiglipEncoderLayer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.embed_dim = config.hidden_size + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.self_attn = SiglipAttention( + config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SiglipMLP( + config, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + + 
def forward( + self, + hidden_states: torch.Tensor, + cu_seqlens: Optional[List[torch.Tensor]] = None, + rope_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, + ) -> Tuple[torch.FloatTensor]: + + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states = self.self_attn( + hidden_states=hidden_states, + cu_seqlens=cu_seqlens, + rope_emb=rope_emb, + ) + + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + class SiglipEncoder(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.layers = nn.ModuleList( + [ + SiglipEncoderLayer( + config, + quant_config=quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + ) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2) + + @staticmethod + def flatten_list(image_grid_thw): + tmp_image_grid_thw = list() + for image_grid in image_grid_thw: + if isinstance(image_grid, list): + tmp_image_grid_thw.extend(image_grid) + else: + tmp_image_grid_thw.append(image_grid) + return tmp_image_grid_thw + + def forward( + self, + inputs_embeds, + cu_seqlens: Optional[List[torch.Tensor]] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + height_position_ids: Optional[torch.Tensor] = None, + width_position_ids: Optional[torch.Tensor] = None, + ) -> BaseModelOutput: + device = inputs_embeds.device + hidden_states = inputs_embeds + + flatten_image_grid_thw = self.flatten_list(image_grid_thw) + + if width_position_ids is None or height_position_ids is None: + split_hids = list() + split_wids = list() + for t, h, w in flatten_image_grid_thw: + image_pids = torch.arange(t * h * w, device=device) % (h * w) + sample_hids = image_pids // w + sample_wids = image_pids % w + split_hids.append(sample_hids) + split_wids.append(sample_wids) + width_position_ids = torch.concat(split_wids, dim=0) + height_position_ids = torch.concat(split_hids, dim=0) + + pids = torch.stack( + [height_position_ids, width_position_ids], + dim=-1, + ) + max_grid_size = pids.max() + 1 + rope_emb_max_grid = self.rotary_pos_emb(max_grid_size) + rope_emb = rope_emb_max_grid[pids].flatten(1) + rope_emb = rope_emb.repeat(1, 2) + rope_emb = (rope_emb.cos(), rope_emb.sin()) + + attn_cu_seqlens = cu_seqlens + hidden_states = inputs_embeds + + for encoder_layer in self.layers: + hidden_states = encoder_layer( + hidden_states, + cu_seqlens=attn_cu_seqlens, + rope_emb=rope_emb, + ) + return hidden_states + + class SiglipVisionTransformer(nn.Module): + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder( + config, + quant_config=quant_config, + prefix=f"{prefix}.encoder", + ) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + pixel_values, + interpolate_pos_encoding: Optional[bool] = False, + position_ids: Optional[torch.Tensor] = None, + 
height_position_ids: Optional[torch.Tensor] = None, + width_position_ids: Optional[torch.Tensor] = None, + cu_seqlens: Optional[List[torch.Tensor]] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + ) -> BaseModelOutputWithPooling: + + hidden_states = self.embeddings( + pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + ) + + last_hidden_state = self.encoder( + inputs_embeds=hidden_states, + cu_seqlens=cu_seqlens, + image_grid_thw=image_grid_thw, + height_position_ids=height_position_ids, + width_position_ids=width_position_ids, + ) + + last_hidden_state = self.post_layernorm(last_hidden_state) + + sample_hidden_state = list() + if cu_seqlens is None: + raise ValueError( + "cu_seqlens cannot be None for " + "SiglipVisionTransformer output processing." + ) + for i in range(cu_seqlens.shape[0] - 1): + start = cu_seqlens[i] + end = cu_seqlens[i + 1] + tensor = last_hidden_state[:, start:end, :].squeeze(0) + sample_hidden_state.append(tensor) + + return sample_hidden_state + + class SiglipVisionModel(nn.Module): + config_class = "PPOCRVisionConfig" + main_input_name = "pixel_values" + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", + ): + super().__init__() + + self.vision_model = SiglipVisionTransformer( + config, + quant_config=quant_config, + prefix=f"{prefix}.vision_model", + ) + self.quant_config = quant_config + + @property + def dtype(self) -> torch.dtype: + return self.vision_model.embeddings.patch_embedding.weight.dtype + + @property + def device(self) -> torch.device: + return self.vision_model.embeddings.patch_embedding.weight.device + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + interpolate_pos_encoding: bool = False, + position_ids: Optional[torch.Tensor] = None, + image_grid_thw: Optional[ + List[ + Union[ + Tuple[int, int, int], + List[Tuple[int, int, int]], + ] + ] + ] = None, + cu_seqlens: Optional[List[torch.Tensor]] = None, + ) -> BaseModelOutputWithPooling: + + return self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + cu_seqlens=cu_seqlens, + ) + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ] + params_dict = dict(self.named_parameters(remove_duplicate=False)) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + if "head.attention" in name or "head.layernorm" in name: + continue + if "head.mlp" in name or "head.probe" in name: + continue + if self.quant_config is not None and ( + scale_name := self.quant_config.get_cache_scale(name) + ): + param = params_dict[scale_name] + weight_loader = getattr( + param, + "weight_loader", + default_weight_loader, + ) + loaded_weight = ( + loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] + ) + weight_loader(param, loaded_weight) + loaded_params.add(scale_name) + continue + for ( + param_name, + weight_name, + shard_id, + ) in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + if name.endswith(".bias") and name not 
in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + if name.endswith(".bias") and name not in params_dict: + continue + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + if is_pp_missing_parameter(name, self): + continue + param = params_dict[name] + weight_loader = getattr( + param, + "weight_loader", + default_weight_loader, + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + @MULTIMODAL_REGISTRY.register_processor( + PPOCRVLMultiModalProcessor, + info=PPOCRVLProcessingInfo, + dummy_inputs=PPOCRVLDummyInputsBuilder, + ) + @support_torch_compile( + # set dynamic_arg_dims to support mrope + dynamic_arg_dims={ + "input_ids": 0, + "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } + ) + class PPOCRVLForConditionalGeneration(Ernie4_5_ForCausalLM, SupportsMultiModal): + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + config = self.config + + self.mlp_AR = Projector(config, config.vision_config) + self.visual = SiglipVisionModel(config=config.vision_config) + self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size) + self.logits_processor = LogitsProcessor(config.vocab_size) + + for layer in self.model.layers: + if not isinstance(layer, PPMissingLayer): + layer.self_attn.rotary_emb.is_neox_style = True + + def compute_logits( + self, + hidden_states: torch.Tensor, + sampling_metadata, + ) -> Optional[torch.Tensor]: + logits = self.logits_processor( + self.lm_head, hidden_states, sampling_metadata + ) + return logits + + @property + def language_model(self): + return self.model + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: Optional[IntermediateTensors] = None, + inputs_embeds: Optional[torch.Tensor] = None, + **kwargs, + ): + if intermediate_tensors is not None: + inputs_embeds = None + + elif inputs_embeds is None: + vision_embeddings = self.get_multimodal_embeddings(**kwargs) + inputs_embeds = self.get_input_embeddings(input_ids, vision_embeddings) + input_ids = None + + return self.language_model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> Optional[str]: + if modality.startswith("image"): + return "<|vision_start|><|image_pad|><|vision_end|>" + + raise ValueError("Only image modality is supported") + + def encode_image(self, pixel_values, image_grid_thw): + pixel_values = pixel_values.type(self.visual.dtype) + siglip_position_ids = list() + image_grid_hws = list() + cu_seqlens = [0] + + for idx, thw in enumerate(image_grid_thw): + thw_tuple = tuple(thw.detach().cpu().numpy().tolist()) + numel = np.prod(thw_tuple) + image_grid_hws.append(thw_tuple) + image_position_ids = torch.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(image_position_ids) + cu_seqlens.append(cu_seqlens[-1] + numel) + + siglip_position_ids = torch.concat(siglip_position_ids, dim=0).to( + pixel_values.device + ) + cu_seqlens = torch.tensor(cu_seqlens, dtype=torch.int32).to( + pixel_values.device + ) + + vision_outputs = self.visual( + pixel_values=pixel_values, + image_grid_thw=image_grid_hws, + position_ids=siglip_position_ids, + interpolate_pos_encoding=True, + cu_seqlens=cu_seqlens, + ) + image_embeds = 
self.mlp_AR(vision_outputs, image_grid_thw) + + return image_embeds + + def get_multimodal_embeddings(self, **kwargs): + pixel_values = kwargs["pixel_values"] + image_grid_thw = kwargs["image_grid_thw"] + + multimodal_embeddings = [] + for pv, ig in zip(pixel_values, image_grid_thw): + if pv is not None: + image_embeds = self.encode_image(pv, ig) + multimodal_embeddings += image_embeds + + return multimodal_embeddings + + def get_input_embeddings( + self, + input_ids: torch.Tensor, + multimodal_embeddings: Optional[NestedTensors] = None, + ) -> torch.Tensor: + inputs_embeds = self.language_model.get_input_embeddings(input_ids) + + if multimodal_embeddings is not None and len(multimodal_embeddings) != 0: + inputs_embeds = merge_multimodal_embeddings( + input_ids, + inputs_embeds, + multimodal_embeddings, + self.config.image_token_id, + ) + + return inputs_embeds + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]) -> set[str]: + + loader = AutoWeightsLoader(self) + autoloaded_weights = loader.load_weights(weights) + return autoloaded_weights diff --git a/paddlex/inference/genai/server.py b/paddlex/inference/genai/server.py new file mode 100644 index 0000000000..f7b17d5ddc --- /dev/null +++ b/paddlex/inference/genai/server.py @@ -0,0 +1,114 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import sys + +from ...utils import logging +from ...utils.deps import is_genai_engine_plugin_available +from .configs.utils import load_backend_config, update_backend_config +from .constants import DEFAULT_BACKEND, SUPPORTED_BACKENDS +from .models import get_chat_template_path, get_default_config, get_model_dir + + +def get_arg_parser(): + parser = argparse.ArgumentParser("PaddleX generative AI server.") + parser.add_argument("--model_name", type=str, required=True) + parser.add_argument("--model_dir", type=str) + parser.add_argument("--host", type=str, default="localhost") + parser.add_argument("--port", type=int, default=8000) + parser.add_argument( + "--backend", type=str, choices=SUPPORTED_BACKENDS, default=DEFAULT_BACKEND + ) + parser.add_argument( + "--backend_config", type=str, help="Path to the backend configuration file." + ) + return parser + + +def run_genai_server(args=None): + parser = get_arg_parser() + args = parser.parse_args(args=args) + + plugin_name = f"{args.backend}-server" + if not is_genai_engine_plugin_available(plugin_name): + logging.error( + f"The '{plugin_name}' plugin is not available. Please install it first." 
+ ) + sys.exit(1) + + if args.backend == "fastdeploy": + from .backends.fastdeploy import run_fastdeploy_server + + run_server_func = run_fastdeploy_server + elif args.backend == "vllm": + from .backends.vllm import run_vllm_server + + run_server_func = run_vllm_server + elif args.backend == "sglang": + from .backends.sglang import run_sglang_server + + run_server_func = run_sglang_server + else: + raise AssertionError + + if args.model_dir: + model_dir = args.model_dir + else: + try: + model_dir = get_model_dir(args.model_name, args.backend) + except Exception: + logging.error("Failed to get model directory", exc_info=True) + sys.exit(1) + + if args.backend_config: + try: + backend_config = load_backend_config(args.backend_config) + except Exception: + logging.error( + f"Failed to load backend configuration from file: {args.backend_config}", + exc_info=True, + ) + sys.exit(1) + else: + backend_config = {} + + try: + default_config = get_default_config(args.model_name, args.backend) + except Exception: + logging.error( + f"Failed to get default configuration for the model", exc_info=True + ) + sys.exit(1) + update_backend_config( + default_config, + backend_config, + ) + backend_config = default_config + + with get_chat_template_path( + args.model_name, args.backend, model_dir + ) as chat_template_path: + run_server_func( + args.host, + args.port, + args.model_name, + model_dir, + backend_config, + chat_template_path, + ) + + +if __name__ == "__main__": + run_genai_server() diff --git a/paddlex/inference/genai/utils.py b/paddlex/inference/genai/utils.py new file mode 100644 index 0000000000..024519cbef --- /dev/null +++ b/paddlex/inference/genai/utils.py @@ -0,0 +1,27 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
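+
+# Small helpers shared by the generative AI plugin: converting model names into
+# importable module names and validating the user-selected serving backend.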
+ +from .constants import SUPPORTED_BACKENDS + + +def model_name_to_module_name(model_name): + mod_name = model_name.lower().replace("-", "_") + if mod_name[0].isdigit(): + return "m_" + mod_name + return mod_name + + +def check_backend(backend): + if backend not in SUPPORTED_BACKENDS: + raise ValueError(f"{repr(backend)} is not a supported backend.") diff --git a/paddlex/inference/models/__init__.py b/paddlex/inference/models/__init__.py index cc26d1de0d..40c5818aea 100644 --- a/paddlex/inference/models/__init__.py +++ b/paddlex/inference/models/__init__.py @@ -25,6 +25,7 @@ # from .general_recognition import ShiTuRecPredictor from .anomaly_detection import UadPredictor from .base import BasePredictor +from .common.genai import GenAIConfig, need_local_model from .doc_vlm import DocVLMPredictor from .face_feature import FaceFeaturePredictor from .formula_recognition import FormulaRecPredictor @@ -55,25 +56,34 @@ def create_predictor( model_name: str, model_dir: Optional[str] = None, - device=None, + device: Optional[str] = None, pp_option=None, use_hpip: bool = False, hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None, + genai_config: Optional[Union[Dict[str, Any], GenAIConfig]] = None, *args, **kwargs, ) -> BasePredictor: - if model_dir is None: + # TODO: Check if the model is a genai model + if genai_config is not None: + genai_config = GenAIConfig.model_validate(genai_config) + + if need_local_model(genai_config): + if model_dir is None: + assert ( + model_name in official_models + ), f"The model ({model_name}) is not supported! Please using directory of local model files or model name supported by PaddleX!" + model_dir = official_models[model_name] + else: + assert Path(model_dir).exists(), f"{model_dir} is not exists!" + model_dir = Path(model_dir) + config = BasePredictor.load_config(model_dir) assert ( - model_name in official_models - ), f"The model ({model_name}) is not supported! Please using directory of local model files or model name supported by PaddleX!" - model_dir = official_models[model_name] + model_name == config["Global"]["model_name"] + ), f"Model name mismatch,please input the correct model dir." else: - assert Path(model_dir).exists(), f"{model_dir} is not exists!" - model_dir = Path(model_dir) - config = BasePredictor.load_config(model_dir) - assert ( - model_name == config["Global"]["model_name"] - ), f"Model name mismatch,please input the correct model dir." + config = None + return BasePredictor.get(model_name)( model_dir=model_dir, config=config, @@ -81,6 +91,8 @@ def create_predictor( pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config, + genai_config=genai_config, + model_name=model_name, *args, **kwargs, ) diff --git a/paddlex/inference/models/base/predictor/base_predictor.py b/paddlex/inference/models/base/predictor/base_predictor.py index cda676269b..0aac2e411c 100644 --- a/paddlex/inference/models/base/predictor/base_predictor.py +++ b/paddlex/inference/models/base/predictor/base_predictor.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+import warnings from abc import ABC, abstractmethod from copy import deepcopy from pathlib import Path @@ -34,8 +35,10 @@ from ....utils.benchmark import ENTRY_POINT_NAME, benchmark from ....utils.hpi import HPIConfig, HPIInfo from ....utils.io import YAMLReader +from ....utils.model_paths import get_model_paths from ....utils.pp_option import PaddlePredictorOption from ...common import HPInfer, PaddleInfer +from ...common.genai import GenAIClient, GenAIConfig, need_local_model class PredictionWrap: @@ -79,7 +82,7 @@ class BasePredictor( def __init__( self, - model_dir: str, + model_dir: Optional[str] = None, config: Optional[Dict[str, Any]] = None, *, device: Optional[str] = None, @@ -87,11 +90,14 @@ def __init__( pp_option: Optional[PaddlePredictorOption] = None, use_hpip: bool = False, hpi_config: Optional[Union[Dict[str, Any], HPIConfig]] = None, + genai_config: Optional[GenAIConfig] = None, + model_name: Optional[str] = None, ) -> None: """Initializes the BasePredictor. Args: - model_dir (str): The directory where the model files are stored. + model_dir (Optional[str], optional): The directory where the model + files are stored. config (Optional[Dict[str, Any]], optional): The model configuration dictionary. Defaults to None. device (Optional[str], optional): The device to run the inference @@ -105,11 +111,42 @@ def __init__( hpi_config (Optional[Union[Dict[str, Any], HPIConfig]], optional): The high-performance inference configuration dictionary. Defaults to None. + genai_config (Optional[GenAIConfig]], optional): The generative AI + configuration. Defaults to None. + model_name (Optional[str], optional): Optional model name. + Defaults to None. """ super().__init__() - self.model_dir = Path(model_dir) - self.config = config if config else self.load_config(self.model_dir) + if need_local_model(genai_config): + if model_dir is None: + raise ValueError( + "`model_dir` should not be `None`, as a local model is needed." 
+ ) + self.model_dir = Path(model_dir) + self.config = config if config else self.load_config(self.model_dir) + self._use_local_model = True + else: + if model_dir is not None: + warnings.warn("`model_dir` will be ignored, as it is not needed.") + self.model_dir = None + self.config = config + self._genai_config = genai_config + assert genai_config.server_url is not None + self._genai_client = GenAIClient( + backend=genai_config.backend, + base_url=genai_config.server_url, + model_name=model_name, + **(genai_config.client_kwargs or {}), + ) + self._use_local_model = False + + if model_name: + if self.config: + if self.config["Global"]["model_name"] != model_name: + raise ValueError("`model_name` is not consistent with `config`") + self._model_name = model_name + self.batch_sampler = self._build_batch_sampler() self.result_class = self._get_result_class() @@ -117,12 +154,16 @@ def __init__( self.predict = self.__call__ self.batch_sampler.batch_size = batch_size - self._use_hpip = use_hpip - if not use_hpip: - self._pp_option = self._prepare_pp_option(pp_option, device) + + if self.model_dir and get_model_paths(self.model_dir, self.MODEL_FILE_PREFIX): + self._use_hpip = use_hpip + if not use_hpip: + self._pp_option = self._prepare_pp_option(pp_option, device) + else: + require_hpip() + self._hpi_config = self._prepare_hpi_config(hpi_config, device) else: - require_hpip() - self._hpi_config = self._prepare_hpi_config(hpi_config, device) + self._use_hpip = False logging.debug(f"{self.__class__.__name__}: {self.model_dir}") @@ -144,7 +185,13 @@ def model_name(self) -> str: Returns: str: The model name. """ - return self.config["Global"]["model_name"] + if self.config: + return self.config["Global"]["model_name"] + else: + if hasattr(self, "_model_name"): + return self._model_name + else: + raise AttributeError(f"{repr(self)} has no attribute 'model_name'.") @property def pp_option(self) -> PaddlePredictorOption: @@ -162,6 +209,12 @@ def hpi_config(self) -> HPIConfig: def use_hpip(self) -> bool: return self._use_hpip + @property + def genai_config(self) -> GenAIConfig: + if not hasattr(self, "_genai_config"): + raise AttributeError(f"{repr(self)} has no attribute 'genai_config'.") + return self._genai_config + def __call__( self, input: Any, @@ -240,7 +293,6 @@ def get_hpi_info(self): try: return HPIInfo.model_validate(self.config["Hpi"]) except ValidationError as e: - logging.exception("The HPI info in the model config file is invalid.") raise RuntimeError(f"Invalid HPI info: {str(e)}") from e def create_static_infer(self): @@ -291,6 +343,10 @@ def process(self, batch_data: List[Any]) -> Dict[str, List[Any]]: """ raise NotImplementedError + def close(self) -> None: + if hasattr(self, "_genai_client"): + self._genai_client.close() + @classmethod def get_config_path(cls, model_dir) -> str: """Get the path to the configuration file for the given model directory. @@ -345,6 +401,7 @@ def _prepare_pp_option( device_info = None if pp_option is None: pp_option = PaddlePredictorOption() + if device_info: pp_option.device_type = device_info[0] pp_option.device_id = device_info[1] diff --git a/paddlex/inference/models/common/genai.py b/paddlex/inference/models/common/genai.py new file mode 100644 index 0000000000..260f516cee --- /dev/null +++ b/paddlex/inference/models/common/genai.py @@ -0,0 +1,94 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import weakref +from typing import Any, Dict, Optional + +from pydantic import BaseModel, model_validator +from typing_extensions import Literal + +from ....utils.deps import class_requires_deps + +SERVER_BACKENDS = ["fastdeploy-server", "vllm-server", "sglang-server"] + + +class GenAIConfig(BaseModel): + backend: Literal["native", "fastdeploy-server", "vllm-server", "sglang-server"] + server_url: Optional[str] = None + client_kwargs: Optional[Dict[str, Any]] = None + + @model_validator(mode="after") + def check_server_url(self): + if self.backend in SERVER_BACKENDS and self.server_url is None: + raise ValueError( + f"`server_url` must not be `None` for the {repr(self.backend)} backend." + ) + return self + + +def need_local_model(genai_config): + if genai_config is not None and genai_config.backend in SERVER_BACKENDS: + return False + return True + + +@class_requires_deps("openai") +class GenAIClient(object): + def __init__(self, backend, base_url, model_name=None, **kwargs): + from openai import OpenAI + + super().__init__() + + self.backend = backend + + if "api_key" not in kwargs: + kwargs["api_key"] = "null" + self._client = OpenAI(base_url=base_url, **kwargs) + + if model_name is not None: + self._model = model_name + else: + try: + models = self._client.models.list() + except Exception as e: + raise RuntimeError( + f"Failed to get the model list from the OpenAI-compatible server: {e}" + ) from e + self._model = models.data[0].id + + self._finalizer = weakref.finalize(self, self._close, self._client) + + @property + def openai_client(self): + return self._client + + def create_chat_completion(self, messages, **kwargs): + return self._client.chat.completions.create( + model=self._model, + messages=messages, + **kwargs, + ) + + def close(self): + self._close(self._client) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, exc_tb): + self.close() + + @staticmethod + def _close(client): + client.close() diff --git a/paddlex/inference/models/common/tokenizer/__init__.py b/paddlex/inference/models/common/tokenizer/__init__.py index ff9930a519..78ec1a3d7d 100644 --- a/paddlex/inference/models/common/tokenizer/__init__.py +++ b/paddlex/inference/models/common/tokenizer/__init__.py @@ -15,6 +15,7 @@ from .bert_tokenizer import BertTokenizer from .clip_tokenizer import CLIPTokenizer from .gpt_tokenizer import GPTTokenizer +from .llama_tokenizer import LlamaTokenizer from .qwen2_5_tokenizer import MIXQwen2_5_Tokenizer from .qwen2_tokenizer import MIXQwen2Tokenizer, Qwen2Tokenizer from .qwen_tokenizer import QWenTokenizer diff --git a/paddlex/inference/models/common/tokenizer/clip_tokenizer.py b/paddlex/inference/models/common/tokenizer/clip_tokenizer.py index 24786709c9..d526db8db1 100644 --- a/paddlex/inference/models/common/tokenizer/clip_tokenizer.py +++ b/paddlex/inference/models/common/tokenizer/clip_tokenizer.py @@ -14,13 +14,13 @@ import json -import logging import os import shutil import unicodedata from functools import lru_cache from typing import List, Optional +from .....utils import logging from .tokenizer_utils import ( PretrainedTokenizer, 
    _is_control,
diff --git a/paddlex/inference/models/common/tokenizer/llama_tokenizer.py b/paddlex/inference/models/common/tokenizer/llama_tokenizer.py
new file mode 100644
index 0000000000..ce6d0fdf2e
--- /dev/null
+++ b/paddlex/inference/models/common/tokenizer/llama_tokenizer.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple
+
+from paddlex.inference.models.common.tokenizer.tokenizer_utils import (
+    PretrainedTokenizer,
+)
+
+
+class LlamaTokenizer(PretrainedTokenizer):
+    model_input_names = ["input_ids", "attention_mask", "position_ids"]
+    resource_files_names = {
+        "vocab_file": "sentencepiece.bpe.model",
+    }
+    pretrained_resource_files_map = {
+        "vocab_file": {
+            "__internal_testing__/micro-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+            "__internal_testing__/tiny-random-llama": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+            "facebook/llama-7b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+            "facebook/llama-13b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+            "facebook/llama-30b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+            "facebook/llama-65b": "https://bj.bcebos.com/paddlenlp/models/transformers/llama/sentencepiece.bpe.model",
+        },
+    }
+
+    pretrained_init_configuration = {
+        "__internal_testing__/micro-random-llama": {},
+        "__internal_testing__/tiny-random-llama": {},
+        "facebook/llama-7b": {},
+        "facebook/llama-13b": {},
+        "facebook/llama-30b": {},
+        "facebook/llama-65b": {},
+    }
+    padding_side = "left"
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        add_bos_token=True,
+        add_eos_token=False,
+        sp_model_kwargs=None,
+        decode_with_prefix_space=False,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        super().__init__(
+            bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs
+        )
+
+        self.vocab_file = vocab_file
+        self.add_bos_token = add_bos_token
+        self.add_eos_token = add_eos_token
+        self.decode_with_prefix_space = decode_with_prefix_space
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", True))
+
+    @property
+    def vocab_size(self):
+        """Returns vocab size"""
+        return self.sp_model.get_piece_size()
+
+    def __len__(self):
+        """
+        Returns the vocabulary size.
added_tokens_encoder has to be added in the sp_model + """ + added_size = 0 + + for id in self.added_tokens_decoder: + if id >= self.sp_model.get_piece_size(): + added_size += 1 + + return self.vocab_size + added_size + + @property + def bos_token_id(self) -> Optional[int]: + return self.sp_model.bos_id() + + @property + def eos_token_id(self) -> Optional[int]: + return self.sp_model.eos_id() + + def get_spm_processor(self, from_slow=True): + import sentencepiece as spm + from sentencepiece import sentencepiece_model_pb2 as model_pb2 + + tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) + if from_slow: # no dependency on protobuf + tokenizer.Load(self.vocab_file) + return tokenizer + + with open(self.vocab_file, "rb") as f: + sp_model = f.read() + model = model_pb2.ModelProto.FromString(sp_model) + normalizer_spec = model_pb2.NormalizerSpec() + normalizer_spec.add_dummy_prefix = False + model.normalizer_spec.MergeFrom(normalizer_spec) + sp_model = model.SerializeToString() + tokenizer.LoadFromSerializedProto(sp_model) + return tokenizer + + def get_vocab(self): + """Returns vocab as a dict""" + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + """Returns a tokenized string.""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.id_to_piece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for i, token in enumerate(tokens): + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special and i != 0: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string + + def save_vocabulary( + self, save_directory, filename_prefix: Optional[str] = None + ) -> Tuple[str]: + """ + Save the vocabulary and special tokens file to a directory. + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if not os.path.isdir(save_directory): + raise ValueError( + f"Vocabulary path ({save_directory}) should be a directory" + ) + + out_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + + self.resource_files_names["vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath( + out_vocab_file + ) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + if self.add_bos_token: + bos_token_ids = [self.bos_token_id] + else: + bos_token_ids = [] + + output = bos_token_ids + token_ids_0 + + if token_ids_1 is not None: + output = output + token_ids_1 + + if self.add_eos_token: + output = output + [self.eos_token_id] + + return output + + def get_special_tokens_mask( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None, + already_has_special_tokens: bool = False, + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, + token_ids_1=token_ids_1, + already_has_special_tokens=True, + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. T5 does not make + use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + Returns: + `List[int]`: List of zeros. 
+ """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] diff --git a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py index b73b6b01c6..2da3a88ea8 100644 --- a/paddlex/inference/models/common/tokenizer/tokenizer_utils.py +++ b/paddlex/inference/models/common/tokenizer/tokenizer_utils.py @@ -1239,12 +1239,6 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]: # Strip white spaces on the left if tok_extended.lstrip and left: tokens[i - 1] = left.rstrip() # Opposite here - else: - # We strip left and right by default - if right: - tokens[i + 1] = right.lstrip() - if left: - tokens[i - 1] = left.rstrip() # ["This is something", "", "else"] tokenized_text = [] for token in tokens: diff --git a/paddlex/inference/models/common/vlm/generation/configuration_utils.py b/paddlex/inference/models/common/vlm/generation/configuration_utils.py index 83f4db0051..4f12772346 100644 --- a/paddlex/inference/models/common/vlm/generation/configuration_utils.py +++ b/paddlex/inference/models/common/vlm/generation/configuration_utils.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy import json import os import warnings @@ -171,6 +172,9 @@ def __init__(self, **kwargs): # Validate the values of the attributes self.validate(is_init=True) + def to_dict(self): + return copy.deepcopy(self.__dict__) + def __eq__(self, other): if not isinstance(other, GenerationConfig): return False diff --git a/paddlex/inference/models/common/vlm/generation/utils.py b/paddlex/inference/models/common/vlm/generation/utils.py index 89e67ace62..a438c91886 100644 --- a/paddlex/inference/models/common/vlm/generation/utils.py +++ b/paddlex/inference/models/common/vlm/generation/utils.py @@ -894,9 +894,9 @@ def generate( # ['是的', '嗯嗯'] """ if generation_config is None: - if ( - self.generation_config is None - or self.generation_config._from_model_config + if self.generation_config is None or ( + self.generation_config._from_model_config + and self.config._has_non_default_generation_parameters() ): new_generation_config = GenerationConfig.from_model_config(self.config) if new_generation_config != self.generation_config: @@ -1097,6 +1097,8 @@ def generate( if "logits_processors" in model_kwargs: model_kwargs.pop("logits_processors") + model_kwargs["use_cache"] = generation_config.use_cache + stopping_criteria = ( stopping_criteria if stopping_criteria is not None @@ -1239,7 +1241,6 @@ def greedy_search( synced_gpus=False, **model_kwargs, ): - model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) logits_processors = ( logits_processors if logits_processors is not None @@ -1362,7 +1363,6 @@ def sample( synced_gpus=False, **model_kwargs, ): - model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) logits_processors = ( logits_processors @@ -1751,8 +1751,6 @@ def beam_search( synced_gpus=False, **model_kwargs, ): - model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) - logits_processors = ( logits_processors if logits_processors is not None @@ -1958,7 +1956,6 @@ def group_beam_search( synced_gpus=False, **model_kwargs, ): - model_kwargs["use_cache"] = model_kwargs.get("use_cache", True) logits_processors = ( logits_processors if logits_processors is not None diff --git a/paddlex/inference/models/common/vlm/transformers/configuration_utils.py 
b/paddlex/inference/models/common/vlm/transformers/configuration_utils.py index daa6a51a1d..5936c41717 100644 --- a/paddlex/inference/models/common/vlm/transformers/configuration_utils.py +++ b/paddlex/inference/models/common/vlm/transformers/configuration_utils.py @@ -865,9 +865,6 @@ def _dict_from_json_file(cls, json_file: Union[str, os.PathLike]): def __eq__(self, other): return self.__dict__ == other.__dict__ - def __repr__(self): - return f"{self.__class__.__name__} {self.to_json_string()}" - def to_diff_dict(self, saving_file=False) -> Dict[str, Any]: """ Removes all attributes from config which correspond to the default config attributes for better readability and diff --git a/paddlex/inference/models/common/vlm/transformers/masking_utils.py b/paddlex/inference/models/common/vlm/transformers/masking_utils.py new file mode 100644 index 0000000000..cbdbcfb0c2 --- /dev/null +++ b/paddlex/inference/models/common/vlm/transformers/masking_utils.py @@ -0,0 +1,317 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file was adapted from https://github.com/huggingface/transformers/blob/05000aefe173bf7a10fa1d90e4c528585b45d3c7/src/transformers/masking_utils.py +# Original copyright notice below: +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
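+
+# Composable mask predicates (causal, padding, and/or combinators) and the
+# SDPA/eager mask builders consumed by `create_causal_mask` below, operating on
+# Paddle tensors.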
+ +from typing import Callable, List, Optional + +import paddle + + +def and_masks(*mask_functions: List[Callable]) -> Callable: + """Returns a mask function that is the intersection of provided mask functions""" + if not all(callable(arg) for arg in mask_functions): + raise RuntimeError( + f"All inputs should be callable mask_functions: {mask_functions}" + ) + + def and_mask(batch_idx, head_idx, q_idx, kv_idx): + result = paddle.ones(q_idx.shape, dtype="bool") + for mask in mask_functions: + result = result & mask(batch_idx, head_idx, q_idx, kv_idx) + return result + + return and_mask + + +def or_masks(*mask_functions: List[Callable]) -> Callable: + """Returns a mask function that is the union of provided mask functions""" + if not all(callable(arg) for arg in mask_functions): + raise RuntimeError( + f"All inputs should be callable mask_functions: {mask_functions}" + ) + + def or_mask(batch_idx, head_idx, q_idx, kv_idx): + result = q_idx.new_zeros((), dtype="bool") + for mask in mask_functions: + result = result | mask(batch_idx, head_idx, q_idx, kv_idx) + return result + + return or_mask + + +def causal_mask_function( + batch_idx: int, head_idx: int, q_idx: int, kv_idx: int +) -> bool: + """ + This creates a basic lower-diagonal causal mask. + """ + return kv_idx <= q_idx + + +def prepare_padding_mask( + attention_mask, kv_length: int, kv_offset: int, _slice: bool = True +): + """ + From the 2D attention mask, prepare the correct padding mask to use by potentially padding it, and slicing + according to the `kv_offset` if `_slice` is `True`. + """ + local_padding_mask = attention_mask + if attention_mask is not None: + # Pad it if necesary + if (padding_length := kv_length + kv_offset - attention_mask.shape[-1]) > 0: + local_padding_mask = paddle.nn.functional.pad( + attention_mask, (0, padding_length) + ) + # For flex, we should not slice them, only use an offset + if _slice: + mask_indices = paddle.arange(kv_length) + mask_indices += kv_offset + local_padding_mask = local_padding_mask[:, mask_indices] + return local_padding_mask + + +def _ignore_causal_mask_sdpa( + padding_mask, + query_length: int, + kv_length: int, + kv_offset: int, + local_attention_size: Optional[int] = None, +) -> bool: + """ + Detects whether the causal mask can be ignored in case PaddlePaddle's SDPA is used, rather relying on SDPA's `is_causal` argument. + + In case no token is masked in the 2D `padding_mask` argument, if `query_length == 1` or + `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks, + allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is + passed). + """ + if padding_mask is not None and padding_mask.shape[-1] > kv_length: + mask_indices = paddle.arange(kv_length) + mask_indices += kv_offset + padding_mask = padding_mask[:, mask_indices] + + if ( + (query_length == 1 or (kv_length == query_length)) + and (local_attention_size is None or kv_length < local_attention_size) + and ( + padding_mask is None + or ( + padding_mask.all() + if query_length == 1 + else padding_mask[:, :query_length].all() + ) + ) + ): + return True + + return False + + +def padding_mask_function(padding_mask: paddle.Tensor) -> Callable: + """ + This return the mask_function function corresponding to a 2D padding mask. + """ + + def inner_mask(batch_idx: int, head_idx: int, q_idx: int, kv_idx: int) -> bool: + # Note that here the mask should ALWAYS be at least of the max `kv_index` size in the dimension 1. 
This is because + # we cannot pad it here in the mask_function as we don't know the final size, and we cannot try/except, as it is not + # vectorizable on accelerator devices + return padding_mask[batch_idx, kv_idx] + + return inner_mask + + +def sdpa_mask( + batch_size: int, + cache_position, + kv_length: int, + kv_offset: int = 0, + mask_function: Callable = causal_mask_function, + attention_mask=None, + local_size: Optional[int] = None, + allow_is_causal_skip: bool = True, + **kwargs, +): + q_length = cache_position.shape[0] + # Potentially pad the 2D mask, and slice it correctly + padding_mask = prepare_padding_mask( + attention_mask, kv_length, kv_offset, _slice=False + ) + + # Under specific conditions, we can avoid materializing the mask, instead relying on the `is_causal` argument + if allow_is_causal_skip and _ignore_causal_mask_sdpa( + padding_mask, q_length, kv_length, kv_offset, local_size + ): + return None + + kv_arange = paddle.arange(kv_length) + kv_arange += kv_offset + + # Potentially add the padding 2D mask + if padding_mask is not None: + mask_function = and_masks(mask_function, padding_mask_function(padding_mask)) + + batch_arange = paddle.arange(batch_size) + batch_idx = batch_arange.reshape([-1, 1, 1, 1]) + head_arange = paddle.arange(1) + head_idx = head_arange.reshape([1, -1, 1, 1]) + q_idx = cache_position.reshape([1, 1, -1, 1]) + kv_idx = kv_arange.reshape([1, 1, 1, -1]) + causal_mask = mask_function(batch_idx, head_idx, q_idx, kv_idx) + + return causal_mask + + +def eager_mask( + batch_size: int, + cache_position, + kv_length: int, + kv_offset: int = 0, + mask_function: Callable = causal_mask_function, + attention_mask=None, + dtype=paddle.float32, + **kwargs, +): + # The masks for eager attention are simply boolean mask from sdpa, casted to 0 and -inf + _ = kwargs.pop("allow_is_causal_skip", None) + mask = sdpa_mask( + batch_size=batch_size, + cache_position=cache_position, + kv_length=kv_length, + kv_offset=kv_offset, + mask_function=mask_function, + attention_mask=attention_mask, + allow_is_causal_skip=False, + **kwargs, + ) + min_dtype = paddle.finfo(dtype).min + # we need 0s where the tokens should be taken into account, and -inf otherwise (mask is already of boolean type) + mask = paddle.where( + mask, + paddle.to_tensor(0.0, dtype=dtype), + paddle.to_tensor(min_dtype, dtype=dtype), + ) + return mask + + +ALL_MASK_ATTENTION_FUNCTIONS = { + "sdpa": sdpa_mask, + "eager": eager_mask, +} + + +def _preprocess_mask_arguments( + attn_implementation, + input_embeds, + attention_mask, + cache_position, + past_key_values, + layer_idx, +): + """ + Perform some common pre-processing of the mask arguments we get from the modeling code. Mostly determine the + key-value length and offsets, and if we should early exit or not. 
+ """ + # If the mask is already 4D, simply return as-is (it was already prepared, or it is custom) + if paddle.is_tensor(attention_mask) and len(attention_mask.shape) == 4: + return True, attention_mask, None, None, None + + if attn_implementation not in ALL_MASK_ATTENTION_FUNCTIONS: + return True, None, None, None + + if attention_mask is not None and attention_mask.ndim == 2: + attention_mask = attention_mask.astype("bool") + + # If using a cache, it can give all informations about mask sizes based on seen tokens + if past_key_values is not None: + kv_length, kv_offset = past_key_values.get_mask_sizes(cache_position, layer_idx) + # Otherwise, the sizes are simply the input sizes + else: + kv_length, kv_offset = input_embeds.shape[1], 0 + + return False, attention_mask, kv_length, kv_offset + + +def create_causal_mask( + attn_implementation, + input_embeds, + attention_mask, + cache_position, + past_key_values, + position_ids=None, + or_mask_function=None, + and_mask_function=None, +): + # `position_ids` is currently not used + # If we have an HybridCache structure, here we want to create the mask for the full layers + if hasattr(past_key_values, "is_sliding") and False in past_key_values.is_sliding: + layer_idx = past_key_values.is_sliding.index(False) + else: + layer_idx = 0 + + early_exit, attention_mask, kv_length, kv_offset = _preprocess_mask_arguments( + attn_implementation, + input_embeds, + attention_mask, + cache_position, + past_key_values, + layer_idx, + ) + if early_exit: + return attention_mask + + batch_size, dtype = input_embeds.shape[0], input_embeds.dtype + mask_factory_function = causal_mask_function + mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[attn_implementation] + + allow_is_causal_skip = ( + not past_key_values.is_compileable if past_key_values is not None else True + ) + + # Allow slight deviations from causal mask + if or_mask_function is not None: + mask_factory_function = or_masks(mask_factory_function, or_mask_function) + allow_is_causal_skip = False + if and_mask_function is not None: + mask_factory_function = and_masks(mask_factory_function, and_mask_function) + allow_is_causal_skip = False + + # We now create the mask + causal_mask = mask_interface( + batch_size=batch_size, + cache_position=cache_position, + kv_length=kv_length, + kv_offset=kv_offset, + mask_function=mask_factory_function, + attention_mask=attention_mask, + allow_is_causal_skip=allow_is_causal_skip, # additional kwarg for sdpa + dtype=dtype, # Additional kwarg for eager + ) + return causal_mask diff --git a/paddlex/inference/models/common/vlm/transformers/model_utils.py b/paddlex/inference/models/common/vlm/transformers/model_utils.py index d409f5f889..feede87ef2 100644 --- a/paddlex/inference/models/common/vlm/transformers/model_utils.py +++ b/paddlex/inference/models/common/vlm/transformers/model_utils.py @@ -1544,23 +1544,15 @@ def _fuse_or_split_keys( ) if len(unexpected_keys) > 0: - if logging.logging.level < 20: - logging.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" - f" initializing {model.__class__.__name__}: {sorted(unexpected_keys)}\n- This IS expected if you are" - f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" - " with another architecture (e.g. 
initializing a BertForSequenceClassification model from a" - " BertForPreTraining model).\n- This IS NOT expected if you are initializing" - f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" - " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." - ) - else: - logging.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" - f" initializing the model, - This IS expected if you are" - f" initializing the model from a checkpoint of a model trained on another task or" - " with another architecture." - ) + logging.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {sorted(unexpected_keys)}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" + " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) else: logging.info( f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n" diff --git a/paddlex/inference/models/doc_vlm/modeling/__init__.py b/paddlex/inference/models/doc_vlm/modeling/__init__.py index e4ad8559ef..565fd61d49 100644 --- a/paddlex/inference/models/doc_vlm/modeling/__init__.py +++ b/paddlex/inference/models/doc_vlm/modeling/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. from .GOT_ocr_2_0 import PPChart2TableInference +from .ppocrvl import PPOCRVLForConditionalGeneration from .qwen2_5_vl import PPDocBee2Inference from .qwen2_vl import PPDocBeeInference, Qwen2VLForConditionalGeneration diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/__init__.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/__init__.py new file mode 100644 index 0000000000..f2b00b31b0 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._ppocrvl import PPOCRVLForConditionalGeneration diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_config.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_config.py new file mode 100644 index 0000000000..dc99659227 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_config.py @@ -0,0 +1,182 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/configuration_keye.py +# Original header: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ....common.vlm.transformers import PretrainedConfig + + +class PPOCRVisionConfig(PretrainedConfig): + model_type = "ppocrvl" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=14, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + spatial_merge_size=2, + temporal_patch_size=2, + tokens_per_second=2, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.spatial_merge_size = spatial_merge_size + self.temporal_patch_size = temporal_patch_size + self.tokens_per_second = tokens_per_second + + +class PPOCRVLConfig(PretrainedConfig): + model_type = "ppocrvl" + keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = {"vision_config": PPOCRVisionConfig} + + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + + def __init__( + self, + vocab_size=32000, + hidden_size=768, + intermediate_size=11008, + max_position_embeddings=32768, + num_hidden_layers=2, + num_attention_heads=2, + image_token_id=101304, + video_token_id=101305, + vision_start_token_id=101306, + rope_scaling=None, + rms_norm_eps=1e-6, + use_cache=False, + use_flash_attention=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + head_dim=128, + hidden_act="silu", + use_bias=False, + rope_theta=10000, + weight_share_add_bias=True, + ignored_index=-100, + attention_probs_dropout_prob=0.0, + hidden_dropout_prob=0.0, + compression_ratio: float = 1.0, + num_key_value_heads=None, + max_sequence_length=None, + 
tie_word_embeddings=False, + vision_config=None, + **kwargs, + ): + # Set default for tied embeddings if not specified. + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + if isinstance(vision_config, dict): + self.vision_config = self.sub_configs["vision_config"](**vision_config) + elif vision_config is None: + self.vision_config = self.sub_configs["vision_config"]() + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.max_position_embeddings = max_position_embeddings + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.rope_scaling = rope_scaling + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.use_flash_attention = use_flash_attention + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.image_token_id = image_token_id + self.video_token_id = video_token_id + self.vision_start_token_id = vision_start_token_id + self.head_dim = head_dim + if hidden_act != "silu": + raise NotImplementedError + self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.use_bias = use_bias + self.weight_share_add_bias = weight_share_add_bias + self.rope_theta = rope_theta + self.ignored_index = ignored_index + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.hidden_dropout_prob = hidden_dropout_prob + self.compression_ratio = compression_ratio + self.num_key_value_heads = num_key_value_heads + self.max_sequence_length = max_sequence_length + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + # Currently, these configuration items are hard-coded + self.fuse_rms_norm = True + self.use_sparse_flash_attn = True + self.use_var_len_flash_attn = False + self.scale_qk_coeff = 1.0 + self.fuse_softmax_mask = False + self.use_sparse_head_and_loss_fn = False + self.use_recompute_loss_fn = False + self.use_fused_head_and_loss_fn = False + self.fuse_linear = False + self.token_balance_seqlen = False + self.use_rmsnorm = True + self.fuse_ln = False + self.cachekv_quant = False + self.fuse_swiglu = False + self.freq_allocation = 20 diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/__init__.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/__init__.py new file mode 100644 index 0000000000..64c5821cf2 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/__init__.py @@ -0,0 +1,105 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
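+#
+# NOTE: This package wraps Paddle fleet tensor-parallel / sequence-parallel utilities
+# for the PP-OCRVL modeling code; the implementations live in `common_dist_utils`.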
+ +""" +Distributed utils +""" + +# TODO: Support XPU + +import paddle + +from .common_dist_utils import ( + AllGatherVarlenOp, + ColumnParallelLinear, + ColumnSequenceParallelLinear, + GatherOp, + RowParallelLinear, + RowSequenceParallelLinear, + RRColumnSequenceParallelLinear, + RRRowSequenceParallelLinear, + ScatterOp, + get_hcg, + mark_as_sequence_parallel_parameter, + sequence_parallel_sparse_mask_labels, +) + +__all__ = [ + "ColumnParallelLinear", + "ColumnSequenceParallelLinear", + "RowParallelLinear", + "RowSequenceParallelLinear", + "GatherOp", + "ScatterOp", + "mark_as_sequence_parallel_parameter", + "ParallelCrossEntropy", + "get_rng_state_tracker", + "parallel_matmul", + "RRColumnSequenceParallelLinear", + "RRRowSequenceParallelLinear", + "AllGatherVarlenOp", + "sequence_parallel_sparse_mask_labels", + "get_hcg", +] + + +def parallel_matmul( + x, + y, + bias=None, + transpose_y=False, + tensor_parallel_degree=1, + tensor_parallel_output=True, + fuse_linear=False, + training=None, +): + """ + Parallel matmul wrapper. + + Args: + x (Tensor): Input tensor. + y (Tensor): Weight tensor. + bias (Tensor, optional): Bias tensor. Default is None. + transpose_y (bool, optional): Whether to transpose y. Default is False. + tensor_parallel_degree (int, optional): Tensor parallel degree. Default is 1. + tensor_parallel_output (bool, optional): Whether to output tensor parallel. Default is True. + fuse_linear (bool, optional): Whether to fuse linear. Default is False. + training (bool, optional): Training state. Default is None. + Returns: + Tensor: Output tensor. + """ + if paddle.is_compiled_with_xpu(): + from .common_dist_utils import _parallel_matmul as default_parallel_matmul + + return default_parallel_matmul( + x, + y, + bias=bias, + transpose_y=transpose_y, + tensor_parallel_degree=tensor_parallel_degree, + tensor_parallel_output=tensor_parallel_output, + fuse_linear=fuse_linear, + ) + else: + from .common_dist_utils import _parallel_matmul + + return _parallel_matmul( + x, + y, + bias=bias, + transpose_y=transpose_y, + tensor_parallel_degree=tensor_parallel_degree, + tensor_parallel_output=tensor_parallel_output, + fuse_linear=fuse_linear, + ) diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/common_dist_utils.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/common_dist_utils.py new file mode 100644 index 0000000000..bc69262193 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_distributed/common_dist_utils.py @@ -0,0 +1,713 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Common distributed utils. 
+""" + +import paddle +import paddle.nn.functional as F +from paddle import distributed as dist +from paddle.autograd import PyLayer +from paddle.distributed import fleet +from paddle.distributed.fleet.meta_parallel import ( + ColumnParallelLinear, + RowParallelLinear, +) +from paddle.distributed.fleet.utils.sequence_parallel_utils import ( + AllGatherOp, + ColumnSequenceParallelLinear, + GatherOp, + ReduceScatterOp, + RowSequenceParallelLinear, + ScatterOp, + all_gather, + mark_as_sequence_parallel_parameter, + scatter, +) +from paddle.incubate.tensor.manipulation import create_async_load + +from .._refined_recompute.utils import RefinedRecomputeFunction + +__all__ = [ + "get_hcg", + "_parallel_matmul", + "scatter_axis", + "mp_slice", + "all_gather_varlen", + "ColumnParallelLinear", + "ColumnSequenceParallelLinear", + "RowParallelLinear", + "RowSequenceParallelLinear", + "GatherOp", + "ScatterOp", + "mark_as_sequence_parallel_parameter", + "RRColumnSequenceParallelLinear", + "RRRowSequenceParallelLinear", + "AllGatherVarlenOp", + "sequence_parallel_sparse_mask_labels", + "get_async_loader", + "hack_offload_wait", + "hack_reload_wait", + "all_gather_group", + "reduce_scatter_group", +] + + +def get_hcg(): + """ + Get hybrid communicate group. + """ + return fleet.get_hybrid_communicate_group() + + +def _parallel_matmul( + x, + y, + bias=None, + transpose_y=False, + tensor_parallel_degree=1, + tensor_parallel_output=True, + fuse_linear=False, +): + """ + Performs parallel matrix multiplication with tensor model parallelism support. + + Args: + x (paddle.Tensor): Input tensor with shape [batch_size, seq_len, hidden_size] + y (Union[paddle.Tensor, EagerParamBase]): Weight matrix which can be: + - Regular tensor + - Distributed parameter in tensor parallel mode + bias (Optional[paddle.Tensor]): Optional bias tensor + transpose_y (bool): Whether to transpose the 'y' matrix before multiplication + tensor_parallel_degree (int): Degree of tensor model parallelism (default: 1) + tensor_parallel_output (bool): Whether to keep output in tensor parallel format + or gather across devices (default: True) + fuse_linear (bool): Whether to use fused linear operation for optimization + + Returns: + paddle.Tensor + + Raises: + AssertionError: If tensor parallel is enabled but weight is not distributed + AttributeError: If called without distributed.launch context + """ + if tensor_parallel_degree > 1: + if isinstance(y, paddle.base.framework.EagerParamBase): + assert y.is_distributed + # if not running under distributed.launch, it will raise AttributeError: 'Fleet' object has no attribute '_hcg' + pg = fleet.get_hybrid_communicate_group().get_model_parallel_group() + input_parallel = paddle.distributed.collective._c_identity(x, group=pg) + + if transpose_y: + logits = paddle.matmul(input_parallel, y, transpose_y=True) + if bias is not None: + logits += bias + else: + if fuse_linear: + logits = paddle.incubate.nn.functional.fused_linear( + input_parallel, y, bias + ) + else: + logits = F.linear(input_parallel, y, bias) + + if tensor_parallel_output: + return logits + + return paddle.distributed.collective._c_concat(logits, group=pg) + + else: + if fuse_linear: + logits = paddle.incubate.nn.functional.fused_linear( + x, y, bias, transpose_weight=transpose_y + ) + else: + logits = paddle.matmul(x, y, transpose_y=transpose_y) + if bias is not None: + logits += bias + return logits + + +def scatter_axis(input, group=None, axis=0): + """ + Uniformly splits the `input` along dimension 0 across model parallel 
groups. + This API is not related to `distributed.scatter`. + + Args: + input: Input tensor to be split + group: Communication group for parallel processing (default: model parallel group) + axis: Dimension along which to split (default: 0) + + Returns: + A slice of the input tensor corresponding to this rank's portion + """ + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return input.clone() + rank = group.rank + seq_len = input.shape[axis] + assert seq_len % parallelism == 0, ( + f"Input sequence length {seq_len} can't be divided exactly" + f" by sequence parallelism {parallelism}" + ) + interval = seq_len // parallelism + input = paddle.slice( + input, axes=[axis], starts=[interval * rank], ends=[interval * (rank + 1)] + ) + # slice uses stride, so we maintain the memory of whole input, use assign to free the whole input + # which can avoid OOM. + input = paddle.assign(input) + return input + + +def mp_slice(x, indices=None, group=None, axis=0): + """ + Slices tensor `x` along dimension 0 according to `indices` without communication. + + Args: + x: Input tensor to be sliced + indices: List of indices defining how to slice the tensor + group: Communication group for parallel processing (default: model parallel group) + axis: Dimension along which to slice (default: 0) + + Returns: + A slice of the input tensor corresponding to this rank's portion + """ + if indices is None: + return scatter(x, group, axis) + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return x + rank = group.rank + assert len(indices) == parallelism, (len(indices), parallelism) + indices = F.pad(paddle.to_tensor(indices).cumsum(0), [1, 0]) + input = paddle.slice( + x, axes=[axis], starts=[indices[rank]], ends=[indices[rank + 1]] + ) + input = paddle.assign(input) + return input + + +def all_gather_varlen(input, indices, group=None, axis=0, sync_op=True): + """ + Variable-length version of `all_gather` that behaves similarly to `distributed.all_gather`. + + Args: + input: Local tensor to be gathered + indices: List of sizes from each rank indicating how much to gather from each + group: Communication group for parallel processing (default: model parallel group) + axis: Dimension along which to gather (only 0 is supported) + sync_op: Whether to synchronize the operation + + Returns: + A concatenated tensor containing all gathered data + """ + assert axis == 0, "only support axis=0" + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + input_sizes = [len(input)] * parallelism + output_sizes = indices + out = paddle.empty([sum(indices)] + input.shape[1:], dtype=input.dtype) + task = dist.stream.alltoall_single( + out, + ( + paddle.concat([input] * parallelism, 0) if len(input) else input + ), # TODO: check this + output_sizes, # input-size + input_sizes, + group=group, + sync_op=sync_op, + use_calc_stream=sync_op, + ) + task.wait() + return out + + +class ReduceScatterGroupOp(PyLayer): + """ + Perform group reduce scatter. + """ + + @staticmethod + def forward(ctx, input, group=None): + """Forward pass: Reduce-Scatter operation + Args: + input (Tensor): Input tensor with shape [s, b, h]. + The 's' dimension will be split across model parallel group. 
+            group (ProcessGroup): Model parallel process group,
+                uses global group by default.
+        Returns:
+            Tensor: Output tensor after Reduce-Scatter with shape [s/n, b, h],
+                each device holds partial data of the original input.
+        """
+        ctx.group = group
+        return reduce_scatter_group(input, group=group)
+
+    @staticmethod
+    def backward(ctx, grad):
+        """Backward pass: All-Gather operation
+        Args:
+            grad (Tensor): Upstream gradient with shape [s/n, b, h]
+        Returns:
+            Tensor: Full gradient after All-Gather with restored shape [s, b, h],
+                aggregating gradients from all devices in model parallel group.
+        """
+        return all_gather_group(grad, group=ctx.group)
+
+
+class AllGatherGroupOp(PyLayer):
+    """
+    Perform group allgather.
+    """
+
+    @staticmethod
+    def forward(ctx, input, group=None):
+        """Forward pass: All-Gather operation
+        Args:
+            input (Tensor): Partitioned tensor with shape [s/n, b, h]
+                The 's' dimension is distributed across devices
+            group (ProcessGroup): Model parallel process group,
+                uses global group by default
+        Returns:
+            Tensor: Assembled tensor after All-Gather with shape [s, b, h],
+                containing full parameter from all devices
+        """
+        ctx.group = group
+        return all_gather_group(input, group=group)
+
+    @staticmethod
+    def backward(ctx, grad):
+        """Backward pass: Reduce-Scatter operation
+        Args:
+            grad (Tensor): Full gradient tensor with shape [s, b, h]
+        Returns:
+            Tensor: Scattered gradient with shape [s/n, b, h],
+                distributing reduced gradients to each device
+        """
+        return reduce_scatter_group(grad, group=ctx.group)
+
+
+class RRColumnSequenceParallelLinear(ColumnSequenceParallelLinear):
+    """
+    ColumnSequenceParallelLinear with refined recompute.
+    """
+
+    def __init__(
+        self,
+        in_features,
+        out_features,
+        weight_attr=None,
+        has_bias=None,
+        gather_output=True,
+        fuse_matmul_bias=False,
+        mp_group=None,
+        use_rr=False,
+        name=None,
+    ):
+        """
+        Initializes a ColumnSequenceParallelLinear module.
+
+        Args:
+            in_features (int): The number of input features.
+            out_features (int): The number of output features.
+            weight_attr (ParamAttr, optional): The parameter attribute for the learnable
+                weight matrix. Default: None.
+            has_bias (bool, optional): Whether the layer uses a bias term. If True, a bias
+                vector is added; if False, no bias is used. Default: None.
+            gather_output (bool, optional): Whether to gather the outputs from all ranks
+                during the forward pass. If False, each rank keeps only its own partition
+                of the output, which saves communication. Default: True.
+            fuse_matmul_bias (bool, optional): Whether to fuse matmul and bias into one op. Default: False.
+            mp_group (paddle.distributed.Group, optional): The group for model parallel. Default: None.
+            use_rr (bool, optional): Whether to use refined recompute. Default: False.
+            name (str, optional): Name for the instance to use in tracebacks. Default: None.
+ """ + super().__init__( + in_features=in_features, + out_features=out_features, + weight_attr=weight_attr, + has_bias=has_bias, + gather_output=gather_output, + fuse_matmul_bias=fuse_matmul_bias, + mp_group=mp_group, + name=name, + ) + + self._rr_column_ln = RefinedRecomputeFunction() if use_rr else None + if self.weight.is_distributed: + self.weight.split_axis = 1 + if has_bias and self.bias.is_distributed: + self.bias.split_axis = 0 + + def forward(self, x): + """ + Forward pass function that computes the product of the input tensor and model parameters. + + Args: + x (paddle.Tensor): Input tensor with shape (batch_size, seq_len, hidden_size) or (batch_size, hidden_size). + If sequence parallel is True, the shape is (seq_len, batch_size, hidden_size). + + Returns: + paddle.Tensor: Returns a tensor with shape (batch_size, seq_len, hidden_size) or (batch_size, hidden_size). + If sequence parallel is True, the shape is (seq_len, batch_size, hidden_size). + """ + # sequence parallelism is same as model parallelism + # if sequence parallel is true, input shape is [s, b, h] + # else input shape is [b, s, h] + if self.is_mp: + input_parallel = AllGatherOp.apply(x) + else: + input_parallel = x + + if ( + self._rr_column_ln is not None and self.training + ): # in eval mode, do not use refined recompute + output = self._rr_column_ln( + self.linear, + x=input_parallel, + weight=self.weight, + bias=self.bias, + ) + else: + output = self.linear( + input_parallel, self.weight, self.bias, name=self._name + ) + return output + + +class RRRowSequenceParallelLinear(RowSequenceParallelLinear): + """ + RowSequenceParallelLinear with refined recompute. + """ + + def __init__( + self, + in_features, + out_features, + weight_attr=None, + has_bias=True, + input_is_parallel=False, + fuse_matmul_bias=False, + mp_group=None, + use_rr=False, + name=None, + ): + """ + Args: + in_features (int): The number of input features. + out_features (int): The number of output features. + weight_attr (ParamAttr, optional): The parameter attribute for the learnable + weight matrix. Defaults to None. If it is None, the system will + generate a default Attribute object. + has_bias (bool, optional): Whether the layer uses a bias term. Defaults to True. + input_is_parallel (bool, optional): Whether the input is parallel. Defaults to False. + fuse_matmul_bias (bool, optional): Whether to fuse matmul and bias into one kernel. Defaults to False. + mp_group (Group, optional): Model parallel group. Defaults to None. + use_rr (bool, optional): Whether to use refined rr. Defaults to False. + name (str, optional): Name of the layer. Defaults to None. + """ + super().__init__( + in_features=in_features, + out_features=out_features, + weight_attr=weight_attr, + has_bias=has_bias, + input_is_parallel=input_is_parallel, + fuse_matmul_bias=fuse_matmul_bias, + mp_group=mp_group, + name=name, + ) + + self._rr_row_ln = RefinedRecomputeFunction() if use_rr else None + + if self.weight.is_distributed: + self.weight.split_axis = 0 + + def forward(self, x): + """ + Forward pass function that computes the product of the input tensor and model parameters. + + Args: + x (paddle.Tensor): Input tensor with shape (batch_size, in_features). + + Returns: + paddle.Tensor: Returns a tensor with shape (batch_size, out_features). 
+ """ + input_parallel = x + if self.is_mp: + if self.mp_scale is not None: + bias = self.mp_scale(self.bias, self.world_size) + else: + bias = None + + def linear_reduce_scatter(input, weight, bias=None, name=None): + output = self.linear(input, weight=weight, bias=bias, name=name) + return ReduceScatterOp.apply(output) + + if ( + self._rr_row_ln is not None and self.training + ): # in eval mode, do not use refined recompute + output_ = self._rr_row_ln( + linear_reduce_scatter, + input_parallel, + self.weight, + bias=bias, + name=self._name, + ) + else: + output_ = linear_reduce_scatter( + input_parallel, self.weight, bias=bias, name=self._name + ) + + # if self.bias is not none, sequence parallel will use + # register_hook to all_reduce self.bias + if bias is None and self.bias is not None: + output = output_ + self.bias + else: + output = output_ + else: + output = self.linear( + input_parallel, self.weight, self.bias, name=self._name + ) + return output + + +class AllGatherVarlenOp(PyLayer): + """ + A custom PyLayer that performs variable-length allgather operation. + + This operation handles tensors with different shapes across ranks by: + 1. Gathering shape information from all ranks + 2. Padding tensors to maximum size + 3. Performing allgather + 4. Reconstructing the original variable-length tensors + """ + + @staticmethod + def forward(ctx, input): + """Forward pass for variable-length allgather operation. + + Args: + ctx: PyLayer context for saving state + input (Tensor): Input tensor to be gathered (may have different sizes across ranks) + + Returns: + Tensor: Concatenated output from all ranks with original lengths + """ + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + + shape0 = paddle.to_tensor([input.shape[0]]) + shape0_all = paddle.empty(shape=[group.nranks], dtype=shape0.dtype) + dist.stream.all_gather(shape0_all, shape0, group=group, use_calc_stream=True) + shape0_all = shape0_all.numpy() + max_shape0 = shape0_all.max() + + indices = [] + for idx, s in enumerate(shape0_all): + offset = idx * max_shape0 + indices.extend(list(range(offset, offset + s))) + indices = paddle.to_tensor(indices) + + padding = max_shape0 - input.shape[0] + + ctx.shape0 = input.shape[0] + ctx.max_shape0 = max_shape0 + ctx.shape0_all = shape0_all + ctx.padding = padding + ctx.indices = indices + + if padding > 0: + input_shape = input.shape + input_shape[0] = padding + padding_tensor = paddle.empty(shape=input_shape, dtype=input.dtype) + input = paddle.concat([input, padding_tensor], axis=0) + output = all_gather(input) + output = paddle.gather(output, indices, axis=0) + + return output + + @staticmethod + def backward(ctx, grad): + """Backward pass for variable-length allgather operation. + + Args: + ctx: PyLayer context with saved state + grad (Tensor): Gradient flowing back through the graph + + Returns: + Tensor: Scattered gradient with original variable lengths + """ + input_shape = grad.shape + input_shape[0] = ctx.max_shape0 * ctx.shape0_all.shape[0] + output = paddle.zeros(shape=input_shape, dtype=grad.dtype) + + # grad = paddle.put_along_axis(output, ctx.indices, grad, axis=0) + grad = paddle.scatter(output, ctx.indices, grad) + grad = scatter(grad) + + if ctx.padding > 0: + grad = grad[: ctx.shape0] + return grad + + +def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): + """ + Processes sparse labels in sequence parallel training by gathering non-ignored labels across all ranks. 
+
+    This function handles the case where labels may contain ignored values (typically -100) by:
+    1. Distributing labels across model parallel ranks
+    2. Identifying and gathering only valid (non-ignored) labels
+    3. Performing a variable-length allgather operation to collect all valid labels
+
+    Args:
+        labels (paddle.Tensor): The input label tensor which may contain ignore_label values.
+            Shape should be compatible with model parallel distribution.
+        ignore_label (int, optional): The value used to indicate labels that should be ignored.
+            Defaults to -100 (common convention in NLP tasks).
+
+    Returns:
+        tuple: Contains two elements:
+            - labels_all_gather (paddle.Tensor): Concatenated tensor of all non-ignored labels
+                from all model parallel ranks.
+            - tgt_index (paddle.Tensor): Indices of the non-ignored labels in the local rank's
+                portion of the original labels tensor.
+
+    Note:
+        - This function assumes sequence parallel training is being used.
+        - If a rank has no valid labels (all ignored), it will still contribute one dummy label
+            (index 0) to maintain consistency in the distributed computation.
+        - The returned tgt_index can be used to reconstruct the original label positions.
+    """
+    hcg = fleet.get_hybrid_communicate_group()
+    group = hcg.get_model_parallel_group()
+    labels = labels.flatten()
+    labels_local = paddle.split(labels, group.nranks)[group.rank]
+
+    tgt_index = paddle.nonzero(labels_local != ignore_label).reshape([-1])
+    if tgt_index.numel() == 0:
+        tgt_index = paddle.to_tensor([0])
+
+    labels_local_gather = paddle.gather(labels_local, tgt_index, axis=0)
+    labels_all_gather = AllGatherVarlenOp.apply(labels_local_gather)
+    return labels_all_gather, tgt_index
+
+
+async_loader = None
+
+
+def get_async_loader():
+    """Return a cached async-load handle, stored on the hybrid communicate group when available."""
+    global async_loader
+    if not hasattr(fleet.fleet, "_hcg"):
+        if async_loader is None:
+            async_loader = create_async_load()
+        return async_loader
+
+    hcg = get_hcg()
+    if not hasattr(hcg, "async_loader"):
+        hcg.async_loader = create_async_load()
+    return hcg.async_loader
+
+
+def hack_offload_wait(task):
+    """Wait on the CPU side for an async offload task to finish."""
+    task.cpu_wait()
+
+
+def hack_reload_wait(task):
+    """Wait on the CUDA side for an async reload task to finish."""
+    task.cuda_wait()
+
+
+def all_gather_group(input, group=None, axis=0):
+    """Perform collective all-gather operation across a process group with axis control.
+
+    Functional Behavior:
+    - Aggregates input tensors from all processes in the specified group
+    - Supports concatenation along arbitrary dimensions (axis parameter)
+    - Optimizes for axis=0 via direct shape expansion to avoid concatenation overhead
+
+    Args:
+        input (Tensor): Local tensor to be gathered (shape: [..., D, ...])
+        group (ProcessGroup): Communication group (defaults to model parallel group)
+        axis (int): Concatenation dimension (default=0)
+
+    Returns:
+        Tensor: Concatenated tensor combining inputs from all processes:
+        - When axis=0: shape [D*N, ...] (N = group size)
+        - Otherwise: shape [..., D*N, ...]
along specified axis + """ + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return input.clone() + output_shape = input.shape + if axis == 0: + output_shape[axis] = output_shape[axis] * parallelism + output = paddle.empty(shape=output_shape, dtype=input.dtype) + dist.stream.all_gather(output, input, group=group, use_calc_stream=True) + return output + outputs = [ + paddle.empty(output_shape, dtype=input.dtype) for _ in range(parallelism) + ] + dist.stream.all_gather(outputs, input, group=group, use_calc_stream=True) + output = paddle.concat(outputs, axis=axis) + return output + + +def reduce_scatter_group(input, group=None): + """Perform reduce-scatter collective operation across a process group. + + Functional Behavior: + - Aggregates (sums) input tensors across all processes in the group + - Scatters the reduced result equally to all participants + - Operates along the first dimension (axis=0) of the input tensor + + Args: + input (Tensor): Local tensor to reduce (shape: [N*K, ...] where N=group_size) + group (ProcessGroup): Communication group (defaults to model parallel group) + + Returns: + Tensor: Scattered portion of reduced tensor with shape [K, ...] + """ + if group is None: + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + parallelism = group.nranks + if parallelism == 1: + return input.clone() + output_shape = input.shape + assert ( + input.shape[0] % parallelism == 0 + ), f"Input sequence length {input.shape[0]} can't be divided exactly by sequence parallelism {parallelism}" + output_shape[0] = output_shape[0] // parallelism + output = paddle.empty(shape=output_shape, dtype=input.dtype) + dist.stream.reduce_scatter( + output, input, op=dist.ReduceOp.SUM, group=group, use_calc_stream=True + ) + return output diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ernie.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ernie.py new file mode 100644 index 0000000000..7bc3b7f6ad --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ernie.py @@ -0,0 +1,2363 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
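+#
+# This module provides the ERNIE decoder building blocks used by the PP-OCRVL model:
+# RMSNorm/LayerNorm, multimodal rotary position embeddings, the gated SwiGLU MLP,
+# attention with flash and core paths, and a fused parallel LM-head cross-entropy.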
+ +"""Paddle Ernie model""" + +import contextlib +import functools +from functools import partial +from typing import Optional, Tuple + +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F +from paddle import incubate, nn, tensor +from paddle.autograd import PyLayer +from paddle.distributed import fleet +from paddle.distributed.fleet.layers.mpu import mp_ops +from paddle.distributed.fleet.layers.mpu.mp_layers import ( + ColumnParallelLinear, + RowParallelLinear, + VocabParallelEmbedding, +) +from paddle.distributed.fleet.meta_parallel import ( + ParallelCrossEntropy, + get_rng_state_tracker, +) +from paddle.distributed.fleet.utils import recompute + +from ......utils import logging +from ....common.vlm.transformers import PretrainedModel +from ....common.vlm.transformers.model_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, +) +from ._config import PPOCRVLConfig +from ._distributed import ( + AllGatherVarlenOp, + ColumnParallelLinear, + ColumnSequenceParallelLinear, + GatherOp, + RowParallelLinear, + RowSequenceParallelLinear, + RRColumnSequenceParallelLinear, + RRRowSequenceParallelLinear, + mark_as_sequence_parallel_parameter, + parallel_matmul, + sequence_parallel_sparse_mask_labels, +) +from ._fusion_ops import ( + Linear, + fused_rms_norm_ext, + fused_swiglu, + fusion_flash_attention, +) +from ._sequence_parallel_utils import ScatterOp + + +def calc_lm_head_logits( + config, hidden_states, weight, bias, tensor_parallel_output=None, training=True +): + """ + Calculate language model head logits with support for various parallelization strategies. + + This is the core function that computes the final output logits for a language model, + handling sequence parallelism and tensor parallelism configurations. + + Args: + config (PPOCRVLConfig): Model configuration. + hidden_states (Tensor): Hidden states from the transformer layers + weight (Tensor): Weight matrix for the language model head + bias (Tensor): Bias vector for the language model head + tensor_parallel_output (bool, optional): Override for tensor parallel output behavior. + If None, uses config.tensor_parallel_output. + Defaults to None. + training (bool, optional): Whether in training mode. Defaults to True. + + Returns: + Tensor: The computed logits for language modeling. + """ + if config.sequence_parallel: + if config.use_sparse_head_and_loss_fn: + pass # Nothing needs to be done. + else: + hidden_states = GatherOp.apply(hidden_states) + max_sequence_length = config.max_sequence_length + hidden_states = hidden_states.reshape( + [-1, max_sequence_length, hidden_states.shape[-1]] + ) + + if tensor_parallel_output is None: + tensor_parallel_output = config.tensor_parallel_output + logits = parallel_matmul( + hidden_states, + weight, + bias=bias, + transpose_y=config.tie_word_embeddings, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_output=tensor_parallel_output, + fuse_linear=config.fuse_linear, + training=training, + ) + + return logits + + +def subbatch(f, arg_idx, axis, bs, out_idx, use_recompute=False, same_arg_idx={}): + """ + Converts a function to one that applies to subbatch of an input dimension. + This is useful for processing large tensors in smaller chunks to reduce memory usage. + + Args: + f (Callable): Original function to be converted to subbatch processing. + arg_idx ([int]): Indices of the inputs to be subbatched. + axis ([int]): Indices of the dimensions to be subbatched for each input. 
+ bs (int): Subbatch size (number of elements to process at once). + out_idx (int): Index of the output dimension that needs stacking. + use_recompute (bool, optional): Whether to use recomputation for memory savings. Defaults to False. + same_arg_idx (dict, optional): Mapping of argument indices that share the same tensor. + e.g. {1: 0} means args[1] == args[0], avoiding duplicate slicing. + + Returns: + Callable: Converted function that processes inputs in subbatches. + """ + + @functools.wraps(f) + def wrapper(*args, **kwargs): + + assert len(arg_idx) == len( + axis + ), "Number of batching args and number of batching dims should match." + + inps = [args[i] for i in arg_idx] + axis_width = [inp.shape[d] for inp, d in zip(inps, axis)] + assert len(set(axis_width)) == 1, "Batch sizes should be kept equal." + + inp_axis = {inp: d for inp, d in zip(inps, axis)} + + axis_width = axis_width[0] + if axis_width < bs: + return f(*args, **kwargs) + + outs = [] + for slice_at in np.arange(0, axis_width, bs): + _args = [] + for i, inp in enumerate(args): + if i in same_arg_idx: + assert ( + i > same_arg_idx[i] + ), f"expect i > same_arg_idx[i], but got i: {i} and same_arg_idx[i]: {same_arg_idx[i]}" + _args.append(_args[same_arg_idx[i]]) + elif i in arg_idx: + inp = inp.slice( + [inp_axis[inp]], + [slice_at], + [min(inp.shape[inp_axis[inp]], slice_at + bs)], + ) + _args.append(inp) + else: + _args.append(inp) + if use_recompute: + out = paddle.distributed.fleet.utils.recompute(f, *_args, **kwargs) + else: + out = f(*_args, **kwargs) + outs.append(out) + + return paddle.concat(outs, out_idx) + + return wrapper + + +def _rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return paddle.concat((-x2, x1), axis=-1) + + +def _apply_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1): + mrope_section = mrope_section * 2 + cos = paddle.concat( + [m[i % 3] for i, m in enumerate(cos.split(mrope_section, axis=-1))], axis=-1 + ).unsqueeze(unsqueeze_dim) + sin = paddle.concat( + [m[i % 3] for i, m in enumerate(sin.split(mrope_section, axis=-1))], axis=-1 + ).unsqueeze(unsqueeze_dim) + + q_embed = (q * cos) + (_rotate_half(q) * sin) + k_embed = (k * cos) + (_rotate_half(k) * sin) + return q_embed, k_embed + + +class FusedDropoutImpl(nn.Layer): + """ + Fused dropout implementation with residual connection support. + + This layer combines dropout and residual addition in a single operation for better performance, + particularly on GPU devices. The dropout is conditionally applied based on the probability. + + Args: + prob (float): Dropout probability (between 0 and 1) + mode (str): Dropout mode, either 'upscale_in_train' or 'downscale_in_infer' + + Attributes: + prob (float): Stores the dropout probability + mode (str): Stores the dropout mode + dropout (nn.Dropout): The actual dropout layer instance + """ + + def __init__(self, prob, mode): + """ + Initialize the fused dropout layer. + + Args: + prob (float): Dropout probability (0 means no dropout) + mode (str): Dropout mode ('upscale_in_train' or 'downscale_in_infer') + """ + super().__init__() + self.prob = prob + self.mode = mode + self.dropout = nn.Dropout(p=prob, mode=mode) + + def forward(self, x, y): + """ + Forward pass of the fused dropout layer. 
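+        Applies dropout to ``x`` when ``prob`` > 0, then adds the residual ``y``.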
+ + Args: + x (Tensor): Input tensor to potentially apply dropout on + y (Tensor): Residual tensor to add to the (possibly dropped out) x + + Returns: + Tensor: Result of x (with optional dropout) + y + """ + if self.prob > 0: + x = self.dropout(x) + output = x + y + + return output + + +class RMSNorm(nn.Layer): + """ + Root Mean Square Layer Normalization (RMSNorm) implementation. + + RMSNorm is a simplified version of LayerNorm that focuses on the root mean square of inputs, + omitting the mean-centering operation. This provides computational efficiency while maintaining + good performance. + + """ + + def __init__(self, config): + """ + Initialize RMSNorm layer. + + Args: + config (PPOCRVLConfig): Model configuration. + """ + super().__init__() + self.hidden_size = config.hidden_size + self.weight = paddle.create_parameter( + shape=[self.hidden_size], + dtype=paddle.get_default_dtype(), + default_initializer=nn.initializer.Constant(1.0), + ) + self.variance_epsilon = config.rms_norm_eps + self.config = config + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + + def forward(self, hidden_states): + """ + Apply RMS normalization to input hidden states. + + Args: + hidden_states (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size] + + Returns: + Tensor: Normalized output tensor of same shape as input + + Note: + - Uses fused kernel if config.fuse_rms_norm is True for better performance + - Otherwise computes RMSNorm manually: + 1. Compute variance of features + 2. Apply reciprocal square root normalization + 3. Scale by learned weight parameter + - Maintains original dtype for numerical stability during computation + """ + if self.config.fuse_rms_norm: + return fused_rms_norm_ext( + hidden_states, self.weight, self.variance_epsilon + )[0].astype(self.weight.dtype) + with paddle.amp.auto_cast(False): + variance = hidden_states.astype("float32").pow(2).mean(-1, keepdim=True) + hidden_states = ( + paddle.rsqrt(variance + self.variance_epsilon) * hidden_states + ) + return hidden_states.astype(self.weight.dtype) * self.weight + + +class LayerNorm(nn.LayerNorm): + """ + Layer Normalization (LayerNorm) implementation with optional optimizations. + + This extends PaddlePaddle's built-in LayerNorm with: + 1. Sequence parallelism support + 2. Fast fused kernel implementation option + 3. Configurable epsilon value + + """ + + def __init__(self, config): + """ + Initialize LayerNorm with configuration. + + Args: + config (PPOCRVLConfig): Model configuration contains normalization parameters and flags. 
+ """ + super().__init__(config.hidden_size, epsilon=config.rms_norm_eps) + self.config = config + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.weight) + mark_as_sequence_parallel_parameter(self.bias) + + +class KeyeRotaryEmbedding(nn.Layer): + def __init__(self, config: PPOCRVLConfig, device=None): + super().__init__() + self.rope_kwargs = {} + if config is None: + raise NotImplementedError + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get( + "rope_type", config.rope_scaling.get("type") + ) + else: + self.rope_type = "default" + + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get( + "rope_type", config.rope_scaling.get("type") + ) + else: + self.rope_type = "default" + + self.config = config + if self.rope_type == "default": + dim = config.head_dim + inv_freq = 1.0 / ( + config.rope_theta + ** (paddle.arange(0, dim, 2, dtype="int64").astype("float32") / dim) + ) + self.attention_scaling = 1.0 + else: + raise ValueError(f"Unsupported rope type: {self.rope_type}") + + self.register_buffer("inv_freq", inv_freq, persistable=False) + self.original_inv_freq = self.inv_freq + + @paddle.no_grad() + def forward(self, x, position_ids): + # Core RoPE block. In contrast to other models, Keye has different position ids for the grids + # So we expand the inv_freq to shape (3, ...) + inv_freq_expanded = ( + self.inv_freq[None, None, :, None] + .cast("float32") + .expand((3, position_ids.shape[1], -1, 1)) + ) + position_ids_expanded = position_ids[:, :, None, :].cast( + "float32" + ) # shape (3, bs, 1, positions) + with paddle.amp.auto_cast(enable=False): + freqs = ( + inv_freq_expanded.cast("float32") + @ position_ids_expanded.cast("float32") + ).transpose((0, 1, 3, 2)) + emb = paddle.concat((freqs, freqs), axis=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.astype(x.dtype), sin.astype(x.dtype) + + +class Ernie4_5MLP(nn.Layer): + """ + Ernie4_5MLP - Gated Multi-Layer Perceptron module used in Ernie model. + """ + + def __init__(self, config, layer_idx=0): + """ + Initialize the MLP module with configuration options. + + Args: + config (PPOCRVLConfig): Model configurations. 
+ layer_idx (int): Index of current layer (default: 0) + """ + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + + if config.tensor_parallel_degree > 1: + ColumnLN = ( + ColumnSequenceParallelLinear + if config.sequence_parallel + else ColumnParallelLinear + ) + RowLN = ( + RowSequenceParallelLinear + if config.sequence_parallel + else RowParallelLinear + ) + + column_ln_configs = {} + if ( + config.recompute + and config.sequence_parallel + and config.skip_recompute_ops[layer_idx].get("mlp_column_ln", False) + ): + ColumnLN = RRColumnSequenceParallelLinear + column_ln_configs = {"use_rr": True} + self.up_gate_proj = ColumnLN( + self.hidden_size, + self.intermediate_size * 2, + gather_output=False, + has_bias=config.use_bias, + fuse_matmul_bias=config.fuse_linear, + **column_ln_configs, + ) + else: + LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear + self.up_gate_proj = LinearFN( + self.hidden_size, self.intermediate_size * 2, bias_attr=config.use_bias + ) + + if config.tensor_parallel_degree > 1: + row_ln_configs = {} + if ( + config.recompute + and config.sequence_parallel + and config.skip_recompute_ops[layer_idx].get("mlp_row_ln", False) + ): + RowLN = RRRowSequenceParallelLinear + row_ln_configs = {"use_rr": True} + self.down_proj = RowLN( + self.intermediate_size, + self.hidden_size, + input_is_parallel=True, + has_bias=config.use_bias, + fuse_matmul_bias=config.fuse_linear, + **row_ln_configs, + ) + else: + LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear + self.down_proj = LinearFN( + self.intermediate_size, self.hidden_size, bias_attr=config.use_bias + ) + + self.fuse_swiglu = config.fuse_swiglu + if self.fuse_swiglu: + assert fused_swiglu is not None, "fused_swiglu operator is not found." + + def forward(self, x): + """ + Forward pass through the MLP module. + + Args: + x (Tensor): Input tensor of shape [batch_size, seq_len, hidden_size] + + Returns: + Tensor: Output tensor of shape [batch_size, seq_len, hidden_size] + + Note: + Implements SwiGLU activation: swish(Wx) * (Vx) where W and V are + the first and second halves of up_gate_proj output respectively. + """ + if self.fuse_swiglu: + x = self.up_gate_proj(x) + x = fused_swiglu(x) + else: + gate, x = self.up_gate_proj(x).chunk(2, axis=-1) + x = F.silu(gate) * x + return self.down_proj(x) + + +class Ernie4_5Attention(nn.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config, layer_idx=0): + """Initialize the attention layer. + + Args: + config (PPOCRVLConfig): Model configuration. + layer_idx (int, optional): Index in transformer stack. Defaults to 0. 
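+
+        Builds the fused QKV projection and the output projection (tensor- or
+        sequence-parallel variants when tensor parallelism is enabled) and selects
+        the attention implementation via ``set_attn_func``.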
+ """ + super().__init__() + self.layer_idx = layer_idx + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.num_key_value_heads = config.num_key_value_heads + if getattr(config, "head_dim", None) is None: + self.head_dim = self.hidden_size // self.num_heads + else: + self.head_dim = config.head_dim + self.is_gqa = ( + config.num_key_value_heads is not None + and config.num_key_value_heads != self.num_heads + ) + + self.rope_scaling = config.rope_scaling + + self.freq_allocation = config.get("freq_allocation", 0) + + if config.tensor_parallel_degree > 1: + assert ( + self.num_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_heads = self.num_heads // config.tensor_parallel_degree + if self.is_gqa: + assert ( + self.num_key_value_heads % config.tensor_parallel_degree == 0 + ), f"num_heads: {self.num_key_value_heads}, tensor_parallel_degree: {config.tensor_parallel_degree}" + self.num_key_value_heads = ( + self.num_key_value_heads // config.tensor_parallel_degree + ) + if self.is_gqa: + logging.info( + f"use GQA - num_heads: {self.num_heads}- num_key_value_heads: {self.num_key_value_heads}" + ) + assert ( + self.num_heads % self.num_key_value_heads == 0 + ), f"num_heads: {self.num_heads}, num_key_value_heads: {self.num_key_value_heads}" + if getattr(config, "head_dim", None) is None: + kv_hidden_size = ( + self.hidden_size // self.num_heads * self.num_key_value_heads + ) + else: + kv_hidden_size = self.head_dim * config.num_key_value_heads + q_hidden_size = self.head_dim * config.num_attention_heads + else: + q_hidden_size = kv_hidden_size = self.head_dim * config.num_attention_heads + + if config.tensor_parallel_degree > 1: + column_ln_configs = {} + ColumnLN = ( + ColumnSequenceParallelLinear + if config.sequence_parallel + else ColumnParallelLinear + ) + RowLN = ( + RowSequenceParallelLinear + if config.sequence_parallel + else RowParallelLinear + ) + if ( + config.recompute + and config.sequence_parallel + and config.skip_recompute_ops[layer_idx].get( + "attention_column_ln", False + ) + ): + ColumnLN = RRColumnSequenceParallelLinear + column_ln_configs = {"use_rr": True} + + if getattr(config, "head_dim", None) is None: + qkv_hidden_size = ( + self.hidden_size * 3 + if not self.is_gqa + else self.hidden_size + kv_hidden_size * 2 + ) + else: + qkv_hidden_size = q_hidden_size + kv_hidden_size * 2 + self.qkv_proj = ColumnLN( + self.hidden_size, + qkv_hidden_size, + has_bias=config.use_bias, + gather_output=False, + fuse_matmul_bias=config.fuse_linear, + **column_ln_configs, + ) + else: + LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear + if getattr(config, "head_dim", None) is None: + qkv_hidden_size = ( + self.hidden_size * 3 + if not self.is_gqa + else self.hidden_size + kv_hidden_size * 2 + ) + else: + qkv_hidden_size = q_hidden_size + kv_hidden_size * 2 + self.qkv_proj = LinearFN( + self.hidden_size, + qkv_hidden_size, + bias_attr=config.use_bias, + ) + + if config.tensor_parallel_degree > 1: + row_ln_configs = {} + if ( + config.recompute + and config.sequence_parallel + and config.skip_recompute_ops[layer_idx].get("attention_row_ln", False) + ): + RowLN = RRRowSequenceParallelLinear + row_ln_configs = {"use_rr": True} + + self.o_proj = RowLN( + ( + self.hidden_size + if getattr(config, "head_dim", None) is None + else q_hidden_size + ), + self.hidden_size, + has_bias=config.use_bias, + input_is_parallel=True, + 
fuse_matmul_bias=config.fuse_linear, + **row_ln_configs, + ) + else: + LinearFN = paddle.incubate.nn.FusedLinear if config.fuse_linear else Linear + self.o_proj = LinearFN( + ( + self.hidden_size + if getattr(config, "head_dim", None) is None + else q_hidden_size + ), + self.hidden_size, + bias_attr=config.use_bias, + ) + self.config = config + + self._rr_flash_attn = None + if config.recompute and config.skip_recompute_ops[layer_idx].get( + "flash_attn", False + ): + # TODO + raise NotImplementedError + + self.set_attn_func() + + def set_attn_func(self): + """Configure attention function based on settings. + + Selects between flash/core attention. + """ + config = self.config + if config.use_flash_attention: + self.attn_func = self._flash_attention_wrapper + else: + self.attn_func = self.core_attn + + if config.cachekv_quant: + # TODO: Support `cachekv_quant` + raise NotImplementedError + + def forward( + self, + hidden_states, + position_embeddings, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + attention_mask: Optional[paddle.Tensor] = None, + attn_mask_start_row_indices: Optional[paddle.Tensor] = None, + position_ids: Optional[Tuple[paddle.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + token_type_ids: Optional[Tuple[paddle.Tensor]] = None, # MLLM + ) -> Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[Tuple[paddle.Tensor]]]: + """Compute attention outputs. + + Args: + hidden_states (paddle.Tensor): Input tensor [bsz, seq_len, hidden_size] + position_embeddings (paddle.Tensor): Position embeddings + past_key_value (Optional[Tuple[paddle.Tensor, paddle.Tensor]]): Cached key/value states + attention_mask (Optional[paddle.Tensor]): Attention mask tensor + attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length attention indices + position_ids (Optional[paddle.Tensor]): Position indices for RoPE + output_attentions (bool): Return attention weights if True + use_cache (bool): Cache key/value states if True + + Returns: + Tuple containing: + - attention_output: [bsz, seq_len, hidden_size] + - attention_weights: Optional attention probabilities + - updated_key_value_cache: Optional updated cache + """ + if token_type_ids is not None: + token_type_ids = token_type_ids[:, :-1] + if self.config.sequence_parallel: + if token_type_ids is not None: + token_type_ids = token_type_ids.reshape([-1]) + token_type_ids = ScatterOp.apply(token_type_ids) + token_type_ids.stop_gradient = True + max_sequence_length = self.config.max_sequence_length + bsz = ( + hidden_states.shape[0] + * self.config.tensor_parallel_degree + // max_sequence_length + ) + q_len = max_sequence_length + else: + bsz, q_len, _ = hidden_states.shape + query_states = key_states = value_states = mix_layer = None + mix_layer = self.qkv_proj(hidden_states) + if self.is_gqa: + query_states, key_states, value_states = paddle.split( + mix_layer.reshape([bsz, q_len, -1, self.head_dim]), + [self.num_heads, self.num_key_value_heads, self.num_key_value_heads], + axis=2, + ) + mix_layer = None + else: + mix_layer = mix_layer.reshape( + [bsz, q_len, self.num_heads, 3 * self.head_dim] + ) + + if mix_layer is not None: + has_gradient = not mix_layer.stop_gradient + else: + has_gradient = not ( + query_states.stop_gradient + and key_states.stop_gradient + and value_states.stop_gradient + ) + if ( + self.config.recompute + and self.config.recompute_granularity == "core_attn" + and has_gradient + ): + assert past_key_value is None, "do not use kv cache in recompute" + assert not 
use_cache + attn_output, attn_weights, past_key_value = recompute( + self.rope_attn, + mix_layer, + query_states, + key_states, + value_states, + position_embeddings, + attention_mask, + position_ids, + output_attentions, + past_key_value, + use_cache, + attn_mask_start_row_indices, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + attn_output, attn_weights, past_key_value = self.rope_attn( + mix_layer=mix_layer, + query_states=query_states, + key_states=key_states, + value_states=value_states, + position_embeddings=position_embeddings, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + past_key_value=past_key_value, + use_cache=use_cache, + attn_mask_start_row_indices=attn_mask_start_row_indices, + ) + if self.config.sequence_parallel: + attn_output = attn_output.reshape([-1, attn_output.shape[-1]]) + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + def _flash_attention_wrapper( + self, + q, + k, + v, + attention_mask=None, + attn_mask_start_row_indices=None, + seq_length=None, + ): + """Optimized flash attention implementation. + + Args: + q (paddle.Tensor): Query tensor + k (paddle.Tensor): Key tensor + v (paddle.Tensor): Value tensor + attention_mask (Optional[paddle.Tensor]): Attention mask + attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length indices + seq_length (Optional[int]): Sequence length + + Returns: + paddle.Tensor: Attention output tensor + """ + return fusion_flash_attention( + q, + k, + v, + self.training, + self.config.attention_probs_dropout_prob, + self.config.use_sparse_flash_attn, + attention_mask, + attn_mask_start_row_indices, + seq_length, + self.config.use_var_len_flash_attn, + self._rr_flash_attn if self.training else None, + ) + + def core_attn( + self, + q, + k, + v, + attention_mask=None, + attn_mask_start_row_indices=None, + seq_length=None, + ): + """Standard self-attention implementation. 
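+        Used when flash attention is disabled: repeats key/value heads for GQA,
+        computes the scaled QK product and casts it to float32, applies the
+        (optionally fused) softmax mask or an upper-triangular causal mask, and
+        applies attention dropout before combining heads.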
+ + Args: + q (paddle.Tensor): Query tensor + k (paddle.Tensor): Key tensor + v (paddle.Tensor): Value tensor + attention_mask (Optional[paddle.Tensor]): Attention mask + attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length indices + seq_length (Optional[int]): Sequence length + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: Attention output and weights + """ + perm = [ + 0, + 2, + 1, + 3, + ] # [1, 2, 0, 3] if self.sequence_parallel else [0, 2, 1, 3] + origin_dtype = q.dtype + + q = tensor.transpose(x=q, perm=perm) + k = tensor.transpose(x=k, perm=perm) + v = tensor.transpose(x=v, perm=perm) + + replicate = self.config.num_attention_heads // self.config.num_key_value_heads + k = paddle.repeat_interleave(k, replicate, axis=1) + v = paddle.repeat_interleave(v, replicate, axis=1) + + scale_qk_coeff = self.config.scale_qk_coeff * self.head_dim**0.5 + product = paddle.matmul(x=q.scale(1.0 / scale_qk_coeff), y=k, transpose_y=True) + + product = product.cast(paddle.float32) + if self.config.scale_qk_coeff != 1.0: + product = product.scale(self.config.scale_qk_coeff) + + if attention_mask is not None: + attention_mask = attention_mask.cast(paddle.float32) + if self.config.fuse_softmax_mask: + weights = incubate.softmax_mask_fuse(product, attention_mask) + else: + product = product + attention_mask + weights = F.softmax(product) + else: + weights = incubate.softmax_mask_fuse_upper_triangle(product) + + weights = weights.cast(origin_dtype) + + if self.config.attention_probs_dropout_prob: + with get_rng_state_tracker().rng_state("local_seed"): + weights = F.dropout( + weights, + self.config.attention_probs_dropout_prob, + training=self.training, + mode="upscale_in_train", + ) + + out = paddle.matmul(weights, v) + + # combine heads + out = tensor.transpose(out, perm=[0, 2, 1, 3]) + # If sequence_parallel is true, out shape is [s, b, h] after reshape + # else out shape is [b, s, h] + out = tensor.reshape(x=out, shape=[0, 0, -1]) + + return out, weights + + def rope_attn( + self, + mix_layer, + query_states, + key_states, + value_states, + position_embeddings, + attention_mask, + position_ids, + output_attentions=False, + past_key_value=None, + use_cache=False, + attn_mask_start_row_indices=None, + ): + if mix_layer is not None: + query_states, key_states, value_states = paddle.split(mix_layer, 3, axis=-1) + query_states_dtype = query_states.dtype + + kv_seq_len = position_ids.max() + 1 + offset = 0 + if past_key_value is not None: + # LLM + offset = past_key_value[0].shape[-3] + kv_seq_len += offset + + query_states = query_states.astype(query_states_dtype) + key_states = key_states.astype(query_states_dtype) + + if position_ids.dim() == 3 and position_ids.shape[0] > 1: + position_ids = position_ids[0:1] + + cos, sin = position_embeddings + query_states, key_states = _apply_multimodal_rotary_pos_emb( + query_states, key_states, cos, sin, self.rope_scaling["mrope_section"], 2 + ) + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = paddle.concat([past_key_value[0], key_states], axis=1) + value_states = paddle.concat([past_key_value[1], value_states], axis=1) + + # NOTE(for generation): use list instead of tuple to store the cache + # tensors, so that we can clear the cache tensors for memory efficiency. 
+ past_key_value = [key_states, value_states] if use_cache else None + seq_length = query_states.shape[1] + attn_output, attn_weights = self.attn_func( + query_states, + key_states, + value_states, + attention_mask, + attn_mask_start_row_indices, + seq_length, + ) + return attn_output, attn_weights, past_key_value + + +class FusedHeadParallelCrossEntropy(PyLayer): + """Fused parallel cross-entropy loss computation for large sequence lengths. + + Combines head projection and loss computation with optimized memory usage for long sequences, + supporting tensor parallel training. + """ + + @staticmethod + def forward( + ctx, + hidden_states, + weight, + bias, + labels, + tensor_parallel_degree, + mp_group=None, + ignore_index=-100, + seq_chunk_size=8192, + transpose_y=False, + fuse_linear=False, + training=True, + ): + """Forward pass for parallel cross-entropy computation. + + Args: + ctx: Context object for saving tensors between forward/backward + hidden_states (paddle.Tensor): Input tensor of shape [batch_size*seq_len, hidden_size] + weight (paddle.Tensor): Weight matrix for projection + bias (Optional[paddle.Tensor]): Optional bias vector + labels (paddle.Tensor): Target labels tensor of shape [batch_size*seq_len] + tensor_parallel_degree (int): Degree of tensor parallelism + mp_group (Optional[dist.Group]): Model parallel group. Defaults to None (auto-detect) + ignore_index (int): Index to ignore in loss computation. Defaults to -100 + seq_chunk_size (int): Chunk size for processing long sequences. Defaults to 8192 + transpose_y (bool): Whether to transpose weight matrix. Defaults to False + fuse_linear (bool): Whether to use fused linear ops. Defaults to False + training (bool): Whether in training mode. Defaults to True + + Returns: + Tuple[paddle.Tensor, paddle.Tensor]: + - loss: Computed loss tensor + - gathered_labels: Concatenated labels from all parallel groups + """ + + ctx.tensor_parallel_degree = tensor_parallel_degree + ctx.ignore_index = ignore_index + ctx.seq_chunk_size = seq_chunk_size + ctx.transpose_y = transpose_y + ctx.fuse_linear = fuse_linear + ctx.training = training + + ctx.hidden_states_shape = hidden_states.shape + + ctx.mp_group = ( + fleet.get_hybrid_communicate_group().get_model_parallel_group() + if mp_group is None + else mp_group + ) + ctx.rank = ctx.mp_group.rank + ctx.world_size = ctx.mp_group.nranks + + loss_all = [] + labels_all = [] + with paddle.no_grad(): + labels = labels.reshape_([-1]) + hidden_states = hidden_states.reshape_([-1, hidden_states.shape[-1]]) + + num_tokens_per_rank = [] + dist.stream.all_gather( + num_tokens_per_rank, + paddle.to_tensor(hidden_states.shape[0], dtype=paddle.int32), + group=ctx.mp_group, + ) + ctx.num_tokens_per_rank = num_tokens_per_rank + + for idx in range(ctx.world_size): + if idx == ctx.rank: + hidden_states_recv = hidden_states + labels_recv = labels + else: + hidden_states_recv = paddle.empty( + [ctx.num_tokens_per_rank[idx], hidden_states.shape[-1]], + dtype=hidden_states.dtype, + ) + labels_recv = paddle.empty( + [ctx.num_tokens_per_rank[idx]], dtype=labels.dtype + ) + + dist.stream.broadcast( + hidden_states_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group + ) + dist.stream.broadcast( + labels_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group + ) + + seq_len = hidden_states_recv.shape[0] + num_chunk = (seq_len + ctx.seq_chunk_size - 1) // ctx.seq_chunk_size + + loss_chunk = [] + for chunk_idx in range(num_chunk): + start = chunk_idx * ctx.seq_chunk_size + end = min(start + ctx.seq_chunk_size, seq_len) 
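+                    # Project each chunk through the LM head and compute the parallel
+                    # softmax cross-entropy in float32, so peak memory stays bounded
+                    # by `seq_chunk_size`.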
+ hidden_states_chunk = hidden_states_recv._slice(start, end) + labels_chunk = labels_recv._slice(start, end) + + logits = parallel_matmul( + hidden_states_chunk, + weight, + bias=bias, + transpose_y=ctx.transpose_y, + tensor_parallel_degree=ctx.tensor_parallel_degree, + tensor_parallel_output=True, + fuse_linear=ctx.fuse_linear, + training=ctx.training, + ) + + with paddle.amp.auto_cast(False): + loss = mp_ops._c_softmax_with_cross_entropy( + logits.cast("float32"), + labels_chunk.unsqueeze(-1), + group=ctx.mp_group, + ignore_index=ctx.ignore_index, + ) + loss_chunk.append(loss) + loss_all.append(paddle.concat(loss_chunk, axis=0)) + labels_all.append(labels_recv) + + ctx.loss_concat_sections = [loss.shape[0] for loss in loss_all] + loss_all = paddle.concat(loss_all, axis=0) + labels_all = paddle.concat(labels_all, axis=0) + + tensor_inputs = [hidden_states, weight, bias, labels] + ctx.save_for_backward(*tensor_inputs) + + return loss_all, labels_all + + @staticmethod + def backward(ctx, loss_all_grad, labels_all_grad): + """Backward pass for parallel cross-entropy computation. + + Args: + ctx: Context object with saved tensors from forward + loss_all_grad (paddle.Tensor): Gradient of loss + labels_all_grad (paddle.Tensor): Gradient of labels (unused) + + Returns: + Tuple[paddle.Tensor, Optional[paddle.Tensor], Optional[paddle.Tensor], None]: + - hidden_states_grad: Gradient for input hidden states + - weight_grad: Gradient for weight matrix (None if not trainable) + - bias_grad: Gradient for bias vector (None if not trainable or not provided) + - None: Placeholder for labels gradient + """ + + hidden_states, weight, bias, labels = ctx.saved_tensor() + + loss_all_grad_list = paddle.split( + loss_all_grad, ctx.loss_concat_sections, axis=0 + ) + + def detach_variable(inp): + if inp is None: + return None + x = inp.detach() + x.stop_gradient = inp.stop_gradient + return x + + if weight.stop_gradient is False: + weight_main_grad = paddle.zeros(weight.shape, dtype=paddle.float32) + else: + weight_main_grad = None + if bias is not None and bias.stop_gradient is False: + bias_main_grad = paddle.zeros(bias.shape, dtype=paddle.float32) + else: + bias_main_grad = None + + hidden_states = detach_variable(hidden_states) + weight = detach_variable(weight) + bias = detach_variable(bias) + labels = detach_variable(labels) + + with paddle.base.dygraph.guard(): + tracer = paddle.base.framework._dygraph_tracer() + tracer._has_grad = True + + for idx in range(ctx.world_size): + if idx == ctx.rank: + hidden_states_recv = hidden_states + labels_recv = labels + else: + hidden_states_recv = paddle.empty( + [ctx.num_tokens_per_rank[idx], hidden_states.shape[-1]], + dtype=hidden_states.dtype, + ) + labels_recv = paddle.empty( + [ctx.num_tokens_per_rank[idx]], dtype=labels.dtype + ) + + dist.stream.broadcast( + hidden_states_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group + ) + dist.stream.broadcast( + labels_recv, src=ctx.mp_group.ranks[idx], group=ctx.mp_group + ) + hidden_states_recv.stop_gradient = False + + seq_len = hidden_states_recv.shape[0] + num_chunk = (seq_len + ctx.seq_chunk_size - 1) // ctx.seq_chunk_size + + for chunk_idx in range(num_chunk): + start = chunk_idx * ctx.seq_chunk_size + end = min(start + ctx.seq_chunk_size, seq_len) + hidden_states_chunk = hidden_states_recv.slice( + axes=[0], starts=[start], ends=[end] + ) + labels_chunk = labels_recv._slice(start, end) + loss_grad_chunk = loss_all_grad_list[idx]._slice(start, end) + + logits = parallel_matmul( + hidden_states_chunk, + weight, + 
bias=bias,
+                        transpose_y=ctx.transpose_y,
+                        tensor_parallel_degree=ctx.tensor_parallel_degree,
+                        tensor_parallel_output=True,
+                        fuse_linear=ctx.fuse_linear,
+                        training=ctx.training,
+                    )
+
+                    with paddle.amp.auto_cast(False):
+                        loss_chunk = mp_ops._c_softmax_with_cross_entropy(
+                            logits.cast("float32"),
+                            labels_chunk.unsqueeze(-1),
+                            group=ctx.mp_group,
+                            ignore_index=ctx.ignore_index,
+                        )
+
+                    with paddle.amp.auto_cast(enable=False):
+                        paddle.autograd.backward(loss_chunk, loss_grad_chunk)
+
+                    if weight_main_grad is not None:
+                        weight_main_grad.add_(weight.grad.cast(paddle.float32))
+                        weight.clear_gradient(True)
+                    if bias_main_grad is not None:
+                        bias_main_grad.add_(bias.grad.cast(paddle.float32))
+                        bias.clear_gradient(True)
+
+                if idx == ctx.rank:
+                    hidden_states_grad = hidden_states_recv.grad
+                    hidden_states_grad = hidden_states_grad.reshape(
+                        ctx.hidden_states_shape
+                    )
+
+        if weight_main_grad is not None:
+            weight_main_grad = weight_main_grad.astype(weight.dtype)
+        if bias_main_grad is not None:
+            bias_main_grad = bias_main_grad.astype(bias.dtype)
+
+        return (
+            hidden_states_grad,
+            weight_main_grad,
+            bias_main_grad,
+            None,
+        )
+
+
+class ErniePretrainingCriterion(paddle.nn.Layer):
+    """Criterion for the ERNIE pretraining task."""
+
+    def __init__(self, config, return_tuple=True):
+        """Initialize the pretraining criterion.
+
+        Args:
+            config (PPOCRVLConfig): Model configuration.
+            return_tuple (bool): Whether to return the loss as a tuple (loss, loss_sum). Defaults to True.
+        """
+        super(ErniePretrainingCriterion, self).__init__()
+        self.ignored_index = getattr(config, "ignored_index", -100)
+        self.config = config
+        self.return_tuple = return_tuple
+        self.enable_parallel_cross_entropy = (
+            config.tensor_parallel_degree > 1 and config.tensor_parallel_output
+        )
+
+        if self.enable_parallel_cross_entropy:  # lm_head is distributed
+            logging.info("using parallel cross entropy, take care")
+            self.loss_func = ParallelCrossEntropy()
+        else:
+            self.loss_func = paddle.nn.CrossEntropyLoss(
+                reduction="none",
+            )
+        self.token_balance_loss = config.token_balance_loss
+
+    def forward(self, prediction_scores, masked_lm_labels, loss_mask=None):
+        """Compute the pretraining loss.
+
+        Args:
+            prediction_scores (Union[paddle.Tensor, Tuple[paddle.Tensor, ...]]):
+                Either:
+                - Direct logits tensor [batch_size, seq_len, vocab_size]
+                - Tuple of (hidden_states, weight, bias) for sparse head computation
+            masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+            loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens. Defaults to None.
+ + Returns: + Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]: + - If return_tuple=False: Single loss tensor + - If return_tuple=True: Tuple of (normalized_loss, sum_loss) + """ + + if self.config.use_sparse_head_and_loss_fn: + hidden_states, outlinear_weight, outlinear_bias = prediction_scores[:3] + + if self.config.sequence_parallel: + masked_lm_labels, sparse_label_idx = ( + sequence_parallel_sparse_mask_labels( + masked_lm_labels, self.ignored_index + ) + ) + sparse_label_idx = sparse_label_idx.reshape([-1, 1]) + hidden_states = paddle.gather(hidden_states, sparse_label_idx, axis=0) + hidden_states = AllGatherVarlenOp.apply(hidden_states) + else: + masked_lm_labels = masked_lm_labels.flatten() + sparse_label_idx = paddle.nonzero( + masked_lm_labels != self.ignored_index + ).flatten() + masked_lm_labels = paddle.take_along_axis( + masked_lm_labels, sparse_label_idx, axis=0 + ) + + hidden_states = hidden_states.reshape([-1, hidden_states.shape[-1]]) + hidden_states = paddle.take_along_axis( + hidden_states, sparse_label_idx.reshape([-1, 1]), axis=0 + ) + + # `loss_mask` must be reset to None and re-calculate it in ErnieBotPretrainingCriterion + # when use use_sparse_head_and_loss_fn. + loss_mask = None + if self.config.use_recompute_loss_fn: + offload_kwargs = {} + if self.config.get("offload_lm_head", False): + offload_kwargs["offload_indices"] = [1] + res = recompute( + self.forward_impl_with_calc_logits, + masked_lm_labels, + loss_mask, + hidden_states, + outlinear_weight, + outlinear_bias, + **offload_kwargs, + ) + else: + logits = calc_lm_head_logits( + self.config, + hidden_states, + outlinear_weight, + outlinear_bias, + training=self.training, + ) + res = self.forward_impl(logits, masked_lm_labels, loss_mask) + elif self.config.use_recompute_loss_fn: + if self.config.use_fused_head_and_loss_fn: + res = self.forward_impl_with_fused_head_loss_fn( + masked_lm_labels, loss_mask, *prediction_scores + ) + else: + assert isinstance(prediction_scores, tuple) and len( + prediction_scores + ) in [3, 4], prediction_scores + res = recompute( + self.forward_impl_with_calc_logits, + masked_lm_labels, + loss_mask, + *prediction_scores, + ) + else: + res = self.forward_impl(prediction_scores, masked_lm_labels, loss_mask) + + return res + + def forward_impl_with_fused_head_loss_fn( + self, + masked_lm_labels, + loss_mask, + hidden_states, + outlinear_weight, + outlinear_bias, + ): + """Compute loss with fused head and parallel cross-entropy. 
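+
+        Delegates to `FusedHeadParallelCrossEntropy`, which gathers the tokens held by
+        each rank of the model-parallel group, projects them with the LM head in
+        `loss_subbatch_seqlen`-sized chunks, and computes the parallel softmax
+        cross-entropy without materializing the full logits tensor at once.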
+
+        Args:
+            masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+            loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens
+            hidden_states (paddle.Tensor): Hidden states from transformer [batch_size, seq_len, hidden_size]
+            outlinear_weight (paddle.Tensor): Weight matrix for output projection
+            outlinear_bias (Optional[paddle.Tensor]): Optional bias for output projection
+
+        Returns:
+            Union[paddle.Tensor, Tuple[paddle.Tensor, paddle.Tensor]]:
+                Same return format as forward()
+        """
+        assert (
+            self.config.tensor_parallel_degree > 0
+        ), "use_fused_head_and_loss_fn requires tensor_parallel_degree > 0"
+        masked_lm_loss, masked_lm_labels_all = FusedHeadParallelCrossEntropy.apply(
+            hidden_states,
+            outlinear_weight,
+            outlinear_bias,
+            masked_lm_labels,
+            self.config.tensor_parallel_degree,
+            ignore_index=self.ignored_index,
+            seq_chunk_size=self.config.get("loss_subbatch_seqlen", 32768),
+            transpose_y=self.config.tie_word_embeddings,
+            fuse_linear=self.config.fuse_linear,
+            training=self.training,
+        )
+        if loss_mask is None:
+            loss_mask = masked_lm_labels_all != self.ignored_index
+        if (~loss_mask).all():  # empty span
+            logging.warning(
+                f"encountered an empty span when computing the loss, ignored_index={self.ignored_index}"
+            )
+            loss = paddle.mean(masked_lm_loss) * 0.0
+            loss_sum = masked_lm_loss.sum().detach()
+        else:
+            loss_mask = loss_mask.reshape([-1]).cast(paddle.float32)
+            # align token by token and aggregate in full precision
+            masked_lm_loss = paddle.sum(
+                masked_lm_loss.cast(paddle.float32).reshape([-1]) * loss_mask
+            )
+            loss = masked_lm_loss / loss_mask.sum()
+            if self.token_balance_loss:
+                _loss = masked_lm_loss / self.config.token_balance_seqlen
+                loss = _loss - _loss.detach() + loss.detach()  # for loss-curve alignment
+            loss_sum = masked_lm_loss.sum().detach()
+        if not self.return_tuple:  # only used in pp
+            if self.training:
+                return loss
+            return loss_sum
+        return loss, loss_sum
+
+    def forward_impl_with_calc_logits(
+        self,
+        masked_lm_labels,
+        loss_mask,
+        hidden_states,
+        outlinear_weight,
+        outlinear_bias,
+    ):
+        """Compute logits, then calculate the loss.
+
+        Args:
+            Same as forward_impl_with_fused_head_loss_fn()
+
+        Returns:
+            Same return format as forward()
+        """
+
+        logits = calc_lm_head_logits(
+            self.config,
+            hidden_states,
+            outlinear_weight,
+            outlinear_bias,
+            training=self.training,
+        )
+
+        return self.forward_impl(logits, masked_lm_labels, loss_mask)
+
+    def loss_impl(self, prediction_scores, masked_lm_labels):
+        """Core loss computation without reduction.
+
+        Args:
+            prediction_scores (paddle.Tensor): Logits tensor [batch_size, seq_len, vocab_size]
+            masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+
+        Returns:
+            paddle.Tensor: Unreduced loss tensor
+        """
+        prediction_scores = prediction_scores.cast("float32")
+        masked_lm_loss = self.loss_func(
+            prediction_scores, masked_lm_labels.unsqueeze(-1)
+        )
+        return masked_lm_loss
+
+    def forward_impl(self, prediction_scores, masked_lm_labels, loss_mask=None):
+        """Standard loss computation with reduction and masking.
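+
+        When the logits' sequence dimension exceeds `loss_subbatch_seqlen`, the loss is
+        evaluated in sub-batches via `subbatch` to bound the size of the float32 copy;
+        otherwise `loss_impl` is applied to the whole tensor.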
+
+        Args:
+            prediction_scores (paddle.Tensor): Logits tensor [batch_size, seq_len, vocab_size]
+            masked_lm_labels (paddle.Tensor): Target labels tensor [batch_size, seq_len]
+            loss_mask (Optional[paddle.Tensor]): Optional mask for valid tokens
+
+        Returns:
+            Same return format as forward()
+        """
+        if self.enable_parallel_cross_entropy:
+            assert prediction_scores.shape[-1] != self.config.vocab_size, (
+                f"enable_parallel_cross_entropy requires the vocab_size to be split:"
+                f" {prediction_scores.shape[-1]}, {self.config.vocab_size}"
+            )
+
+        with paddle.amp.auto_cast(False):
+            prediction_scores_dims = len(prediction_scores.shape)
+            if prediction_scores_dims == 2 and prediction_scores.shape[
+                0
+            ] > self.config.get("loss_subbatch_seqlen", 32768):
+                sb_loss_func = subbatch(
+                    self.loss_impl,
+                    [0, 1],
+                    [0, 0],
+                    self.config.get("loss_subbatch_seqlen", 32768),
+                    0,
+                )
+                masked_lm_loss = sb_loss_func(prediction_scores, masked_lm_labels)
+            elif prediction_scores_dims == 3 and prediction_scores.shape[
+                1
+            ] > self.config.get("loss_subbatch_seqlen", 32768):
+                sb_loss_func = subbatch(
+                    self.loss_impl,
+                    [0, 1],
+                    [1, 1],
+                    self.config.get("loss_subbatch_seqlen", 32768),
+                    1,
+                )
+                masked_lm_loss = sb_loss_func(prediction_scores, masked_lm_labels)
+            else:
+                masked_lm_loss = self.loss_impl(prediction_scores, masked_lm_labels)
+
+        if loss_mask is None:
+            loss_mask = masked_lm_labels != self.ignored_index
+
+        lossmask = masked_lm_labels != self.ignored_index
+        if (~lossmask).all():  # empty span
+            logging.warning(
+                f"encountered an empty span when computing the loss, ignored_index={self.ignored_index}"
+            )
+            loss = paddle.mean(masked_lm_loss) * 0.0
+            loss_sum = masked_lm_loss.sum().detach()
+        else:
+            loss_mask = loss_mask.reshape([-1]).cast(paddle.float32)
+            # align token by token and aggregate in full precision
+            masked_lm_loss = paddle.sum(
+                masked_lm_loss.cast(paddle.float32).reshape([-1]) * loss_mask
+            )
+            loss = masked_lm_loss / loss_mask.sum()
+            if self.token_balance_loss:
+                _loss = masked_lm_loss / self.config.token_balance_seqlen
+                loss = _loss - _loss.detach() + loss.detach()  # for loss-curve alignment
+            loss_sum = masked_lm_loss.sum().detach()
+        if not self.return_tuple:  # only used in pp
+            if self.training:
+                return loss
+            return loss_sum
+        return loss, loss_sum
+
+
+class Ernie4_5LMHead(nn.Layer):
+    """Language model head for ERNIE with support for tensor parallelism."""
+
+    def __init__(self, config):
+        """Initialize the language model head.
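+
+        Under tensor parallelism the vocabulary dimension is split across ranks, so the
+        local parameter holds `vocab_size // tensor_parallel_degree` output units; the
+        weight is stored as [vocab_size, hidden_size] when `tie_word_embeddings` is set
+        and as [hidden_size, vocab_size] otherwise.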
+ + Args: + config (PPOCRVLConfig): Model configuration containing: + - vocab_size: Size of vocabulary + - hidden_size: Dimension of hidden states + - tensor_parallel_degree: Degree of tensor parallelism + - tie_word_embeddings: Whether to tie input/output embeddings + - weight_share_add_bias: Whether to add bias when weight sharing + - use_bias: Whether to use bias term + - use_recompute_loss_fn: Whether to defer logits computation to loss function + - use_sparse_head_and_loss_fn: Whether to use sparse head computation + """ + + super(Ernie4_5LMHead, self).__init__() + self.config = config + if config.tensor_parallel_degree > 1: + vocab_size = config.vocab_size // config.tensor_parallel_degree + else: + vocab_size = config.vocab_size + + self.weight = self.create_parameter( + shape=( + [vocab_size, config.hidden_size] + if config.tie_word_embeddings + else [config.hidden_size, vocab_size] + ), + dtype=paddle.get_default_dtype(), + ) + logging.info( + f"output-weight:{self.weight.shape} config.tie_word_embeddings={config.tie_word_embeddings}" + ) + if config.weight_share_add_bias and config.use_bias: + self.bias = self.create_parameter( + shape=[vocab_size], + dtype=paddle.get_default_dtype(), + attr=paddle.ParamAttr( + initializer=paddle.nn.initializer.constant.Constant(0.0) + ), + ) + else: + self.bias = None + + # Must set distributed attr for Tensor Parallel ! + self.weight.is_distributed = ( + True if (vocab_size != config.vocab_size) else False + ) + if config.weight_share_add_bias and config.use_bias: + self.bias.is_distributed = ( + True if (vocab_size != config.vocab_size) else False + ) + + if self.weight.is_distributed: + self.weight.split_axis = 1 + if ( + config.weight_share_add_bias + and config.use_bias + and self.bias.is_distributed + ): + self.bias.split_axis = 0 + + if self.config.use_recompute_loss_fn: + logging.info( + "Using recompute_loss_fn, the calculation of logits will be moved into " + "loss_fn for memory optimization" + ) + + def forward(self, hidden_states, tensor_parallel_output=None): + """Project hidden states to vocabulary logits. + + Args: + hidden_states (paddle.Tensor): Input tensor of shape [batch_size, seq_len, hidden_size] + tensor_parallel_output (Optional[bool]): Whether to output parallel results. Defaults to None. + + Returns: + Union[ + Tuple[paddle.Tensor, paddle.Tensor, Optional[paddle.Tensor]]: + # When use_recompute_loss_fn or use_sparse_head_and_loss_fn + - hidden_states: Original input + - weight: Projection weights + - bias: Optional bias term + Tuple[paddle.Tensor, paddle.Tensor, Optional[paddle.Tensor], bool]: # With tensor_parallel_output + Same as above plus tensor_parallel_output flag + paddle.Tensor: # Normal case + Logits tensor of shape [batch_size, seq_len, vocab_size] + ] + """ + # will enter this branch when: + # 1. use_recompute_loss_fn or use_sparse_head_and_loss_fn + # 2. dpo training + if self.config.use_recompute_loss_fn or self.config.use_sparse_head_and_loss_fn: + return ( + hidden_states, + self.weight, + self.bias, + self.config.tie_word_embeddings, + ) + + return calc_lm_head_logits( + self.config, + hidden_states, + self.weight, + self.bias, + tensor_parallel_output, + training=self.training, + ) + + +class Ernie4_5DecoderLayer(nn.Layer): + """A single transformer decoder layer in ERNIE model. + + Contains self-attention and feed-forward components, + support, residual connections, and layer normalization. + """ + + def __init__(self, config, layer_idx): + """Initialize the decoder layer. 
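+
+        The layer uses a pre-norm layout: `input_layernorm` -> self-attention -> fused
+        dropout + residual add, then `post_attention_layernorm` -> MLP -> fused
+        dropout + residual add, with RMSNorm or LayerNorm selected by `config.use_rmsnorm`.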
+ + Args: + config (PPOCRVLConfig): Model configuration. + layer_idx (int): Index of this layer in the transformer stack + """ + super().__init__() + self.hidden_size = config.hidden_size + self.layer_idx = layer_idx + self.config = config + + self.self_attn = Ernie4_5Attention(config, layer_idx) + self.mlp = Ernie4_5MLP(config) + + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + + self.input_layernorm = Norm(config) + self.post_attention_layernorm = Norm(config) + + self.residual_add1 = FusedDropoutImpl( + config.hidden_dropout_prob, mode="upscale_in_train" + ) + self.residual_add2 = FusedDropoutImpl( + config.hidden_dropout_prob, mode="upscale_in_train" + ) + + if config.sequence_parallel: + mark_as_sequence_parallel_parameter(self.post_attention_layernorm.weight) + if not hasattr(config, "disable_ffn_model_parallel"): + mark_as_sequence_parallel_parameter(self.input_layernorm.weight) + if config.use_bias: + mark_as_sequence_parallel_parameter(self.self_attn.o_proj.bias) + mark_as_sequence_parallel_parameter(self.mlp.down_proj.bias) + + if not config.use_rmsnorm and config.use_bias: + mark_as_sequence_parallel_parameter(self.post_attention_layernorm.bias) + mark_as_sequence_parallel_parameter(self.input_layernorm.bias) + + def forward( + self, + hidden_states: paddle.Tensor, + position_embeddings: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + attn_mask_start_row_indices: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + token_type_ids: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + past_key_value: Optional[Tuple[paddle.Tensor]] = None, + use_cache: Optional[bool] = False, + ) -> Tuple[paddle.Tensor, Optional[Tuple[paddle.Tensor, paddle.Tensor]]]: + """Forward pass through the decoder layer. 
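+
+        When `config.recompute` is enabled with `recompute_granularity == "full_attn"`
+        and the input requires gradients, only the self-attention block is wrapped in
+        recompute (gradient checkpointing); the MLP and residual additions run normally.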
+ + Args: + hidden_states (paddle.Tensor): Input tensor [batch_size, seq_len, hidden_size] + position_embeddings (paddle.Tensor): Position embeddings + attention_mask (Optional[paddle.Tensor]): Attention mask tensor + attn_mask_start_row_indices (Optional[paddle.Tensor]): Indices for variable length attention + position_ids (Optional[paddle.Tensor]): Position indices for rotary embeddings + output_attentions (Optional[bool]): Whether to return attention weights + past_key_value (Optional[Tuple[paddle.Tensor]]): Cached key/value states + use_cache (Optional[bool]): Whether to cache key/value states + + Returns: + Union: Various output combinations depending on arguments: + - Base case: Hidden states tensor + - With attention: Tuple of (hidden_states, attention_weights) + - With cache: Tuple of (hidden_states, cached_key_value) + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + # Self Attention + has_gradient = not hidden_states.stop_gradient + if ( + self.config.recompute + and self.config.recompute_granularity == "full_attn" + and has_gradient + ): + hidden_states, self_attn_weights, present_key_value = recompute( + self.self_attn, + hidden_states, + position_embeddings, + past_key_value, + attention_mask, + attn_mask_start_row_indices, + position_ids, + output_attentions, + use_cache, + use_reentrant=self.config.recompute_use_reentrant, + ) + else: + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + position_embeddings=position_embeddings, + past_key_value=past_key_value, + attention_mask=attention_mask, + attn_mask_start_row_indices=attn_mask_start_row_indices, + position_ids=position_ids, + output_attentions=output_attentions, + use_cache=use_cache, + token_type_ids=token_type_ids, + ) + + with self.model_parallel_dropout(): + hidden_states = self.residual_add1(hidden_states, residual) + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + + with self.model_parallel_dropout(): + hidden_states = self.residual_add2(hidden_states, residual) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + # remove empty tuple for pipeline parallel + if type(outputs) is tuple and len(outputs) == 1: + outputs = outputs[0] + return outputs + + def model_parallel_dropout(self): + """Get context manager for model-parallel dropout with proper seed control. + + Returns: + Context manager for dropout operation + """ + if ( + self.config.tensor_parallel_degree > 1 + and self.config.hidden_dropout_prob > 0.0 + ): + current_seed = ( + "local_seed" if self.config.sequence_parallel else "global_seed" + ) + return get_rng_state_tracker().rng_state(current_seed) + return contextlib.nullcontext() + + +class Ernie4_5PretrainedModel(PretrainedModel): + """Base class for ERNIE pretrained models.""" + + config_class = PPOCRVLConfig + base_model_prefix = "ernie" + + @classmethod + def _get_tensor_parallel_mappings(cls, config, is_split=True): + """Generate tensor parallel mappings for model conversion. + + Args: + config (PPOCRVLConfig): Model configuration. + is_split (bool): Whether to generate split mappings (True) + or merge mappings (False). Defaults to True. + + Returns: + Dict[str, Callable[[Any], Any]]: Dictionary mapping parameter names + to their corresponding split/merge functions for tensor parallelism. 
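+
+        Example (illustrative subset; the full mapping is expanded per layer below):
+            "layers.0.self_attn.qkv_proj.weight" -> GQA-aware column split
+            "layers.0.self_attn.o_proj.weight"   -> row split
+            "embed_tokens.weight"                -> row split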
+ """ + + from ..conversion_utils import split_or_merge_func + + fn = split_or_merge_func( + is_split=is_split, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + ) + + def gqa_qkv_split_func( + weight, + tensor_parallel_degree, + tensor_parallel_rank, + num_attention_heads, + num_key_value_heads, + head_dim, + is_quant=False, + is_split=True, + ): + if is_quant: + weight = weight.T + + def get_shape(tensor): + return ( + tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape + ) + + def slice_tensor(tensor, start, end): + shape = get_shape(tensor) + if len(shape) == 1: + return tensor[start:end] + else: + return tensor[..., start:end] + + q_end = num_attention_heads * head_dim + k_end = q_end + num_key_value_heads * head_dim + v_end = k_end + num_key_value_heads * head_dim + + q = slice_tensor(weight, 0, q_end) + k = slice_tensor(weight, q_end, k_end) + v = slice_tensor(weight, k_end, v_end) + + def split_tensor(tensor, degree): + shape = get_shape(tensor) + size = shape[-1] + block_size = size // degree + if hasattr(tensor, "get_shape"): + return [ + slice_tensor(tensor, i * block_size, (i + 1) * block_size) + for i in range(degree) + ] + else: + return np.split(tensor, degree, axis=-1) + + q_list = split_tensor(q, tensor_parallel_degree) + k_list = split_tensor(k, tensor_parallel_degree) + v_list = split_tensor(v, tensor_parallel_degree) + + if tensor_parallel_rank is None: + out = [ + np.concatenate([q_i, k_i, v_i], axis=-1) + for q_i, k_i, v_i in zip(q_list, k_list, v_list) + ] + else: + out = np.concatenate( + [ + q_list[tensor_parallel_rank], + k_list[tensor_parallel_rank], + v_list[tensor_parallel_rank], + ], + axis=-1, + ) + if is_quant: + out = out.T + return out + + def gqa_qkv_merge_func( + weight_list, + num_attention_heads, + num_key_value_heads, + head_dim, + is_quant=False, + is_split=False, + ): + tensor_parallel_degree = len(weight_list) + num_attention_heads = num_attention_heads // tensor_parallel_degree + num_key_value_heads = num_key_value_heads // tensor_parallel_degree + + is_paddle_tensor = not isinstance(weight_list[0], np.ndarray) + + def get_shape(tensor): + return ( + tensor.get_shape() if hasattr(tensor, "get_shape") else tensor.shape + ) + + def slice_tensor(tensor, start, end): + if len(get_shape(tensor)) == 1: + return tensor[start:end] + else: + return tensor[..., start:end] + + q_list, k_list, v_list = [], [], [] + + for weight in weight_list: + if is_quant: + weight = weight.T + q_end = num_attention_heads * head_dim + k_end = q_end + num_key_value_heads * head_dim + v_end = k_end + num_key_value_heads * head_dim + + q = slice_tensor(weight, 0, q_end) + k = slice_tensor(weight, q_end, k_end) + v = slice_tensor(weight, k_end, v_end) + + q_list.append(q) + k_list.append(k) + v_list.append(v) + + merged = q_list + k_list + v_list + + if is_paddle_tensor: + tensor = paddle.concat(merged, axis=-1) + if tensor.place.is_gpu_place(): + tensor = tensor._copy_to(paddle.CUDAPinnedPlace(), False) + + else: + tensor = np.concatenate(merged, axis=-1) + if is_quant: + tensor = tensor.T + return tensor + + if ( + config.num_key_value_heads is not None + and config.num_key_value_heads != config.num_attention_heads + ): + if is_split: + qkv_fn = partial( + gqa_qkv_split_func, + tensor_parallel_degree=config.tensor_parallel_degree, + tensor_parallel_rank=config.tensor_parallel_rank, + num_attention_heads=config.num_attention_heads, + 
num_key_value_heads=config.num_key_value_heads, + head_dim=( + config.hidden_size // config.num_attention_heads + if config.head_dim is None + else config.head_dim + ), + is_quant=False, + is_split=True, + ) + else: + qkv_fn = partial( + gqa_qkv_merge_func, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + head_dim=( + config.hidden_size // config.num_attention_heads + if config.head_dim is None + else config.head_dim + ), + is_quant=False, + is_split=False, + ) + else: + qkv_fn = partial(fn, is_column=True) + + def get_tensor_parallel_split_mappings(num_hidden_layers): + final_actions = {} + + base_actions = { + # Column Linear + "layers.0.self_attn.qkv_proj.weight": qkv_fn, + "layers.0.mlp.up_gate_proj.weight": partial( + fn, is_column=True, is_naive_2fuse=True + ), + "lm_head.weight": partial(fn, is_column=not config.tie_word_embeddings), + # Row Linear + "embed_tokens.weight": partial(fn, is_column=False), + "layers.0.self_attn.o_proj.weight": partial(fn, is_column=False), + "layers.0.mlp.down_proj.weight": partial(fn, is_column=False), + } + + if config.use_bias: + base_actions.update( + { + # Column Linear + "layers.0.self_attn.qkv_proj.bias": qkv_fn, + "layers.0.mlp.up_gate_proj.bias": partial( + fn, is_column=True, is_naive_2fuse=True + ), + "layers.0.mlp.down_proj.bias": lambda x: x[ + : + ], # convert PySafeSlice to ndarray. + "lm_head.bias": partial(fn, is_column=True), + } + ) + + for key, action in base_actions.items(): + if "layers.0." in key: + for i in range(num_hidden_layers): + final_actions[key.replace("layers.0.", f"layers.{i}.")] = action + else: + final_actions[key] = action + return final_actions + + mappings = get_tensor_parallel_split_mappings(config.num_hidden_layers) + return mappings + + +class Ernie4_5Model(Ernie4_5PretrainedModel): + """The core ERNIE transformer model""" + + def __init__(self, config: PPOCRVLConfig): + """Initialize the ERNIE model architecture. + + Args: + config (PPOCRVLConfig): Model configuration. + """ + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + self.config = config + + if config.tensor_parallel_degree > 1: + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + self.hidden_size, + ) + else: + self.embed_tokens = nn.Embedding( + self.vocab_size, + self.hidden_size, + ) + + self.layers = nn.LayerList( + [Ernie4_5DecoderLayer(config, i) for i in range(config.num_hidden_layers)] + ) + Norm = RMSNorm if config.use_rmsnorm else LayerNorm + self.norm = Norm(config) + self.rotary_emb = KeyeRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + + def get_input_embeddings(self): + """Get the input embedding layer. + + Returns: + nn.Embedding: The embedding layer for input tokens + """ + return self.embed_tokens + + def set_input_embeddings(self, value): + """Set new input embeddings. + + Args: + value (nn.Embedding): New embedding layer to use + """ + self.embed_tokens = value + + @paddle.jit.not_to_static + def recompute_training( + self, + layer_module, + hidden_states, + position_embeddings, + attention_mask, + attn_mask_start_row_indices, + position_ids, + token_type_ids, + output_attentions, + past_key_value, + use_cache, + ): + """Perform gradient checkpointing for memory-efficient training. 
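+
+        The wrapped layer is re-executed during the backward pass instead of storing its
+        activations. Note that the recomputed call passes `output_gate_logits=False` to
+        the layer.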
+ + Args: + layer_module (nn.Layer): Transformer layer to recompute + hidden_states (paddle.Tensor): Input hidden states + position_embeddings (paddle.Tensor): Position embeddings + attention_mask (paddle.Tensor): Attention mask + attn_mask_start_row_indices (paddle.Tensor): Variable length indices + position_ids (paddle.Tensor): Position indices + output_attentions (bool): Whether to output attention weights + past_key_value (Optional[Tuple[paddle.Tensor]]): Cached key/value states + use_cache (bool): Whether to cache key/value states + + Returns: + paddle.Tensor: Output hidden states after recomputation + """ + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs, output_gate_logits=False) + + return custom_forward + + hidden_states = recompute( + create_custom_forward(layer_module), + hidden_states, + position_embeddings, + attention_mask, + attn_mask_start_row_indices, + position_ids, + token_type_ids, + output_attentions, + past_key_value, + use_cache, + ) + return hidden_states + + def forward( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + attention_mask=None, + attn_mask_start_row_indices=None, + inputs_embeds=None, + use_cache=None, + past_key_values=None, + output_attentions=False, + output_hidden_states=None, + return_dict=False, + ): + """Forward pass through the ERNIE model. + + Args: + input_ids (Optional[paddle.Tensor]): Input token IDs + position_ids (Optional[paddle.Tensor]): Position indices + attention_mask (Optional[paddle.Tensor]): Attention mask + attn_mask_start_row_indices (Optional[paddle.Tensor]): Variable length attention indices + inputs_embeds (Optional[paddle.Tensor]): Precomputed embeddings + use_cache (Optional[bool]): Whether to cache key/value states + past_key_values (Optional[Tuple[Tuple[paddle.Tensor]]]): Cached key/value states + output_attentions (Optional[bool]): Whether to output attention weights + output_hidden_states (Optional[bool]): Whether to output all hidden states + return_dict (Optional[bool]): Whether to return dict or tuple + + Returns: + Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + Various outputs depending on configuration, including: + - last_hidden_state: Final layer hidden states + - past_key_values: Cached key/value states if use_cache=True + - hidden_states: All hidden states if output_hidden_states=True + - attentions: Attention weights if output_attentions=True + """ + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time" + ) + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError( + "You have to specify either decoder_input_ids or decoder_inputs_embeds" + ) + + if batch_size != 1: + raise NotImplementedError + + layers = self.layers[: self.config.num_hidden_layers] + + if past_key_values is None: + past_key_values = tuple([None] * len(layers)) + kv_seq_len 
= 0 + else: + kv_seq_len = past_key_values[0][0].shape[1] + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + inputs_embeds = inputs_embeds.astype(self.embed_tokens.weight.dtype) + + if self.config.sequence_parallel: + inputs_embeds = inputs_embeds.reshape([-1, inputs_embeds.shape[-1]]) + inputs_embeds = ScatterOp.apply(inputs_embeds) + + hidden_states = inputs_embeds + + if position_ids is None or position_ids.dim() == 2: + raise NotImplementedError + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + if attention_mask is None: + raise NotImplementedError + causal_mask = self._update_causal_mask( + attention_mask.astype("int64"), + inputs_embeds, + past_key_values, + output_attentions, + ) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, (decoder_layer) in enumerate(layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = ( + past_key_values[idx] if past_key_values is not None else None + ) + has_gradient = not hidden_states.stop_gradient + if ( + self.config.recompute + and self.config.recompute_granularity == "full" + and has_gradient + ): + layer_outputs = self.recompute_training( + decoder_layer, + hidden_states, + position_embeddings, + causal_mask, + attn_mask_start_row_indices, + position_ids, + token_type_ids, + output_attentions, + past_key_value, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + position_embeddings, + causal_mask, + attn_mask_start_row_indices, + position_ids, + token_type_ids, + output_attentions, + past_key_value, + use_cache, + ) + + if isinstance(layer_outputs, (tuple, list)): + hidden_states = layer_outputs[0] + else: + hidden_states = layer_outputs + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_cache, + all_hidden_states, + all_self_attns, + ] + if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=None, + ) + + def _update_causal_mask( + self, + attention_mask: paddle.Tensor, + input_tensor: paddle.Tensor, + past_key_values: Optional[Tuple[Tuple[paddle.Tensor]]], + output_attentions: bool = False, + ): + past_seen_tokens = ( + past_key_values[0][0].shape[1] + if past_key_values is not None and past_key_values[0] is not None + else 0 + ) + + dtype = input_tensor.dtype + min_dtype = paddle.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, paddle.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + cache_position = paddle.arange( + past_seen_tokens, past_seen_tokens + sequence_length + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). 
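+        # The result is an additive float mask of shape [batch, 1, seq_len, target_len]:
+        # 0.0 where attention is allowed and the dtype's minimum value where it is not.
+        # Tiny illustration (assumed seq_len == target_len == 3, "min" = finfo(dtype).min):
+        #   [[0.0, min, min],
+        #    [0.0, 0.0, min],
+        #    [0.0, 0.0, 0.0]]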
+ causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length: int, + target_length: int, + dtype, + cache_position, + batch_size: int, + ): + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = paddle.finfo(dtype).min + causal_mask = paddle.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype + ) + diagonal_attend_mask = paddle.arange( + target_length + ) > cache_position.reshape((-1, 1)) + diagonal_attend_mask = diagonal_attend_mask.astype(causal_mask.dtype) + causal_mask *= diagonal_attend_mask + causal_mask = causal_mask[None, None, :, :].expand((batch_size, 1, -1, -1)) + if attention_mask is not None: + causal_mask = ( + causal_mask.clone() + ) # copy to contiguous memory for in-place edit + if attention_mask.shape[-1] > target_length: + attention_mask = attention_mask[:, :target_length] + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[ + :, None, None, : + ].astype(causal_mask.dtype) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[ + :, :, :, :mask_length + ].masked_fill(padding_mask, min_dtype) + return causal_mask diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/__init__.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/__init__.py new file mode 100644 index 0000000000..65e5c58ac8 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/__init__.py @@ -0,0 +1,82 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Fusion operators +""" +import paddle +from paddle.incubate.nn.functional import fused_rms_norm_ext +from paddle.incubate.nn.functional import fused_rotary_position_embedding as fused_rope +from paddle.incubate.nn.functional import swiglu as fused_swiglu + +from .common_fusion_ops import Linear, matmul + +if paddle.device.is_compiled_with_custom_device("npu"): + from .npu_fusion_ops import npu_cal_aux_loss_func as cal_aux_loss +else: + from paddle.incubate.nn.functional import cal_aux_loss + +__all__ = [ + "fused_rope", + "fused_swiglu", + "fused_rms_norm_ext", + "Linear", + "matmul", + "cal_aux_loss", +] + + +def fusion_flash_attention( + q, + k, + v, + training_mode, + attention_probs_dropout_prob, + use_sparse_flash_attn, + attention_mask=None, + attn_mask_start_row_indices=None, + seq_length=None, + use_var_len_flash_attn=False, + rr_flash_attn=None, +): + """ + Args: + q (Tensor): Query tensor. + k (Tensor): Key tensor. 
+ v (Tensor): Value tensor. + training_mode (bool): Whether in training mode. + attention_probs_dropout_prob (float): Dropout probability for attention probabilities. + use_sparse_flash_attn (bool): Whether to use sparse flash attention. + attention_mask (Tensor, optional): Attention mask. Defaults to None. + attn_mask_start_row_indices (Tensor, optional): Start row indices for attention mask. Defaults to None. + seq_length (int, optional): Sequence length. Defaults to None. + use_var_len_flash_attn (bool, optional): Whether to use variable length flash attention. Defaults to False. + rr_flash_attn (bool, optional): Whether to use round-robin flash attention. Defaults to None. + + Returns: + Tensor: Output tensor after applying fusion flash attention. + """ + from .common_fusion_ops import _fusion_flash_attention + + return _fusion_flash_attention( + q, + k, + v, + training_mode=training_mode, + attention_probs_dropout_prob=attention_probs_dropout_prob, + use_sparse_flash_attn=use_sparse_flash_attn, + attention_mask=attention_mask, + attn_mask_start_row_indices=attn_mask_start_row_indices, + rr_flash_attn=rr_flash_attn, + ) diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/common_fusion_ops.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/common_fusion_ops.py new file mode 100644 index 0000000000..fb70acfae3 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/common_fusion_ops.py @@ -0,0 +1,168 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Common fusion operators. +""" + +# TODO: Support XPU + +import paddle +import paddle.nn.functional as F +from paddle import matmul, tensor +from paddle.nn import Linear +from paddle.nn.functional.flash_attention import flashmask_attention + +__all__ = [ + "matmul", + "Linear", +] + + +def _fusion_flash_attention( + q, + k, + v, + training_mode, + attention_probs_dropout_prob, + use_sparse_flash_attn, + attention_mask=None, + attn_mask_start_row_indices=None, + rr_flash_attn=None, +): + """ + Performs fused flash attention with multiple implementation variants. 
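+
+    Three code paths are used: `flashmask_attention` when sparse start-row indices are
+    provided and sparse flash attention is enabled; `scaled_dot_product_attention` over a
+    dense mask recovered from those indices otherwise; and plain (optionally causal)
+    `scaled_dot_product_attention` when no sparse indices are given.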
+ + Args: + q (paddle.Tensor): Query tensor with shape [batch, heads, seq_len, dim_head] + k (paddle.Tensor): Key tensor with shape [batch, heads, seq_len, dim_head] + v (paddle.Tensor): Value tensor with shape [batch, heads, seq_len, dim_head] + training_mode (bool): Whether in training mode (affects dropout) + attention_probs_dropout_prob (float): Dropout probability for attention weights + use_sparse_flash_attn (bool): Whether to use sparse flash attention optimization + attention_mask (Optional[paddle.Tensor]): Dense attention mask (default: None) + attn_mask_start_row_indices (Optional[paddle.Tensor]): Sparse mask indices (default: None) + rr_flash_attn (Optional[Callable]): Recomputation wrapper for flash attention (default: None) + + Returns: + Tuple[paddle.Tensor, Optional[paddle.Tensor]]: + - Output tensor with shape [batch, seq_len, heads*dim_head] + - Attention weights (None for flash attention implementations) + + Raises: + Warning: If sparse flash attention is requested but unavailable + ValueError: If invalid combination of mask inputs is provided + """ + + version = paddle.version.full_version + if attn_mask_start_row_indices is not None: + if use_sparse_flash_attn: + if rr_flash_attn is None: + out = flashmask_attention( + q, + k, + v, + startend_row_indices=attn_mask_start_row_indices.unsqueeze(-1), + causal=True, + ) + else: + out = rr_flash_attn( + flashmask_attention, + q, + k, + v, + startend_row_indices=attn_mask_start_row_indices.unsqueeze(-1), + causal=True, + ) + else: + attention_mask = _gen_from_sparse_attn_mask_indices( + attn_mask_start_row_indices, q.dtype + ) + if rr_flash_attn is None: + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=attention_mask, + is_causal=False, + ) + else: + out = rr_flash_attn( + F.scaled_dot_product_attention, + q, + k, + v, + attn_mask=attention_mask, + is_causal=False, + ) + weights = None + else: + if rr_flash_attn is None: + out = F.scaled_dot_product_attention( + q, + k, + v, + attn_mask=attention_mask, + is_causal=attention_mask is None and q.shape[1] != 1, + ) + weights = None + else: + out = rr_flash_attn( + F.scaled_dot_product_attention, + q, + k, + v, + attn_mask=attention_mask, + is_causal=attention_mask is None and q.shape[1] != 1, + ) + weights = None + + out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + return out, weights + + +def _gen_from_sparse_attn_mask_indices(attn_mask_start_row_indices, dtype): + """ + Recover 4-D attention_mask from attn_mask_start_row_indices. + + Args: + attn_mask_start_row_indices (paddle.Tensor): The start row indices for the attention mask. + dtype (str): The data type of the tensor. + + Returns: + paddle.Tensor: The dense attention mask recovered from attn_mask_start_row_indices. 
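+
+    Example (illustrative values only):
+        With max_seq_len = 4 and start row indices [2, 2, 4, 4], rows 2 and 3 cannot
+        attend to columns 0 and 1 (two packed sequences of length 2), while the usual
+        causal (lower-triangular) constraint still applies. Allowed positions get 0.0
+        and masked positions get -1e6 in the returned additive mask.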
+ """ + batch_size, _, max_seq_len = attn_mask_start_row_indices.shape + base = ( + paddle.arange(max_seq_len, dtype="int32") + .unsqueeze(1) + .expand([batch_size, -1, max_seq_len]) + .unsqueeze(1) + ) + mask_indices = attn_mask_start_row_indices.unsqueeze(1) + + tril = paddle.tril( + paddle.ones([max_seq_len, max_seq_len], dtype="bool").expand( + [batch_size, 1, max_seq_len, max_seq_len] + ) + ) + attention_mask = paddle.logical_and(base < mask_indices, tril) + attention_mask = paddle.scale( + x=attention_mask.astype(dtype), + scale=1000000.0, + bias=-1.0, + bias_after_scale=False, + ) + + return attention_mask diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/npu_fusion_ops.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/npu_fusion_ops.py new file mode 100644 index 0000000000..0547e2d344 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_fusion_ops/npu_fusion_ops.py @@ -0,0 +1,89 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +npu fusion operators. + +""" +import paddle +import paddle.distributed as dist +import paddle.nn.functional as F + + +def npu_combining(x, combine_weights, scatter_index, hard_gate=False): + """ + Args: + x: Tensor[seq, dim] + combine_weights: [seq, k] + scatter_index: ** [seq, k] ** + Returns: + y: Tensor[s, dim] + """ + x_gatherd = F.embedding(scatter_index, x) # [s,k,dim] + if hard_gate: + return x_gatherd.squeeze(-2) + y = (combine_weights.unsqueeze(-1) * x_gatherd).sum(1) + return y + + +def npu_cal_aux_loss_func( + gate_prob, + dispatch_mask, + tokens_mask, + dispatch_tokens_mask, + num_experts, + use_group, + moe_k, + global_aux_loss=False, + rank=None, + group=None, + clip_min=1e-6, +): + """cal_aux_loss_func""" + if tokens_mask is not None and tokens_mask.dtype != gate_prob.dtype: + tokens_mask = tokens_mask.astype(gate_prob.dtype) + + scale = None + if dispatch_tokens_mask is not None: + seqlen_float = dispatch_tokens_mask.astype(gate_prob.dtype).sum() + if ( + tokens_mask is not None + and gate_prob.shape[0] != dispatch_tokens_mask.shape[0] + ): + scale = seqlen_float / paddle.clip(tokens_mask.sum(), min=1e-6) + elif tokens_mask is not None: + seqlen_float = tokens_mask.sum() + else: + seqlen_float = gate_prob.numel().astype(gate_prob.dtype) / num_experts + seqlen_float = paddle.clip(seqlen_float, min=1e-6) + if len(dispatch_mask.shape) == 2: + dispatch_mask = dispatch_mask.sum(0) + ce = dispatch_mask.astype(gate_prob.dtype).detach() / seqlen_float + me = paddle.sum(gate_prob, axis=0) / seqlen_float + + if global_aux_loss: + me_list, ce_list = [], [] + dist.all_gather(me_list, me, group=group) + dist.all_gather(ce_list, ce, group=group) + me_list[rank] = me + ce_list[rank] = ce + me = paddle.stack(me_list).mean(0) + ce = paddle.stack(ce_list).mean(0) + + l_aux = paddle.sum(me * ce) * num_experts + if use_group: + l_aux = l_aux / moe_k + if scale is not None: + l_aux = l_aux + (scale - 1) * l_aux.detach() + return l_aux, 
None, None diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ppocrvl.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ppocrvl.py new file mode 100644 index 0000000000..0c5bc15f19 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_ppocrvl.py @@ -0,0 +1,619 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py +# Original header: +# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn as nn + +from ....common.vlm.generation import GenerationMixin +from ....common.vlm.transformers.model_outputs import ( + CausalLMOutputWithCrossAttentions, + ModelOutput, +) +from ._config import PPOCRVLConfig +from ._ernie import Ernie4_5Model, Ernie4_5PretrainedModel +from ._projector import Projector +from ._siglip import SiglipVisionModel + + +@dataclass +class PPOCRVLCausalLMOutputWithPast(ModelOutput): + loss: Optional[paddle.Tensor] = None + logits: paddle.Tensor = None + past_key_values: Optional[List[paddle.Tensor]] = None + hidden_states: Optional[Tuple[paddle.Tensor]] = None + attentions: Optional[Tuple[paddle.Tensor]] = None + rope_deltas: Optional[paddle.Tensor] = None + + +class PPOCRVLForConditionalGeneration(Ernie4_5PretrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + config_class = PPOCRVLConfig + _no_split_modules = ["Ernie4_5DecoderLayer", "SiglipEncoderLayer"] + + base_model_prefix = "" + + def __init__(self, config): + super().__init__(config) + + self.mlp_AR = Projector(config, config.vision_config) + self.visual = SiglipVisionModel(config.vision_config) + self.model = Ernie4_5Model(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias_attr=False) + self.rope_deltas = None + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def get_rope_index( + self, + input_ids: Optional[paddle.Tensor] = None, + image_grid_thw: Optional[paddle.Tensor] = None, + video_grid_thw: Optional[paddle.Tensor] = None, + second_per_grid_ts: Optional[paddle.Tensor] = None, + attention_mask: Optional[paddle.Tensor] = None, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """ + Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + + Explanation: + Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + + For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + Examples: + input_ids: [T T T T T], here T is for text. + temporal position_ids: [0, 1, 2, 3, 4] + height position_ids: [0, 1, 2, 3, 4] + width position_ids: [0, 1, 2, 3, 4] + + For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part + and 1D rotary position embedding for text part. + Examples: + Temporal (Time): 3 patches, representing different segments of the video in time. + Height: 2 patches, dividing each frame vertically. + Width: 2 patches, dividing each frame horizontally. + We also have some important parameters: + fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second. + tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. It essentially defines the temporal granularity. 
+                temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames.
+                interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will have a difference of 50 in the temporal position IDs.
+                input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision.
+                vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100]
+                vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1]
+                vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
+                text temporal position_ids: [101, 102, 103, 104, 105]
+                text height position_ids: [101, 102, 103, 104, 105]
+                text width position_ids: [101, 102, 103, 104, 105]
+                Here we calculate the text start position_ids as the max vision position_ids plus 1.
+
+        Args:
+            input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+                it.
+            image_grid_thw (`paddle.Tensor` of shape `(num_images, 3)`, *optional*):
+                The temporal, height and width of feature shape of each image in LLM.
+            video_grid_thw (`paddle.Tensor` of shape `(num_videos, 3)`, *optional*):
+                The temporal, height and width of feature shape of each video in LLM.
+            second_per_grid_ts (`paddle.Tensor` of shape `(num_videos)`, *optional*):
+                The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
+            attention_mask (`paddle.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+ + Returns: + position_ids (`paddle.Tensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`paddle.Tensor` of shape `(batch_size)`) + """ + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + mrope_position_deltas = [] + if input_ids is not None and ( + image_grid_thw is not None or video_grid_thw is not None + ): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = paddle.ones_like(total_input_ids) + position_ids = paddle.ones( + [3, input_ids.shape[0], input_ids.shape[1]], + dtype=input_ids.dtype, + ) + image_index, video_index = 0, 0 + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = paddle.nonzero( + input_ids == vision_start_token_id + ).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + second_per_grid_t = 0 + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + if second_per_grid_ts is not None: + second_per_grid_t = second_per_grid_ts[video_index] + else: + second_per_grid_t = 1.0 + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if len(llm_pos_ids_list) > 0 + else 0 + ) + llm_pos_ids_list.append( + paddle.arange(text_len).reshape((1, -1)).expand((3, -1)) + + st_idx + ) + + if paddle.is_tensor(second_per_grid_t): + second_per_grid_t = second_per_grid_t.detach().item() + range_tensor = paddle.arange(llm_grid_t).reshape((-1, 1)) + expanded_range = range_tensor.expand((-1, llm_grid_h * llm_grid_w)) + + time_tensor = ( + expanded_range + * second_per_grid_t + * self.config.vision_config.tokens_per_second + ) + + time_tensor_long = time_tensor.astype("int64") + t_index = time_tensor_long.flatten() + + h_index = ( + paddle.arange(llm_grid_h) + .reshape((1, -1, 1)) + .expand((llm_grid_t, -1, llm_grid_w)) + .flatten() + ) + w_index = ( + paddle.arange(llm_grid_w) + .reshape((1, 1, -1)) + .expand((llm_grid_t, llm_grid_h, -1)) + .flatten() + ) + llm_pos_ids_list.append( + paddle.stack([t_index, h_index, w_index]) + text_len + st_idx + ) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = ( + llm_pos_ids_list[-1].max() + 1 + if len(llm_pos_ids_list) > 0 + else 0 + ) + text_len = len(input_tokens) - st + llm_pos_ids_list.append( + paddle.arange(text_len).reshape((1, 
-1)).expand((3, -1)) + + st_idx + ) + + llm_positions = paddle.concat(llm_pos_ids_list, axis=1).reshape((3, -1)) + position_ids[..., i, attention_mask[i] == 1] = llm_positions + mrope_position_deltas.append( + llm_positions.max() + 1 - len(total_input_ids[i]) + ) + mrope_position_deltas = paddle.to_tensor(mrope_position_deltas).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand((3, -1, -1)) + max_position_ids = position_ids.max(0, keepdim=False)[0].max( + -1, keepdim=True + )[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + paddle.arange(input_ids.shape[1]) + .reshape((1, 1, -1)) + .expand((3, input_ids.shape[0], -1)) + ) + mrope_position_deltas = paddle.zeros( + [input_ids.shape[0], 1], + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def prepare_attention_mask_for_generation( + self, input_ids, pad_token_id, eos_token_id + ): + """Avoid using attention_mask with flash_attn on generation.""" + if self.config.use_flash_attention: + return None + return super().prepare_attention_mask_for_generation( + input_ids, pad_token_id, eos_token_id + ) + + def prepare_inputs_for_generation( + self, + input_ids, + use_cache=False, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_values_videos=None, + position_ids=None, + **kwargs, + ): + if past_key_values: + input_ids = input_ids[:, -1:] + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "past_key_values": past_key_values, + "use_cache": use_cache, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "position_ids": None, + **kwargs, + } + ) + + return model_inputs + + def update_model_kwargs_for_generation( + self, outputs, model_kwargs, is_encoder_decoder=False + ): + """ + Updates model kwargs for generation. + + Args: + outputs (Any): Model outputs. + model_kwargs (dict): Current model kwargs. + is_encoder_decoder (bool): Whether using encoder-decoder architecture. + + Returns: + dict: Updated model kwargs. 
+ """ + # update cache + if ( + isinstance(outputs, tuple) + and len(outputs) > 1 + and not isinstance(outputs[1], paddle.Tensor) + ): + model_kwargs["past_key_values"] = outputs[1] + + if ( + isinstance(outputs, CausalLMOutputWithCrossAttentions) + and "past_key_values" in outputs + ): + model_kwargs["past_key_values"] = outputs.past_key_values + + if ( + not is_encoder_decoder + and model_kwargs.get("attention_mask", None) is not None + ): + # update attention mask + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = paddle.concat( + [ + attention_mask, + paddle.ones( + [attention_mask.shape[0], 1], dtype=attention_mask.dtype + ), + ], + axis=-1, + ) + + return model_kwargs + + def forward( + self, + input_ids: paddle.Tensor = None, + attention_mask: Optional[paddle.Tensor] = None, + position_ids: Optional[paddle.Tensor] = None, + past_key_values: Optional[List[paddle.Tensor]] = None, + inputs_embeds: Optional[paddle.Tensor] = None, + labels: Optional[paddle.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[paddle.Tensor] = None, + pixel_values_videos: Optional[paddle.Tensor] = None, + image_grid_thw: Optional[paddle.Tensor] = None, + video_grid_thw: Optional[paddle.Tensor] = None, + rope_deltas: Optional[paddle.Tensor] = None, + second_per_grid_ts: Optional[paddle.Tensor] = None, + **kwargs, + ) -> Union[Tuple, PPOCRVLCausalLMOutputWithPast]: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + if inputs_embeds is None: + if input_ids.shape[0] != 1: + raise NotImplementedError + inputs_embeds = self.model.embed_tokens(input_ids) + if pixel_values is not None: + pixel_values = pixel_values.astype(inputs_embeds.dtype) + pixel_values = pixel_values.unsqueeze(0) + siglip_position_ids = list() + image_grid_hws = list() + sample_indices = list() + cu_seqlens = [0] + + pro = 0 + for idx, thw in enumerate(image_grid_thw): + thw_tuple = tuple(thw.detach().cpu().numpy().tolist()) + numel = np.prod(thw_tuple) + image_grid_hws.append(thw_tuple) + image_position_ids = paddle.arange(numel) % np.prod(thw_tuple[1:]) + siglip_position_ids.append(image_position_ids) + sample_indices.append( + paddle.full((numel,), idx, dtype=paddle.int64) + ) + cu_seqlens.append(cu_seqlens[-1] + numel) + + siglip_position_ids = paddle.concat(siglip_position_ids, axis=0) + cu_seqlens = paddle.to_tensor(cu_seqlens, dtype=paddle.int32) + sample_indices = paddle.concat(sample_indices, axis=0) + + vision_outputs = self.visual( + pixel_values=pixel_values, + image_grid_thw=image_grid_hws, + position_ids=siglip_position_ids, + vision_return_embed_list=True, + interpolate_pos_encoding=True, + sample_indices=sample_indices, + cu_seqlens=cu_seqlens, + return_pooler_output=False, + use_rope=True, + window_size=-1, + ) + image_embeds = vision_outputs.last_hidden_state + + image_embeds = self.mlp_AR(image_embeds, image_grid_thw) + + n_image_tokens = (input_ids == self.config.image_token_id).sum().item() + # image_embeds is a list of tensor, each tensor is a image feature,I want to concat them all into a tensor + image_embeds = paddle.concat(image_embeds, 
axis=0) + n_image_features = image_embeds.shape[0] + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + mask = input_ids == self.config.image_token_id + mask_unsqueezed = mask.unsqueeze(-1) + mask_expanded = mask_unsqueezed.expand_as(inputs_embeds) + image_mask = mask_expanded + + image_embeds = image_embeds.astype(inputs_embeds.dtype) + + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + else: + if inputs_embeds.shape[0] != 1: + raise NotImplementedError + + if attention_mask is not None and attention_mask.dtype != paddle.bool: + attention_mask = paddle.cast(attention_mask, paddle.bool) + + # position_ids = None + # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme + if position_ids is None and ( + attention_mask is None or attention_mask.ndim == 2 + ): + # calculate RoPE index once per generation in the pre-fill stage only + if self.rope_deltas is None or ( + past_key_values is None or past_key_values[0] is None + ): + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + second_per_grid_ts, + attention_mask, + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = ( + (past_key_values[0][0].shape[1] + self.rope_deltas) + if past_key_values is not None and past_key_values[0] is not None + else 0 + ) + position_ids = paddle.arange(seq_length) + position_ids = position_ids.reshape((1, -1)).expand((batch_size, -1)) + if ( + past_key_values is not None and past_key_values[0] is not None + ): # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave( + batch_size // delta.shape[0], axis=0 + ) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand((3, -1, -1)) + + outputs = self.model( + input_ids=None, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # Upcast to float if we need to compute the loss to avoid potential precision issues + logits = logits.astype("float32") + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = paddle.nn.CrossEntropyLoss() + shift_logits = shift_logits.reshape((-1, self.config.vocab_size)) + shift_labels = shift_labels.reshape((-1,)) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return PPOCRVLCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + rope_deltas=self.rope_deltas, + ) + + def generate(self, inputs, **kwargs): + gen_kwargs = { + "max_new_tokens": kwargs.get("max_new_tokens", 8192), + "use_cache": kwargs.get("use_cache", True), + } + gen_kwargs = {**inputs, **gen_kwargs} + with paddle.no_grad(): + generated_ids = super().generate(**gen_kwargs) + return generated_ids + + def 
_get_image_nums_and_video_nums( + self, + input_ids: Optional[paddle.Tensor], + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """ + Get the number of images and videos for each sample to calculate the separation length of the sample tensor. + These parameters are not passed through the processor to avoid unpredictable impacts from interface modifications. + + Args: + input_ids (`paddle.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + + Returns: + image_nums (`paddle.Tensor` of shape `(batch_size, num_images_sample)`) + video_nums (`paddle.Tensor` of shape `(batch_size, num_videos_sample)`) + """ + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + + vision_start_mask = input_ids == vision_start_token_id + vision_first_mask = paddle.roll(vision_start_mask, shifts=1, axis=1) + image_mask = input_ids == image_token_id + video_mask = input_ids == video_token_id + image_nums = paddle.sum(vision_first_mask & image_mask, axis=1) + video_nums = paddle.sum(vision_first_mask & video_mask, axis=1) + + return image_nums, video_nums diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_projector.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_projector.py new file mode 100644 index 0000000000..a33cb0ca71 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_projector.py @@ -0,0 +1,116 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py +# Original header: +# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import paddle +import paddle.nn as nn + + +class GELUActivation(nn.Layer): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. 
For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input): + return input * 0.5 * (1.0 + paddle.erf(input / math.sqrt(2.0))) + + def forward(self, input): + return self.act(input) + + +class Projector(nn.Layer): + + def __init__(self, text_config, vision_config): + super().__init__() + self.text_config = text_config + self.vision_config = vision_config + self.merge_kernel_size = (2, 2) + + self.hidden_size = ( + self.vision_config.hidden_size + * self.merge_kernel_size[0] + * self.merge_kernel_size[1] + ) + + self.pre_norm = nn.LayerNorm(self.vision_config.hidden_size, epsilon=1e-05) + self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size) + self.act = GELUActivation() + self.linear_2 = nn.Linear(self.hidden_size, self.text_config.hidden_size) + + def forward(self, image_features, image_grid_thw): + m1, m2 = self.merge_kernel_size + if isinstance(image_features, (list, tuple)): + processed_features = list() + for image_feature, image_grid in zip(image_features, image_grid_thw): + image_feature = self.pre_norm(image_feature) # shape: (T*H*W, D) + t, h, w = image_grid + from einops import rearrange + + image_feature = rearrange( + image_feature, + "(t h p1 w p2) d -> (t h w) (p1 p2 d)", + t=int(t), + h=int(h // m1), + p1=int(m1), + w=int(w // m2), + p2=int(m2), + ) + hidden_states = self.linear_1(image_feature) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + processed_features.append(hidden_states) + + return processed_features + + dims = image_features.shape[:-1] + dim = image_features.shape[-1] + image_features = paddle.reshape(image_features, [-1, dim]) + hidden_states = self.pre_norm(image_features) + hidden_states = paddle.reshape(hidden_states, [-1, self.hidden_size]) + hidden_states = self.linear_1(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.linear_2(hidden_states) + return paddle.reshape(hidden_states, [*dims, -1]) diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/__init__.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/__init__.py new file mode 100644 index 0000000000..b64cf01fdc --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
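Note: the 2x2 merge performed by `Projector.forward` above is easier to follow with concrete shapes. The snippet below is an illustrative sketch only, not part of the patch: the grid and feature sizes are made-up toy values and NumPy stands in for Paddle tensors; only the einops rearrange pattern and the (2, 2) `merge_kernel_size` are taken from the code above.

# Toy walk-through of the projector's 2x2 patch merge (illustrative, not from the diff).
import numpy as np
from einops import rearrange

t, h, w, d = 1, 4, 6, 8                     # assumed toy grid (4x6 patches) and feature dim
m1, m2 = 2, 2                               # merge_kernel_size used by the Projector above

features = np.random.rand(t * h * w, d)     # (T*H*W, D), as emitted by the vision tower
merged = rearrange(
    features,
    "(t h p1 w p2) d -> (t h w) (p1 p2 d)",
    t=t, h=h // m1, p1=m1, w=w // m2, p2=m2,
)
print(features.shape, "->", merged.shape)   # (24, 8) -> (6, 32): 4x fewer tokens, 4x wider

After this merge, `linear_1`/`linear_2` project the widened features to the text model's hidden size, which is why `Projector.hidden_size` is the vision hidden size multiplied by `m1 * m2`.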
diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/utils.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/utils.py new file mode 100644 index 0000000000..e1e32169cd --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_refined_recompute/utils.py @@ -0,0 +1,346 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""refined recompute""" + +import inspect +import queue +from collections import defaultdict + +import paddle +from paddle import framework +from paddle.base import core + +__all__ = [ + "RefinedRcomputeQueue", + "global_rr_queue_log", + "RefinedRecomputeFunction", + "create_skip_config_for_refined_recompute", +] + + +_is_second_fwd = False + + +def is_second_fwd(): + """ + Determine if it is the second forward propagation + """ + global _is_second_fwd + return _is_second_fwd + + +def set_second_fwd(value=True): + """ + Set whether to perform the second forward propagation based on the value + """ + global _is_second_fwd + _is_second_fwd = value + + +class CustomSavedTensorsHooks: + """ + Customize saved_tensors_hooks, add logic for switching + variables related to the second forward propagation + """ + + def __init__(self, pack_hook, unpack_hook) -> None: + """ + initialize the CustomSavedTensorsHooks object + """ + self.pack_hook = pack_hook + self.unpack_hook = unpack_hook + + self._prev = is_second_fwd() + pack_hook_name = f"{pack_hook.__module__}.{pack_hook.__name__}" + unpack_hook_name = f"{unpack_hook.__module__}.{unpack_hook.__name__}" + self._is_second_fwd = ( + pack_hook_name == "paddle.distributed.fleet.recompute.recompute.inner_pack" + and unpack_hook_name + == "paddle.distributed.fleet.recompute.recompute.inner_unpack" + ) + + def __enter__(self) -> None: + """ + enter the context of CustomSavedTensorsHooks + """ + set_second_fwd(self._is_second_fwd) + core.eager.register_saved_tensors_hooks(self.pack_hook, self.unpack_hook) + + def __exit__(self, *args: object) -> None: + """ + exit the context of CustomSavedTensorsHooks + """ + set_second_fwd(self._prev) + core.eager.reset_saved_tensors_hooks() + + +# hack saved_tensors_hooks add set_second_fwd decorator +paddle.autograd.saved_tensors_hooks = CustomSavedTensorsHooks + + +def create_skip_config_for_refined_recompute(layer_idx, config): + """ + Creates a configuration for skipping recomputation based on the configuration file, + effective only at the specified layer index. + + Args: + layer_idx (int): The layer index used to check whether recomputation should be skipped. + config (dict): The configuration file of the input model. + + Returns: + dict: Returns an updated configuration file containing the following key-value pairs: + - skip_recompute_ops (dict): A dictionary with each model layer's each operation's name and a boolean + indicating whether to skip recomputation, defaults to None. 
+ - If the refined_recompute key does not exist or recompute is set to False, + the original configuration file is returned. + + """ + if not config.recompute: + return config + skip_config = dict() + + if len(config.refined_recompute) > 0 and config.recompute_granularity != "full": + raise ValueError( + "Selective recompute only support full recompute now, " + "please set recompute_granularity to `full`." + ) + + for op_name, skip_num in config.refined_recompute.items(): + if skip_num == 0: # 0 means all recompute + skip_config[op_name] = False + elif skip_num < 0: # < 0 means all skip recompute + skip_config[op_name] = True + else: + if layer_idx < skip_num: # < the number of layers to skip recompute + skip_config[op_name] = True + else: + skip_config[op_name] = False + + config.skip_recompute_ops[layer_idx] = skip_config + return config + + +class RefinedRcomputeQueue: + """ + Thread-safe queue management system for recomputation operations. + + Provides a mechanism to track and validate multiple recomputation queues + with automatic naming and existence checking capabilities. + """ + + def __init__(self): + """ + Initializes an empty queue registry. + """ + self.rr_queue = defaultdict(queue.Queue) + + def update(self, queue: queue.Queue, queue_name="unknown"): + """ + Registers a new queue in the management system. + + Args: + queue (queue.Queue): The queue object to register + queue_name (str): Base identifier for the queue (default: "unknown") + Note: Automatically appends the queue's memory address for uniqueness + + Raises: + ValueError: If a queue with the generated name already exists + """ + queue_name = f"{queue_name}_{id(queue)}" + if queue_name in self.rr_queue: + raise ValueError(f"Queue name '{queue_name}' already exists.") + self.rr_queue[queue_name] = queue + + def check(self): + """ + Validates all registered queues are empty. + + Raises: + ValueError: If any registered queue contains pending items + Reports all non-empty queue names in the error message + """ + non_empty_queues = [ + name for name, queue in self.rr_queue.items() if queue.qsize() != 0 + ] + if non_empty_queues: + raise ValueError(f"Queues {', '.join(non_empty_queues)} are not empty.") + + +global_rr_queue_log = RefinedRcomputeQueue() + + +class _NoopSaveInputs(paddle.autograd.PyLayer): + """ + This layer does nothing but save all input tensors. + This is used to prevent the gradients of the inputs being computed. + """ + + @staticmethod + def forward(ctx, *args): + """This function does nothing but save all input tensors.""" + tensors = [o.detach() for o in args if isinstance(o, paddle.Tensor)] + ctx.save_for_backward(*tensors) + # Return a dummy tensor which will be automatically released by the framework. + return paddle.empty((0,), dtype=tensors[0].dtype) + + @staticmethod + def backward(ctx, *args): + """Should not be called since we don't support backward on this graph.""" + raise AssertionError("Did not expect to backward on this graph") + + +class RefinedRecomputeFunction: + """refined recompute for function""" + + def __init__(self): + """ + initialize the RefinedRecomputeFunction object. + """ + self.is_init = False + + def post_init(self, function, function_name=None): + """ + post init the RefinedRecomputeFunction object. 
+ """ + if not self.is_init: + if function_name is None: + function_name = f"{function.__module__}.{function.__name__}" + self._hold_tensors_queue = queue.Queue() + global_rr_queue_log.update(self._hold_tensors_queue, function_name) + self.function = function + self.function_name = function_name + self.is_init = True + + def __call__(self, function, *args, **kwargs): + """ + call the RefinedRecomputeFunction object. + """ + # in paddle.no_grad(), return the original output + if not framework._dygraph_tracer()._has_grad: + return function(*args, **kwargs) + self.post_init(function) + return self.forward(*args, **kwargs) + + def forward(self, *args, **kwargs): + """Refined Recompute Forward""" + if is_second_fwd(): + output = self._second_fwd(*args, **kwargs) + else: + output = self._first_fwd(*args, **kwargs) + return output + + def _first_fwd(self, *args, **kwargs): + """ + do the first forward + """ + input_args = self.parse_to_args(*args, **kwargs) + + # chose the right function + if self.function_name in [ + "paddle.nn.functional.linear", + "paddle.nn.functional.common.linear", + "paddle.incubate.nn.functional.fused_linear", + "paddle.incubate.nn.functional.fused_matmul_bias.fused_linear", + ] or self.function_name.endswith("linear_reduce_scatter"): + # is linear function + outputs = self.function(*input_args) + self._hold_tensors_queue.put([outputs]) + return outputs + else: + if ( + self.function_name + == "paddle.nn.functional.flash_attention.flashmask_attention" + ): + kwargs["return_softmax_lse"] = True + kwargs["return_seed_offset"] = True + outputs = self.function( + *args, **kwargs + ) # outputs is [out, result_softmax_lse, result_seed_offset] + elif ( + self.function_name + == "paddle.nn.functional.flash_attention.flash_attention_with_sparse_mask" + ): + kwargs["return_softmax"] = False + kwargs["return_softmax_lse"] = True + kwargs["return_seed_offset"] = True + outputs = self.function( + *args, **kwargs + ) # outputs is [out, result_softmax_lse, result_seed_offset] + elif self.function_name in [ + "paddle.nn.functional.scaled_dot_product_attention", + "paddle.nn.functional.flash_attention.scaled_dot_product_attention", + ]: + fixed_seed_offset = (None,) + return_softmax = False + rng_name = "" + outputs = list( + paddle._C_ops.flash_attn( + *input_args[:3], + fixed_seed_offset, + *input_args[3:6], + return_softmax, + not input_args[6], + rng_name, + ) + ) + outputs.pop( + 1 + ) # outputs is [out, result_softmax_lse, result_seed_offset] + else: + raise ValueError( + f"Unknown function: {self.function_name}, please implement it first!" + ) + self._hold_tensors_queue.put(outputs) + return outputs[0] + + def _second_fwd(self, *args, **kwargs): + """ + do the second forward + """ + assert not self._hold_tensors_queue.empty(), "queue should not be empty" + input_args = self.parse_to_args(*args, **kwargs) + hold_tensors = self._hold_tensors_queue.get() + if len(hold_tensors) == 1: # is linear function + _NoopSaveInputs.apply(*input_args[:2]) + else: # is flash function + _NoopSaveInputs.apply(*input_args, *hold_tensors) + return hold_tensors[0] + + def parse_to_args(self, *args, **kwargs): + """ + parse the input arguments and keywords to a list of arguments. 
+ """ + input_args = [] + dyfunc_sig = inspect.signature(self.function) + bound_args = dyfunc_sig.bind(*args, **kwargs) + bound_args.apply_defaults() + + for arg, param in zip( + bound_args.arguments.values(), dyfunc_sig.parameters.values() + ): + if param.kind == param.VAR_POSITIONAL: + input_args.extend(arg) + elif param.kind in ( + param.POSITIONAL_ONLY, + param.POSITIONAL_OR_KEYWORD, + ): + input_args.append(arg) + elif param.kind == param.VAR_KEYWORD: + input_args.extend(arg.values()) + elif param.kind == param.KEYWORD_ONLY: + input_args.append(arg) + else: + raise ValueError("Unknown parameter kind.") + return input_args diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_sequence_parallel_utils.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_sequence_parallel_utils.py new file mode 100644 index 0000000000..19f062e415 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_sequence_parallel_utils.py @@ -0,0 +1,339 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import hashlib + +import numpy as np +import paddle +from paddle import distributed as dist +from paddle.autograd import PyLayer +from paddle.distributed import fleet + +from ._distributed.common_dist_utils import ( + all_gather_group, + all_gather_varlen, + mp_slice, + reduce_scatter_group, + scatter_axis, +) + +if not hasattr(paddle.Tensor, "contiguous"): + + def contiguous(self): + """ + Make the tensor contiguous. + """ + return self + + paddle.Tensor.contiguous = contiguous + + +if not hasattr(paddle.Tensor, "_md5sum"): + + def _md5sum(self): + """ + Calculate the md5sum of the Tensor. + """ + numpy_array = np.array(self) + array_bytes = numpy_array.tobytes() + return hashlib.md5(array_bytes).hexdigest() + + paddle.Tensor._md5sum = _md5sum + + +class _AllToAll(paddle.autograd.PyLayer): + @staticmethod + def forward( + ctx, + input, + group, + output_split_sizes=None, + input_split_sizes=None, + ): + """ + All-to-all communication in the group + + Args: + ctx (Any): Context object. + input (Tensor): Input tensor. + group (Group): The group object. + + Returns: + Tensor: Output tensor. 
+ """ + + ctx.group = group + ctx.input_split_sizes = input_split_sizes + ctx.output_split_sizes = output_split_sizes + # return input + if dist.get_world_size(group) <= 1: + return input + if input_split_sizes is None and output_split_sizes is None: + output = paddle.empty_like(input) + task = dist.stream.alltoall_single( + output, input, None, None, group, True, True + ) + task.wait() + else: + out_sizes = [sum(output_split_sizes)] + out_sizes.extend(input.shape[1:]) + output = paddle.empty(out_sizes, dtype=input.dtype) + task = dist.stream.alltoall_single( + output, + input, + output_split_sizes, + input_split_sizes, + group, + sync_op=False, + ) + task.wait() + return output + + @staticmethod + def backward(ctx, *grad_output): + """ + all-to-all backward + + """ + # return grad_output + if ctx.input_split_sizes is None and ctx.output_split_sizes is None: + return _AllToAll.apply(*grad_output, ctx.group) + else: + return _AllToAll.apply( + *grad_output, ctx.group, ctx.input_split_sizes, ctx.output_split_sizes + ) + + +class AllGatherVarlenOpV2(PyLayer): + """ + Custom PyLayer for variable-length all-gather operation with autograd support. + """ + + @staticmethod + def forward(ctx, input, indices, axis=0, group=None): + """forward""" + ctx.axis = axis + ctx.group = group + ctx.indices = indices + return all_gather_varlen(input, indices, axis=axis, group=group) + + @staticmethod + def backward(ctx, grad): + """backward""" + return mp_slice(grad, ctx.indices, axis=ctx.axis, group=ctx.group) + + +class SliceVarlenOp(PyLayer): + """ + Each rank slices a variable-length portion from the **same** sequence. + During backward pass, gradients from all ranks are aggregated to restore + the mp (model parallelism) synchronization state. + + This is the variable-length version of `ScatterOp`. The inverse operation is `VarlenGatherOp`. + + Args: + input: Tensor [S,*] + indices: Slice lengths for each rank + minimum_size: If slice is empty, return `minimum_size` dummy elements. + Returns: + Sliced Tensor + """ + + @staticmethod + def forward( + ctx, + input, + indices, + group=None, + ): + """forward""" + ctx.indices = indices + ctx.group = group + ret = mp_slice(input, indices, group=ctx.group) + return ret + + @staticmethod + def backward(ctx, grad): + """backward""" + return all_gather_varlen(grad, axis=ctx.axis, group=ctx.group) + + +class ScatterOp(PyLayer): + """ + Each rank slices its own portion from the **same** sequence (uniformly split). + During backward pass, gradients from all ranks are aggregated to restore + the mp (model parallelism) synchronization state. + The inverse operation is `GatherOp`. + + input: Tensor [S,*] + + Note: Not related to `distributed.scatter`. + """ + + @staticmethod + def forward(ctx, input, axis=0, group=None): + """forward""" + ctx.axis = axis + ctx.group = group + return scatter_axis(input, axis=axis, group=ctx.group) + + @staticmethod + def backward(ctx, grad): + """backward""" + return all_gather_group(grad, axis=ctx.axis, group=ctx.group) + + +SliceOp = ScatterOp # `ScatterOp` similar to Sclice + + +class GatherOp(PyLayer): + """ + input shape: [s/n, b, h], n is mp parallelism + after forward shape: [s, b, h] + Behavior is similar to `AllGather`, but gradients will not be aggregated in backward, from MP asynchronous state to MP synchronous state. 
+ """ + + @staticmethod + def forward(ctx, input, axis=0, group=None): + """forward""" + ctx.axis = axis + ctx.group = group + return all_gather_group(input, axis=axis, group=group) + + @staticmethod + def backward(ctx, grad): + """backward""" + return scatter_axis(grad, axis=ctx.axis, group=ctx.group) + + +class AllGatherOp(PyLayer): + """ + input shape: [s/n, b, h], n is mp parallelism + after forward shape: [s, b, h] + The behavior is similar to `AllGather`, and the gradients will be aggregated in backward. After AllGather, it is still in MP asynchronous state. + """ + + @staticmethod + def forward(ctx, input, group=None): + """forward""" + ctx.group = group + return all_gather_group(input, group=group) + + # grad shape: [s, b, h], n is mp parallelism + # after forward shape: [s/n, b, h] + @staticmethod + def backward(ctx, grad): + """backward""" + return reduce_scatter_group(grad, group=ctx.group) + + +class AllGatherVarlenOp(PyLayer): + """the shape of allgather can be not same for each rank""" + + @staticmethod + def forward(ctx, input, group=None): + """forward""" + hcg = fleet.get_hybrid_communicate_group() + if group is None: + group = hcg.get_model_parallel_group() + + shape0 = paddle.to_tensor([input.shape[0]]) + shape0_all = paddle.empty(shape=[group.nranks], dtype=shape0.dtype) + dist.stream.all_gather(shape0_all, shape0, group=group, use_calc_stream=True) + shape0_all = shape0_all.numpy() + max_shape0 = shape0_all.max() + + indices = [] + for idx, s in enumerate(shape0_all): + offset = idx * max_shape0 + indices.append(list(range(offset, offset + s))) + indices = np.concatenate(indices, axis=0) + indices = indices.reshape([-1] + [1] * (len(input.shape) - 1)) + indices = paddle.to_tensor(indices, dtype=paddle.int32) + + padding = max_shape0 - input.shape[0] + + ctx.shape0 = input.shape[0] + ctx.max_shape0 = max_shape0 + ctx.shape0_all = shape0_all + ctx.padding = padding + ctx.indices = indices + ctx.group = group + + if padding > 0: + input_shape = input.shape + input_shape[0] = padding + padding_tensor = paddle.empty(shape=input_shape, dtype=input.dtype) + input = paddle.concat([input, padding_tensor], axis=0) + output = all_gather_group(input, group) + output = paddle.take_along_axis(output, indices, axis=0) + + return output + + @staticmethod + def backward(ctx, grad): + """backward""" + input_shape = grad.shape + input_shape[0] = ctx.max_shape0 * ctx.shape0_all.shape[0] + output = paddle.zeros(shape=input_shape, dtype=grad.dtype) + + grad = paddle.scatter(output, ctx.indices, grad) + + grad = scatter_axis(grad, ctx.group) + + if ctx.padding > 0: + grad = grad[: ctx.shape0] + return grad + + +def sequence_parallel_sparse_mask_labels(labels, ignore_label=-100): + """allgather sparse label and return sparse idx""" + hcg = fleet.get_hybrid_communicate_group() + group = hcg.get_model_parallel_group() + # parallelism = group.nranks + labels = labels.flatten() + labels_local = paddle.split(labels, group.nranks)[group.rank] + + tgt_index = paddle.nonzero(labels_local != ignore_label).squeeze() + if tgt_index.numel() == 0: + tgt_index = paddle.to_tensor([0]) + + tgt_index = tgt_index.reshape([-1]).astype(paddle.int32) + labels_local_gather = paddle.take_along_axis(labels_local, tgt_index, axis=0) + labels_all_gather = AllGatherVarlenOp.apply(labels_local_gather) + return labels_all_gather, tgt_index.reshape([-1, 1]) + + +################################################### +# # +# Modified Parallel Linear Operator # +# # +################################################### + + +def 
mark_as_sequence_parallel_parameter(parameter): + parameter.sequence_parallel = True + + +class MPScale(PyLayer): + @staticmethod + def forward(ctx, x, mp_degree): + """forward""" + out = paddle.scale(x, 1.0 / mp_degree) + return out + + @staticmethod + def backward(ctx, dout): + """backward""" + return dout diff --git a/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_siglip.py b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_siglip.py new file mode 100644 index 0000000000..548ee786d0 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/modeling/ppocrvl/_siglip.py @@ -0,0 +1,860 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/modeling_keye.py +# Original header: +# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
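As a quick aside before the SigLIP code that follows: the sequence-parallel helpers defined in `_sequence_parallel_utils.py` above document a simple shape contract, where each rank holds `[s/n, b, h]` and gathering restores `[s, b, h]`. The sketch below is a single-process illustration with made-up sizes; NumPy split/concatenate merely emulate the scatter and all-gather collectives, which in the real classes go through the model-parallel communication group.

# Single-process illustration of the [s/n, b, h] <-> [s, b, h] contract (no real collectives).
import numpy as np

s, b, h, n = 8, 2, 4, 4                     # assumed toy sequence/batch/hidden sizes and mp degree
full = np.arange(s * b * h, dtype=np.float32).reshape(s, b, h)

shards = np.split(full, n, axis=0)          # what ScatterOp.forward leaves on each rank
assert shards[0].shape == (s // n, b, h)

gathered = np.concatenate(shards, axis=0)   # what GatherOp / AllGatherOp reconstruct in forward
assert np.array_equal(gathered, full)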
+ +# TODO: Weight initialization + +from typing import List, Optional, Tuple, Union + +import numpy as np +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from ....common.vlm.activations import ACT2FN +from ....common.vlm.transformers import PretrainedModel +from ....common.vlm.transformers.model_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, +) +from ._config import PPOCRVisionConfig, PPOCRVLConfig + + +def rotate_half(x): + Dh = x.shape[-1] + x1 = x[..., : Dh // 2] + x2 = x[..., Dh // 2 :] + return paddle.concat([-x2, x1], axis=-1) + + +def _ensure_cos_sin_dim(cos, sin, dim_needed): + last = cos.shape[-1] + if last == dim_needed: + return cos, sin + elif last * 2 == dim_needed: + cos = paddle.concat([cos, cos], axis=-1) + sin = paddle.concat([sin, sin], axis=-1) + return cos, sin + else: + raise ValueError( + f"Unexpected cos/sin last-dim: {last}, expected {dim_needed} or {dim_needed//2}" + ) + + +def apply_rotary_pos_emb_vision(q, k, cos, sin): + orig_q_dtype, orig_k_dtype = q.dtype, k.dtype + q = q.astype("float32") + k = k.astype("float32") + + Dh = q.shape[-1] + cos = cos.astype("float32") + sin = sin.astype("float32") + cos, sin = _ensure_cos_sin_dim(cos, sin, Dh) + + cos = cos.unsqueeze(-2) + sin = sin.unsqueeze(-2) + + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed.astype(orig_q_dtype), k_embed.astype(orig_k_dtype) + + +def eager_attention_forward( + module, + query, + key, + value, + attention_mask, + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = paddle.matmul(query, key.transpose((0, 1, 3, 2))) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = F.softmax(attn_weights, axis=-1, dtype="float32").astype(query.dtype) + attn_weights = F.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = paddle.matmul(attn_weights, value) + attn_output = attn_output.transpose((0, 2, 1, 3)).contiguous() + + return attn_output, attn_weights + + +class SiglipAttention(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + assert self.head_dim * self.num_heads == self.embed_dim + self.scale = self.head_dim**-0.5 + self.dropout = getattr(config, "attention_dropout", 0.0) + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: paddle.Tensor, # [B, L, D] + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = False, + cu_seqlens: Optional[List[paddle.Tensor]] = None, + rope_emb: Optional[Tuple[paddle.Tensor, paddle.Tensor]] = None, # (cos, sin) + ): + B, L, D = hidden_states.shape + + q = self.q_proj(hidden_states) + k = self.k_proj(hidden_states) + v = self.v_proj(hidden_states) + + # [B, L, H, Dh] + + q = q.reshape([B, L, self.num_heads, self.head_dim]) + k = k.reshape([B, L, self.num_heads, self.head_dim]) + v = v.reshape([B, L, self.num_heads, self.head_dim]) + if rope_emb is not None: + cos, sin = rope_emb + q, k = apply_rotary_pos_emb_vision(q, k, cos, sin) + + # → [B, H, L, Dh] + q = q.transpose([0, 2, 1, 3]) + k = k.transpose([0, 2, 1, 3]) + v = 
v.transpose([0, 2, 1, 3]) + + attn_output, attn_weights = eager_attention_forward( + self, + q, + k, + v, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + attn_output = attn_output.reshape([B, L, D]).contiguous() + + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights + + +class SiglipVisionEmbeddings(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size # 1152 + self.image_size = config.image_size # 384 + self.patch_size = config.patch_size # 14 + + # 注意:Paddle 要用 "VALID" 或 0 + self.patch_embedding = nn.Conv2D( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="VALID", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 # 729 + self.num_positions = self.num_patches + self.cache_position_embedding = dict() + self.cache_position_count = dict() + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.packing_position_embedding = nn.Embedding(32768, self.embed_dim) + + self.register_buffer( + "position_ids", + paddle.arange(self.num_positions).unsqueeze(0), + persistable=False, + ) + + def interpolate_pos_encoding( + self, embeddings, height: int, width: int, is_after_patchify: bool = False + ): + + num_positions = self.position_embedding.weight.shape[0] + + patch_pos_embed = self.position_embedding.weight.unsqueeze(0) + + dim = embeddings.shape[-1] + + if is_after_patchify: + new_height = height + new_width = width + else: + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = paddle.to_tensor(num_positions**0.5, dtype=paddle.int64) + patch_pos_embed = patch_pos_embed.reshape( + (1, sqrt_num_positions, sqrt_num_positions, dim) + ) + patch_pos_embed = patch_pos_embed.transpose((0, 3, 1, 2)) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bilinear", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.transpose((0, 2, 3, 1)).reshape((1, -1, dim)) + return patch_pos_embed + + @staticmethod + def flatten_list(image_grid_thw): + tmp_image_grid_thw = list() + for image_grid in image_grid_thw: + if isinstance(image_grid, list): + tmp_image_grid_thw.extend(image_grid) + else: + tmp_image_grid_thw.append(image_grid) + return tmp_image_grid_thw + + def fetch_position_embedding_lfu_cache(self, embeddings, h, w, max_cache=20): + grid = (h, w) + if grid in self.cache_position_embedding: + self.cache_position_count[grid] += 1 + return self.cache_position_embedding[grid] + + if len(self.cache_position_embedding) >= max_cache: + min_hit_grid = min( + self.cache_position_count, key=self.cache_position_count.get + ) + self.cache_position_count.pop(min_hit_grid) + self.cache_position_embedding.pop(min_hit_grid) + + position_embedding = self.interpolate_pos_encoding(embeddings, h, w, True) + self.cache_position_count[grid] = 1 + self.cache_position_embedding[grid] = position_embedding + return position_embedding + + def forward( + self, + pixel_values: paddle.Tensor, # [B, L, C, H, W] + position_ids: Optional[paddle.Tensor] = None, # [B or 1, S] + image_grid_thw: Optional[ + List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]] + ] = None, + interpolate_pos_encoding: bool = False, + ) -> paddle.Tensor: + if pixel_values.dim() == 5: + 
assert position_ids is not None + from einops import rearrange + + batch_size, squence_len, channel, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + pixel_values = rearrange(pixel_values, "b l c h w -> (b l) c h w") + patch_embeds = self.patch_embedding( + pixel_values.to(dtype=target_dtype) + ) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(-2).squeeze(-1) + embeddings = rearrange( + embeddings, "(b l) d -> b l d", b=batch_size, l=squence_len + ) + + # todo: not dubug + if interpolate_pos_encoding and image_grid_thw is not None: + flatten_image_grid_thw = self.flatten_list(image_grid_thw) + assert batch_size == 1 + start = 0 + image_embedding_list = list() + + assert ( + sum([np.prod(x) for x in flatten_image_grid_thw]) + == embeddings.shape[1] + ), (flatten_image_grid_thw, embeddings.shape) + embeddings = embeddings.squeeze(0) + tmp_embeddings = list() + for image_grid in image_grid_thw: + t, h, w = image_grid + end = start + t * h * w + image_embeddings = embeddings[int(start) : int(end), :] + position_embedding = ( + self.interpolate_pos_encoding(image_embeddings, h, w, True) + .squeeze(0) + .tile((t, 1)) + ) + image_embeddings = image_embeddings + position_embedding + tmp_embeddings.append(image_embeddings) + start = end + embeddings = paddle.concat(tmp_embeddings, axis=0).unsqueeze(0) + else: + embeddings = embeddings + self.packing_position_embedding(position_ids) + return embeddings + else: + raise NotImplementedError(str(pixel_values.shape)) + + +class SiglipMLP(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: paddle.Tensor) -> paddle.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SiglipEncoderLayer(paddle.nn.Layer): + def __init__(self, config): + super().__init__() + self.embed_dim = config.hidden_size + self.layer_norm1 = paddle.nn.LayerNorm( + self.embed_dim, epsilon=config.layer_norm_eps + ) + self.self_attn = SiglipAttention(config) + self.layer_norm2 = paddle.nn.LayerNorm( + self.embed_dim, epsilon=config.layer_norm_eps + ) + self.mlp = SiglipMLP(config) + + def forward( + self, + hidden_states, + attention_mask, + output_attentions=False, + cu_seqlens=None, + rope_emb=None, + ): + + residual = hidden_states + ############################ + ln1_out = self.layer_norm1(hidden_states) + + x, attn_w = self.self_attn( + hidden_states=ln1_out, + attention_mask=attention_mask, + output_attentions=output_attentions, + cu_seqlens=cu_seqlens, + rope_emb=rope_emb, + ) + + hs_post_attn = residual + x + + residual = hs_post_attn + ln2_out = self.layer_norm2(residual) + + mlp_out = self.mlp(ln2_out) + + hidden_states_out = residual + mlp_out + + outputs = (hidden_states_out,) + if output_attentions: + outputs += (attn_w,) + return outputs + + +class SigLIPRotaryEmbedding(nn.Layer): + def __init__(self, dim: int, theta: float = 10000.0) -> None: + super().__init__() + self.dim = dim + self.theta = theta + self.rope_init() + + def rope_init(self): + arange = paddle.arange(0, self.dim, 2, dtype="float32") + inv_freq = 1.0 / (self.theta ** (arange / self.dim)) + self.register_buffer("inv_freq", inv_freq, persistable=False) + + def forward(self, 
seqlen: int) -> paddle.Tensor: + seq = paddle.arange(seqlen, dtype=self.inv_freq.dtype) + freqs = paddle.outer(seq, self.inv_freq) + return freqs + + +class SiglipEncoder(nn.Layer): + def __init__(self, config): + super().__init__() + self.config = config + embed_dim = config.hidden_size + num_heads = config.num_attention_heads + head_dim = embed_dim // num_heads + self.layers = nn.LayerList( + [SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)] + ) + self.rotary_pos_emb = SigLIPRotaryEmbedding(head_dim // 2) + self.gradient_checkpointing = False + + @staticmethod + def flatten_list(image_grid_thw): + tmp_image_grid_thw = list() + for image_grid in image_grid_thw: + if isinstance(image_grid, list): + tmp_image_grid_thw.extend(image_grid) + else: + tmp_image_grid_thw.append(image_grid) + return tmp_image_grid_thw + + def build_window_index(self, image_grid, window_size): + """ + 返回: + window_indices: int64 [sum(t*h*w_valid)] + cu_seqlens_within_windows: int32 [num_windows_total*t],首位补 0 的前缀和 + """ + from einops import rearrange + + window_indices = list() + pad_values = -100 + start_window_index = 0 + cu_seqlens_within_windows = list() + + for t, h, w in map(int, image_grid): + window_index = paddle.arange(t * h * w).reshape((t, h, w)) + pad_h = (-h) % window_size + pad_w = (-w) % window_size + assert pad_h >= 0 and pad_w >= 0, (pad_h, pad_w) + window_index = F.pad(window_index, (0, pad_w, 0, pad_h), value=pad_values) + window_index = rearrange( + window_index, + "t (h p1) (w p2) -> t (h w) (p1 p2)", + p1=window_size, + p2=window_size, + ) + window_seqlens = (window_index != pad_values).long().sum(-1).reshape(-1) + window_index = window_index.reshape(-1) + window_index = window_index[window_index != pad_values] + window_indices.append(window_index + start_window_index) + cu_seqlens_within_windows.append( + window_seqlens.cumsum(0) + start_window_index + ) + start_window_index += t * h * w + window_indices = paddle.concat(window_indices, axis=0) + cu_seqlens_within_windows = paddle.concat(cu_seqlens_within_windows, axis=0) + cu_seqlens_within_windows = F.pad( + cu_seqlens_within_windows, (1, 0), value=0 + ).astype("int32") + return window_indices, cu_seqlens_within_windows + + def forward( + self, + inputs_embeds: paddle.Tensor, + attention_mask: Optional[paddle.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cu_seqlens: Optional[paddle.Tensor] = None, + image_grid_thw: Optional[ + List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]] + ] = None, + height_position_ids: Optional[paddle.Tensor] = None, + width_position_ids: Optional[paddle.Tensor] = None, + use_rope: Optional[bool] = False, + window_size: Optional[int] = -1, + vision_or_text: str = "vision", + ): + + vision_or_text = "vision" + assert vision_or_text in ["vision", "text"] + use_window_attn = window_size > 0 and vision_or_text == "vision" + use_rope = (use_rope is True) and (vision_or_text == "vision") + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + hidden_states = inputs_embeds + attention_mask = ( + attention_mask.to(inputs_embeds.dtype) + if attention_mask is not None + else None + ) + + if use_rope is True: + 
flatten_image_grid_thw = self.flatten_list(image_grid_thw) + assert ( + sum([np.prod(x) for x in flatten_image_grid_thw]) + == hidden_states.shape[1] + ), (flatten_image_grid_thw, hidden_states.shape) + + if width_position_ids is None or height_position_ids is None: + split_hids = list() + split_wids = list() + for t, h, w in flatten_image_grid_thw: + t, h, w = map(int, (t, h, w)) + image_pids = paddle.arange(t * h * w) % (h * w) + sample_hids = image_pids // w + sample_wids = image_pids % w + split_hids.append(sample_hids) + split_wids.append(sample_wids) + width_position_ids = paddle.concat(split_wids, axis=0) + height_position_ids = paddle.concat(split_hids, axis=0) + + window_indices, cu_seqlens_within_windows = None, None + + if use_window_attn: + window_indices, cu_seqlens_within_windows = self.build_window_index( + flatten_image_grid_thw, window_size + ) + reversed_window_indices = window_indices.argsort() + height_position_ids = height_position_ids[window_indices] + width_position_ids = width_position_ids[window_indices] + + pids = paddle.stack( + [height_position_ids, width_position_ids], axis=-1 + ).astype(paddle.int64) + max_grid_size = pids.max() + 1 + rope_emb_max_grid = self.rotary_pos_emb(max_grid_size) + + rope_emb = rope_emb_max_grid[pids].flatten(1) + + rope_emb = rope_emb.tile((1, 2)) + rope_emb = (rope_emb.cos(), rope_emb.sin()) + + else: + rope_emb = None + + window_indices, cu_seqlens_within_windows = None, None + + if use_window_attn: + flatten_image_grid_thw = self.flatten_list(image_grid_thw) + assert ( + sum( + [ + np.prod(x.astype("float32").cpu().numpy()) + for x in flatten_image_grid_thw + ] + ) + == hidden_states.shape[1] + ), (flatten_image_grid_thw, hidden_states.shape) + + window_indices, cu_seqlens_within_windows = self.build_window_index( + flatten_image_grid_thw, window_size + ) + reversed_window_indices = window_indices.argsort() + + if use_window_attn: + assert cu_seqlens_within_windows is not None + attn_cu_seqlens = cu_seqlens_within_windows + hidden_states = hidden_states[:, window_indices, :] + else: + attn_cu_seqlens = cu_seqlens + + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + ( + (hidden_states[:, reversed_window_indices, :],) + if use_window_attn + else (hidden_states,) + ) + + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + cu_seqlens=attn_cu_seqlens, + rope_emb=rope_emb, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if use_window_attn: + hidden_states = hidden_states[:, reversed_window_indices, :] + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=encoder_states, + attentions=all_attentions, + ) + + +class SiglipMultiheadAttentionPoolingHead(nn.Layer): + """Multihead Attention Pooling.""" + + def __init__(self, config: PPOCRVisionConfig): + super().__init__() + + self.probe = self.create_parameter( + shape=(1, 1, config.hidden_size), + default_initializer=paddle.nn.initializer.Normal(), + ) + self.attention = nn.MultiHeadAttention( + config.hidden_size, config.num_attention_heads + ) + self.layernorm = nn.LayerNorm(config.hidden_size, epsilon=config.layer_norm_eps) + self.mlp = SiglipMLP(config) + + def forward(self, hidden_state, key_padding_mask=None): + batch_size = hidden_state.shape[0] + probe = self.probe.tile((batch_size, 1, 1)) + + 
hidden_state = self.attention( + probe, hidden_state, hidden_state, key_padding_mask=key_padding_mask + )[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +class SiglipVisionTransformer(nn.Layer): + def __init__(self, config: PPOCRVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SiglipVisionEmbeddings(config) + self.encoder = SiglipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, epsilon=config.layer_norm_eps) + self.use_head = ( + True if not hasattr(config, "vision_use_head") else config.vision_use_head + ) + if self.use_head: + self.head = SiglipMultiheadAttentionPoolingHead(config) + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + attention_mask=None, + sample_indices=None, + image_indices=None, + position_ids=None, + height_position_ids=None, + width_position_ids=None, + cu_seqlens=None, + padding_mask=None, + vision_return_embed_list: Optional[bool] = False, + image_grid_thw: Optional[ + List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]] + ] = None, + return_pooler_output: Optional[bool] = True, + use_rope: Optional[bool] = False, + window_size: Optional[bool] = -1, + ) -> BaseModelOutputWithPooling: + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + hidden_states = self.embeddings( + pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + image_grid_thw=image_grid_thw, + ) + + encoder_outputs: BaseModelOutput = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + attention_mask=attention_mask, + cu_seqlens=cu_seqlens, + image_grid_thw=image_grid_thw, + use_rope=use_rope, + height_position_ids=height_position_ids, + width_position_ids=width_position_ids, + window_size=window_size, + vision_or_text="vision", + ) + + last_hidden_state = encoder_outputs.last_hidden_state + last_hidden_state = self.post_layernorm(last_hidden_state) + + if return_pooler_output is True: + if sample_indices is not None: + assert self.use_head is True + dim = last_hidden_state.shape[-1] + sample_hidden_state_list = list() + + hidden_state = last_hidden_state.squeeze(0) + sample_index = sample_indices + unique_sample_index = ( + paddle.unique(sample_index).sort().values.unbind(0) + ) + unique_sample_index = list(unique_sample_index) + if len(unique_sample_index) > 0 and unique_sample_index[0] == -1: + unique_sample_index = unique_sample_index[1:] + for sample_idx in unique_sample_index: + token_indices = (sample_index == sample_idx).nonzero().flatten() + sample_hidden_state = hidden_state[token_indices] + sample_hidden_state_list.append(sample_hidden_state) + + if not vision_return_embed_list: + max_length = max( + [_state.shape[0] for _state in sample_hidden_state_list] + ) + tmp_sample_hidden_state_list = list() + padding_mask = list() + for idx, _state in enumerate(sample_hidden_state_list): + padding_length = max_length - _state.shape[0] + mask = _state.new_zeros(size=(max_length,), dtype=paddle.int64) + mask[-padding_length:] = 1 + 
padding_mask.append(mask) + padding = _state.new_zeros(size=(padding_length, dim)) + new_state = paddle.concat([_state, padding], axis=0) + tmp_sample_hidden_state_list.append(new_state) + sample_hidden_state = paddle.stack( + tmp_sample_hidden_state_list, axis=0 + ) + padding_mask = ( + paddle.stack(padding_mask, axis=0) + .astype("float32") + .to(last_hidden_state.dtype) + ) + pooler_output = self.head( + sample_hidden_state, key_padding_mask=padding_mask + ) + else: + pooler_output = list() + for state in sample_hidden_state_list: + sample_pooler_output = self.head(state.unsqueeze(0)) + pooler_output.append(sample_pooler_output) + pooler_output = paddle.concat(pooler_output, axis=0) + sample_hidden_state = sample_hidden_state_list + + return BaseModelOutputWithPooling( + last_hidden_state=sample_hidden_state, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + else: + pooler_output = self.head(last_hidden_state) if self.use_head else None + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooler_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + sample_hidden_state = list() + assert cu_seqlens is not None + for i in range(cu_seqlens.shape[0] - 1): + start = cu_seqlens[i] + end = cu_seqlens[i + 1] + tensor = last_hidden_state[:, start:end, :].squeeze(0) + sample_hidden_state.append(tensor) + + return BaseModelOutputWithPooling( + last_hidden_state=sample_hidden_state, + pooler_output=None, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class SiglipPreTrainedModel(PretrainedModel): + config_class = PPOCRVLConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + _no_split_modules = [ + "SiglipTextEmbeddings", + "SiglipEncoderLayer", + "SiglipVisionEmbeddings", + "SiglipMultiheadAttentionPoolingHead", + ] + _supports_flash_attn_2 = True + _supports_sdpa = True + + +class SiglipVisionModel(SiglipPreTrainedModel): + config_class = PPOCRVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: PPOCRVisionConfig): + super().__init__(config) + + self.vision_model = SiglipVisionTransformer(config) + + def get_input_embeddings(self) -> nn.Layer: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + sample_indices=None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + position_ids=None, + vision_return_embed_list: Optional[bool] = False, + image_grid_thw: Optional[ + List[Union[Tuple[int, int, int], List[Tuple[int, int, int]]]] + ] = None, + cu_seqlens=None, + return_pooler_output: Optional[bool] = True, + use_rope: Optional[bool] = False, + window_size: Optional[bool] = -1, + ) -> BaseModelOutputWithPooling: + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + position_ids=position_ids, + vision_return_embed_list=vision_return_embed_list, + image_grid_thw=image_grid_thw, + sample_indices=sample_indices, + cu_seqlens=cu_seqlens, + return_pooler_output=return_pooler_output, + use_rope=use_rope, + window_size=window_size, + ) diff --git a/paddlex/inference/models/doc_vlm/predictor.py b/paddlex/inference/models/doc_vlm/predictor.py index c7fb3a0c77..121117f7d2 100644 --- 
a/paddlex/inference/models/doc_vlm/predictor.py +++ b/paddlex/inference/models/doc_vlm/predictor.py @@ -12,12 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. +import base64 import copy +import io import os import warnings -from typing import List +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from typing import List, Optional + +import numpy as np from ....modules.doc_vlm.model_list import MODELS +from ....utils.deps import require_genai_client_plugin from ....utils.device import TemporaryDeviceChanger from ....utils.env import get_device_type from ...common.batch_sampler import DocVLMBatchSampler @@ -32,6 +39,7 @@ class DocVLMPredictor(BasePredictor): "PP-DocBee": {"PP-DocBee-2B", "PP-DocBee-7B"}, "PP-DocBee2": {"PP-DocBee2-3B"}, "PP-Chart2Table": {"PP-Chart2Table"}, + "PaddleOCR-VL": {"PaddleOCR-VL"}, } def __init__(self, *args, **kwargs): @@ -40,18 +48,20 @@ def __init__(self, *args, **kwargs): *args: Arbitrary positional arguments passed to the superclass. **kwargs: Arbitrary keyword arguments passed to the superclass. """ - import paddle - super().__init__(*args, **kwargs) - self.device = kwargs.get("device", None) - self.dtype = ( - "bfloat16" - if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported()) - and (self.device is None or "cpu" not in self.device) - else "float32" - ) - self.infer, self.processor = self._build(**kwargs) + if self._use_local_model: + import paddle + + self.device = kwargs.get("device", None) + self.dtype = ( + "bfloat16" + if ("npu" in get_device_type() or paddle.amp.is_bfloat16_supported()) + and (self.device is None or "cpu" not in self.device) + else "float32" + ) + + self.infer, self.processor = self._build(**kwargs) def _build_batch_sampler(self): """Builds and returns an DocVLMBatchSampler instance. @@ -80,6 +90,7 @@ def _build(self, **kwargs): PPChart2TableInference, PPDocBee2Inference, PPDocBeeInference, + PPOCRVLForConditionalGeneration, ) # build processor @@ -116,52 +127,97 @@ def _build(self, **kwargs): self.model_dir, dtype=self.dtype, ) + elif self.model_name in self.model_group["PaddleOCR-VL"]: + if kwargs.get("use_hpip", False): + warnings.warn( + "The PaddelOCR-VL series does not support `use_hpip=True` for now." + ) + with TemporaryDeviceChanger(self.device): + model = PPOCRVLForConditionalGeneration.from_pretrained( + self.model_dir, + dtype=self.dtype, + ) else: raise NotImplementedError(f"Model {self.model_name} is not supported.") return model, processor - def process(self, data: List[dict], **kwargs): + def process( + self, + data: List[dict], + max_new_tokens: Optional[int] = None, + skip_special_tokens: Optional[bool] = None, + use_cache: Optional[bool] = None, + **kwargs, + ): """ Process a batch of data through the preprocessing, inference, and postprocessing. Args: data (List[dict]): A batch of input data, must be a dict (e.g. {"image": /path/to/image, "query": some question}). - kwargs (Optional[dict]): Arbitrary keyword arguments passed to model.generate. Returns: dict: A dictionary containing the raw sample information and prediction results for every instance of the batch. """ + # TODO: Sampling settings + # FIXME: When `skip_special_tokens` is `True`, the results from different backends may differ. 
+ assert all(isinstance(i, dict) for i in data) - src_data = copy.copy(data) - # preprocess - data = self.processor.preprocess(data) - data = self._switch_inputs_to_device(data) + if self._use_local_model: + src_data = copy.copy(data) + # preprocess + data = self.processor.preprocess(data) + data = self._switch_inputs_to_device(data) + + # do infer + generate_kwargs = {} + if max_new_tokens is not None: + generate_kwargs["max_new_tokens"] = max_new_tokens + if use_cache is not None: + generate_kwargs["use_cache"] = use_cache + with TemporaryDeviceChanger(self.device): + preds = self.infer.generate( + data, + **generate_kwargs, + ) - # do infer - with TemporaryDeviceChanger(self.device): - preds = self.infer.generate(data, **kwargs) + # postprocess + postprocess_kwargs = {} + if skip_special_tokens is not None: + postprocess_kwargs["skip_special_tokens"] = skip_special_tokens + preds = self.processor.postprocess(preds, **postprocess_kwargs) + else: + require_genai_client_plugin() - # postprocess - preds = self.processor.postprocess(preds) + src_data = data + + preds = self._genai_client_process( + data, + max_new_tokens=max_new_tokens, + skip_special_tokens=skip_special_tokens, + ) result_dict = self._format_result_dict(preds, src_data) return result_dict def build_processor(self, **kwargs): from ..common.tokenizer import ( + LlamaTokenizer, MIXQwen2_5_Tokenizer, MIXQwen2Tokenizer, QWenTokenizer, ) + from ..common.tokenizer.tokenizer_utils import ChatTemplate from .processors import ( GOTImageProcessor, PPChart2TableProcessor, PPDocBee2Processor, PPDocBeeProcessor, + PPOCRVLProcessor, Qwen2_5_VLImageProcessor, Qwen2VLImageProcessor, + SiglipImageProcessor, ) if self.model_name in self.model_group["PP-DocBee"]: @@ -182,6 +238,21 @@ def build_processor(self, **kwargs): return PPDocBee2Processor( image_processor=image_processor, tokenizer=tokenizer ) + elif self.model_name in self.model_group["PaddleOCR-VL"]: + image_processor = SiglipImageProcessor.from_pretrained(self.model_dir) + vocab_file = str(Path(self.model_dir, "tokenizer.model")) + tokenizer = LlamaTokenizer.from_pretrained( + self.model_dir, vocab_file=vocab_file + ) + # HACK + chat_template_file = Path(self.model_dir, "chat_template.jinja") + tokenizer.chat_template = ChatTemplate._compile_jinja_template( + chat_template_file.read_text(encoding="utf-8") + ) + return PPOCRVLProcessor( + image_processor=image_processor, + tokenizer=tokenizer, + ) else: raise NotImplementedError @@ -251,3 +322,79 @@ def _switch_inputs_to_device(self, input_dict): for k in input_dict } return rst_dict + + def _genai_client_process(self, data, max_new_tokens, skip_special_tokens): + def _process(item): + image = item["image"] + if isinstance(image, str): + if image.startswith("http://") or image.startswith("https://"): + image_url = image + else: + from PIL import Image + + with Image.open(image) as img: + with io.BytesIO() as buf: + img.save(buf, format="JPEG") + image_url = "data:image/jpeg;base64," + base64.b64encode( + buf.getvalue() + ).decode("ascii") + elif isinstance(image, np.ndarray): + import cv2 + from PIL import Image + + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + img = Image.fromarray(image) + with io.BytesIO() as buf: + img.save(buf, format="JPEG") + image_url = "data:image/jpeg;base64," + base64.b64encode( + buf.getvalue() + ).decode("ascii") + else: + raise TypeError(f"Not supported image type: {type(image)}") + + is_fastdeploy_server = self._genai_client.backend == "fastdeploy-server" + if is_fastdeploy_server: + kwargs = { + 
"temperature": 1, + "top_p": 0, + } + else: + kwargs = { + "temperature": 0, + } + kwargs["extra_body"] = {} + if max_new_tokens is not None: + kwargs["max_completion_tokens"] = max_new_tokens + elif self.model_name in self.model_group["PaddleOCR-VL"]: + kwargs["max_completion_tokens"] = 8192 + if skip_special_tokens is not None: + if self._genai_client.backend in ( + "fastdeploy-server", + "vllm-server", + "sglang-server", + ): + kwargs["extra_body"]["skip_special_tokens"] = skip_special_tokens + else: + raise ValueError("Not supported") + + chat_completion = self._genai_client.create_chat_completion( + [ + { + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": image_url}}, + {"type": "text", "text": item["query"]}, + ], + } + ], + **kwargs, + ) + return chat_completion.choices[0].message.content + + batch_size = len(data) + if batch_size == 1: + return _process(data[0]) + else: + # TODO: Concurrency control + with ThreadPoolExecutor(max_workers=batch_size) as executor: + return list(executor.map(_process, data)) diff --git a/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py b/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py index 4c55c15c98..2b4fc3e58c 100644 --- a/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py +++ b/paddlex/inference/models/doc_vlm/processors/GOT_ocr_2_0.py @@ -77,9 +77,11 @@ def preprocess(self, image: Union[str, Image.Image, np.ndarray, Dict, List]): return {"input_ids": input_ids, "images": images} @benchmark.timeit - def postprocess(self, model_pred, *args, **kwargs): + def postprocess(self, model_pred, **kwargs): return self.tokenizer.batch_decode( - model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False + model_pred[0], + skip_special_tokens=kwargs.get("skip_special_tokens", True), + clean_up_tokenization_spaces=False, ) def _load_image(self, image_file): diff --git a/paddlex/inference/models/doc_vlm/processors/__init__.py b/paddlex/inference/models/doc_vlm/processors/__init__.py index 1031846e45..31d5c6daca 100644 --- a/paddlex/inference/models/doc_vlm/processors/__init__.py +++ b/paddlex/inference/models/doc_vlm/processors/__init__.py @@ -13,5 +13,6 @@ # limitations under the License. 
from .GOT_ocr_2_0 import GOTImageProcessor, PPChart2TableProcessor +from .ppocrvl import PPOCRVLProcessor, SiglipImageProcessor from .qwen2_5_vl import PPDocBee2Processor, Qwen2_5_VLImageProcessor from .qwen2_vl import PPDocBeeProcessor, Qwen2VLImageProcessor diff --git a/paddlex/inference/models/doc_vlm/processors/common.py b/paddlex/inference/models/doc_vlm/processors/common.py index e2ee195dda..b5cd32fae1 100644 --- a/paddlex/inference/models/doc_vlm/processors/common.py +++ b/paddlex/inference/models/doc_vlm/processors/common.py @@ -418,7 +418,7 @@ def process_vision_info( if "image" in vision_info or "image_url" in vision_info: image_inputs.append(fetch_image(vision_info)) else: - raise ValueError("image, image_url should in content.") + raise ValueError("image, image_url should be in content.") if len(image_inputs) == 0: image_inputs = None return image_inputs @@ -426,10 +426,10 @@ def process_vision_info( def fetch_image( ele: Dict[str, Union[str, Image.Image]], - size_factor: int, - min_pixels: int, - max_pixels: int, - max_ratio: float, + size_factor: Optional[int] = None, + min_pixels: Optional[int] = None, + max_pixels: Optional[int] = None, + max_ratio: Optional[float] = None, ) -> Image.Image: if not isinstance(ele, dict): ele = {"image": ele} @@ -458,29 +458,41 @@ def fetch_image( f"Unrecognized image input, support local path, http url, base64 and PIL.Image, got {image}" ) image = image_obj.convert("RGB") - # resize - if "resized_height" in ele and "resized_width" in ele: - resized_height, resized_width = smart_resize( - ele["resized_height"], - ele["resized_width"], - factor=size_factor, - min_pixels=min_pixels, - max_pixels=max_pixels, - max_ratio=max_ratio, - ) + + if ( + size_factor is not None + and min_pixels is not None + and max_pixels is not None + and max_ratio is not None + ): + do_resize = True else: - width, height = image.size # Image, not tensor - min_pixels = ele.get("min_pixels", min_pixels) - max_pixels = ele.get("max_pixels", max_pixels) - resized_height, resized_width = smart_resize( - height, - width, - factor=size_factor, - min_pixels=min_pixels, - max_pixels=max_pixels, - max_ratio=max_ratio, - ) - image = image.resize((resized_width, resized_height)) + do_resize = False + + if do_resize: + # resize + if "resized_height" in ele and "resized_width" in ele: + resized_height, resized_width = smart_resize( + ele["resized_height"], + ele["resized_width"], + factor=size_factor, + min_pixels=min_pixels, + max_pixels=max_pixels, + max_ratio=max_ratio, + ) + else: + width, height = image.size # Image, not tensor + min_pixels = ele.get("min_pixels", min_pixels) + max_pixels = ele.get("max_pixels", max_pixels) + resized_height, resized_width = smart_resize( + height, + width, + factor=size_factor, + min_pixels=min_pixels, + max_pixels=max_pixels, + max_ratio=max_ratio, + ) + image = image.resize((resized_width, resized_height)) return image diff --git a/paddlex/inference/models/doc_vlm/processors/ppocrvl/__init__.py b/paddlex/inference/models/doc_vlm/processors/ppocrvl/__init__.py new file mode 100644 index 0000000000..ee42136c4a --- /dev/null +++ b/paddlex/inference/models/doc_vlm/processors/ppocrvl/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ._ppocrvl import PPOCRVLProcessor +from ._siglip import SiglipImageProcessor diff --git a/paddlex/inference/models/doc_vlm/processors/ppocrvl/_ppocrvl.py b/paddlex/inference/models/doc_vlm/processors/ppocrvl/_ppocrvl.py new file mode 100644 index 0000000000..7a010df923 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/processors/ppocrvl/_ppocrvl.py @@ -0,0 +1,205 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/processing_keye.py +# Original header: +# Copyright 2025 The Keye Team and The HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +from typing import List + +import paddle + +from .....utils.benchmark import benchmark +from ..common import BatchFeature, fetch_image + + +class PPOCRVLProcessor(object): + _DEFAULT_TEXT_KWARGS = { + "padding": False, + "return_tensors": "pd", + } + _DEFAULT_VIDEO_KWARGS = { + "fps": 2.0, + "return_tensors": "pd", + } + + def __init__( + self, + image_processor=None, + tokenizer=None, + ): + self.image_token = ( + "<|image_pad|>" + if not hasattr(tokenizer, "image_token") + else tokenizer.image_token + ) + self.video_token = ( + "<|video_pad|>" + if not hasattr(tokenizer, "video_token") + else tokenizer.video_token + ) + self.image_processor = image_processor + self.tokenizer = tokenizer + + @benchmark.timeit + def preprocess( + self, + input_dicts, + ): + images = [fetch_image(input_dict["image"]) for input_dict in input_dicts] + + text = [] + for input_dict in input_dicts: + messages = [ + { + "role": "user", + "content": input_dict["query"], + } + ] + prompt = self.tokenizer.apply_chat_template(messages, tokenize=False) + text.append(prompt) + + videos = None + output_kwargs = { + "tokenizer_init_kwargs": self.tokenizer.init_kwargs, + "text_kwargs": copy.deepcopy(self._DEFAULT_TEXT_KWARGS), + "video_kwargs": copy.deepcopy(self._DEFAULT_VIDEO_KWARGS), + } + + if images is not None: + image_inputs = self.image_processor(images=images, return_tensors="pd") + image_inputs["pixel_values"] = image_inputs["pixel_values"] + image_grid_thw = image_inputs["image_grid_thw"] + + else: + image_inputs = {} + image_grid_thw = None + + if videos is not None: + # TODO: add video processing + videos_inputs = self.image_processor( + images=None, videos=videos, **output_kwargs["images_kwargs"] + ) + video_grid_thw = videos_inputs["video_grid_thw"] + + fps = output_kwargs["videos_kwargs"].pop("fps", 2.0) + if isinstance(fps, (int, float)): + second_per_grid_ts = [ + self.image_processor.temporal_patch_size / fps + ] * len(video_grid_thw) + elif hasattr(fps, "__len__") and len(fps) == len(video_grid_thw): + second_per_grid_ts = [ + self.image_processor.temporal_patch_size / tmp for tmp in fps + ] + else: + raise ValueError( + f"The length of fps ({len(fps) if hasattr(fps, '__len__') else fps}) must be equal to the length of video_grid_thw ({len(video_grid_thw)}) or fps should be a single number." 
+ ) + videos_inputs.update( + {"second_per_grid_ts": paddle.to_tensor(second_per_grid_ts)} + ) + + else: + videos_inputs = {} + video_grid_thw = None + + if not isinstance(text, list): + text = [text] + + if image_grid_thw is not None: + index = 0 + for i in range(len(text)): + while self.image_token in text[i]: + text[i] = text[i].replace( + self.image_token, + "<|placeholder|>" + * int( + image_grid_thw[index].prod() + // self.image_processor.merge_size + // self.image_processor.merge_size + ), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.image_token) + + if video_grid_thw is not None: + index = 0 + for i in range(len(text)): + while self.video_token in text[i]: + text[i] = text[i].replace( + self.video_token, + "<|placeholder|>" + * ( + video_grid_thw[index].prod() + // self.image_processor.merge_size + // self.image_processor.merge_size + ), + 1, + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", self.video_token) + + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + + @benchmark.timeit + def postprocess(self, model_pred, **kwargs) -> List[str]: + return self.tokenizer.batch_decode( + model_pred[0], + skip_special_tokens=kwargs.get("skip_special_tokens", True), + spaces_between_special_tokens=False, + ) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + names_from_processor = list( + dict.fromkeys(tokenizer_input_names + image_processor_input_names) + ) + return names_from_processor + ["second_per_grid_ts"] diff --git a/paddlex/inference/models/doc_vlm/processors/ppocrvl/_siglip.py b/paddlex/inference/models/doc_vlm/processors/ppocrvl/_siglip.py new file mode 100644 index 0000000000..3227273021 --- /dev/null +++ b/paddlex/inference/models/doc_vlm/processors/ppocrvl/_siglip.py @@ -0,0 +1,284 @@ +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# This file is based on https://github.com/Kwai-Keye/Keye/blob/main/keye-vl-8b-preview/image_processing_keye.py +# Original header: +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Keye.""" + +# TODO: Support videos + +import json +import math +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np + +from ......utils import logging +from ..common import ( + BatchFeature, + convert_to_rgb, + make_batched_images, + make_list_of_images, + to_numpy_array, +) + +_OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073] +_OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711] + + +def adjust_size(size, patch_size): + num_patches = size // patch_size + if num_patches % 2 != 0: + num_patches -= 1 + return num_patches * patch_size + + +def smart_resize( + height: int, + width: int, + factor: int = 28, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, +): + """Rescales the image so that the following conditions are met: + + 1. Both dimensions (height and width) are divisible by 'factor'. + + 2. The total number of pixels is within the range ['min_pixels', 'max_pixels']. + + 3. The aspect ratio of the image is maintained as closely as possible. + + """ + # if height < factor or width < factor: + # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor}") + # if int(height < factor//4) + int(width < factor//4): + # raise ValueError(f"height:{height} or width:{width} must be larger than factor:{factor//4}") + + if height < factor: + logging.debug( + f"smart_resize: height={height} < factor={factor}, reset height=factor" + ) + width = round((width * factor) / height) + height = factor + + if width < factor: + logging.debug( + f"smart_resize: width={width} < factor={factor}, reset width=factor" + ) + height = round((height * factor) / width) + width = factor + + if max(height, width) / min(height, width) > 200: + raise ValueError( + f"absolute aspect ratio must be smaller than 200, got {max(height, width) / min(height, width)}" + ) + h_bar = round(height / factor) * factor + w_bar = round(width / factor) * factor + if h_bar * w_bar > max_pixels: + beta = math.sqrt((height * width) / max_pixels) + h_bar = math.floor(height / beta / factor) * factor + w_bar = math.floor(width / beta / factor) * factor + elif h_bar * w_bar < min_pixels: + beta = math.sqrt(min_pixels / (height * width)) + h_bar = math.ceil(height * beta / factor) * factor + w_bar = math.ceil(width * beta / factor) * factor + return h_bar, w_bar + + +class SiglipImageProcessor(object): + model_input_names = [ + "pixel_values", + "image_grid_thw", + "pixel_values_videos", + "video_grid_thw", + ] + + def __init__( + self, + do_resize: bool = True, + resample: int = 3, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + min_pixels: int = 28 * 28 * 130, + max_pixels: int = 28 * 28 * 1280, + patch_size: int = 14, + temporal_patch_size: int = 1, + merge_size: int = 2, + **kwargs, + ) -> None: + super().__init__() + self.do_resize = do_resize + self.resample = resample + self.do_rescale = do_rescale + 
self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else _OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else _OPENAI_CLIP_STD + self.min_pixels = min_pixels + self.max_pixels = max_pixels + self.patch_size = patch_size + self.temporal_patch_size = temporal_patch_size + self.merge_size = merge_size + self.size = {"min_pixels": min_pixels, "max_pixels": max_pixels} # not used + self.do_convert_rgb = do_convert_rgb + + @classmethod + def from_pretrained(cls, pretrained_model_dir): + pretrained_model_dir = Path(pretrained_model_dir) + image_processor_config_path = pretrained_model_dir / "preprocessor_config.json" + with open(image_processor_config_path, "r", encoding="utf-8") as f: + image_processor_config = json.load(f) + return cls(**image_processor_config) + + def _preprocess( + self, + images, + do_resize: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + ): + images = make_list_of_images(images) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + width, height = images[0].size + resized_height, resized_width = height, width + processed_images = [] + + for image in images: + if do_resize: + resized_height, resized_width = smart_resize( + height, + width, + factor=self.patch_size * self.merge_size, + min_pixels=self.min_pixels, + max_pixels=self.max_pixels, + ) + + image = image.resize( + (resized_width, resized_height), resample=self.resample + ) + + image = to_numpy_array(image) + + if do_rescale: + image = (image * rescale_factor).astype(np.float32) + + if do_normalize: + image = image.astype(np.float32) + image -= np.array(image_mean, dtype=np.float32) + image /= np.array(image_std, dtype=np.float32) + + processed_images.append(image) + + patches = np.array(processed_images) + patches = patches.transpose(0, 3, 1, 2) + if patches.shape[0] == 1: + patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) + channel = patches.shape[1] + grid_t = patches.shape[0] // self.temporal_patch_size + grid_h, grid_w = ( + resized_height // self.patch_size, + resized_width // self.patch_size, + ) + + patches = patches.reshape( + grid_t, + self.temporal_patch_size, + channel, + grid_h, + self.patch_size, + grid_w, + self.patch_size, + ) + patches = patches.transpose(0, 3, 5, 2, 1, 4, 6) + assert self.temporal_patch_size == 1 + flatten_patches = patches.reshape( + grid_t * grid_h * grid_w, channel, self.patch_size, self.patch_size + ) + return flatten_patches, (grid_t, grid_h, grid_w) + + def __call__( + self, + images, + videos=None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors=None, + ): + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = ( + rescale_factor if rescale_factor is not None else self.rescale_factor + ) + do_normalize = do_normalize if 
do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = ( + do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + ) + + if images is not None: + images = make_batched_images(images) + if videos is not None: + raise NotImplementedError("Videos are not yet supported") + + if images is not None: + pixel_values, vision_grid_thws = [], [] + for image in images: + patches, image_grid_thw = self._preprocess( + image, + do_resize=do_resize, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_convert_rgb=do_convert_rgb, + ) + pixel_values.extend(patches) + vision_grid_thws.append(image_grid_thw) + pixel_values = np.array(pixel_values) + vision_grid_thws = np.array(vision_grid_thws) + data = {"pixel_values": pixel_values, "image_grid_thw": vision_grid_thws} + + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py b/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py index 3916a21da9..35d899ee13 100644 --- a/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py +++ b/paddlex/inference/models/doc_vlm/processors/qwen2_5_vl.py @@ -539,10 +539,12 @@ def preprocess(self, input_dicts: List[Dict]): return rst_inputs @benchmark.timeit - def postprocess(self, model_pred, *args, **kwargs) -> List[str]: + def postprocess(self, model_pred, **kwargs) -> List[str]: """ Post process adapt for PaddleX """ return self.tokenizer.batch_decode( - model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False + model_pred[0], + skip_special_tokens=kwargs.get("skip_special_tokens", True), + clean_up_tokenization_spaces=False, ) diff --git a/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py b/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py index 53cb675701..9b26dccc5f 100644 --- a/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py +++ b/paddlex/inference/models/doc_vlm/processors/qwen2_vl.py @@ -534,10 +534,12 @@ def preprocess(self, input_dicts): return rst_inputs @benchmark.timeit - def postprocess(self, model_pred, *args, **kwargs): + def postprocess(self, model_pred, **kwargs): """ Post process adapt for PaddleX """ return self.tokenizer.batch_decode( - model_pred[0], skip_special_tokens=True, clean_up_tokenization_spaces=False + model_pred[0], + skip_special_tokens=kwargs.get("skip_special_tokens", True), + clean_up_tokenization_spaces=False, ) diff --git a/paddlex/inference/pipelines/base.py b/paddlex/inference/pipelines/base.py index 220ddd5080..0b45140924 100644 --- a/paddlex/inference/pipelines/base.py +++ b/paddlex/inference/pipelines/base.py @@ -91,6 +91,7 @@ def create_model(self, config: Dict, **kwargs) -> BasePredictor: if self.hpi_config is not None: hpi_config = hpi_config or {} hpi_config = {**self.hpi_config, **hpi_config} + genai_config = config.get("genai_config", None) from .. 
import create_predictor @@ -110,6 +111,7 @@ def create_model(self, config: Dict, **kwargs) -> BasePredictor: pp_option=pp_option, use_hpip=use_hpip, hpi_config=hpi_config, + genai_config=genai_config, **kwargs, ) return model diff --git a/paddlex/model.py b/paddlex/model.py index 830cb6c20b..f072f11e72 100644 --- a/paddlex/model.py +++ b/paddlex/model.py @@ -81,12 +81,13 @@ def _build_predictor(self): predict_kwargs = deepcopy(self._config.Predict) model_dir = predict_kwargs.pop("model_dir", None) + device = self._config.Global.get("device", None) UNSET = object() - device = self._config.Global.get("device", None) kernel_option = predict_kwargs.pop("kernel_option", UNSET) use_hpip = predict_kwargs.pop("use_hpip", UNSET) hpi_config = predict_kwargs.pop("hpi_config", UNSET) + genai_config = predict_kwargs.pop("genai_config", UNSET) create_predictor_kwargs = {} if kernel_option is not UNSET: @@ -99,10 +100,12 @@ def _build_predictor(self): create_predictor_kwargs["use_hpip"] = False if hpi_config is not UNSET: create_predictor_kwargs["hpi_config"] = hpi_config + if genai_config is not UNSET: + create_predictor_kwargs["genai_config"] = genai_config predictor = create_predictor( self._model_name, - model_dir, + model_dir=model_dir, device=device, **create_predictor_kwargs, ) diff --git a/paddlex/modules/base/trainer.py b/paddlex/modules/base/trainer.py index 61daa57b84..78b3e149ad 100644 --- a/paddlex/modules/base/trainer.py +++ b/paddlex/modules/base/trainer.py @@ -84,7 +84,7 @@ def train(self, *args, **kwargs): "uniform_output_enabled", True ), "export_with_pir": export_with_pir, - "ips": self.train_config.get("dist_ips", None) + "ips": self.train_config.get("dist_ips", None), } ) diff --git a/paddlex/modules/doc_vlm/model_list.py b/paddlex/modules/doc_vlm/model_list.py index 5886d04eed..85e67d7fd3 100644 --- a/paddlex/modules/doc_vlm/model_list.py +++ b/paddlex/modules/doc_vlm/model_list.py @@ -13,4 +13,10 @@ # limitations under the License. -MODELS = ["PP-DocBee-2B", "PP-DocBee-7B", "PP-Chart2Table", "PP-DocBee2-3B"] +MODELS = [ + "PP-DocBee-2B", + "PP-DocBee-7B", + "PP-Chart2Table", + "PP-DocBee2-3B", + "PaddleOCR-VL", +] diff --git a/paddlex/paddlex_cli.py b/paddlex/paddlex_cli.py index 590380a150..059b853f43 100644 --- a/paddlex/paddlex_cli.py +++ b/paddlex/paddlex_cli.py @@ -29,10 +29,12 @@ from .utils import logging from .utils.deps import ( get_dep_version, - get_paddle2onnx_spec, + get_genai_dep_specs, + get_genai_fastdeploy_spec, + get_paddle2onnx_dep_specs, get_serving_dep_specs, + is_dep_available, is_paddle2onnx_plugin_available, - require_paddle2onnx_plugin, ) from .utils.env import get_paddle_cuda_version from .utils.install import install_packages, uninstall_packages @@ -225,12 +227,20 @@ def install(args): """install paddlex""" def _install_serving_deps(): - reqs = get_serving_dep_specs() - # Should we sort the requirements? 
- install_packages(reqs) + try: + install_packages(get_serving_dep_specs()) + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + logging.info("Successfully installed the serving plugin") def _install_paddle2onnx_deps(): - install_packages([get_paddle2onnx_spec()]) + try: + install_packages(get_paddle2onnx_dep_specs()) + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + logging.info("Successfully installed the Paddle2ONNX plugin") def _install_hpi_deps(device_type): SUPPORTED_DEVICE_TYPES = ["cpu", "gpu", "npu"] @@ -270,33 +280,97 @@ def _install_hpi_deps(device_type): logging.info( f"The high-performance inference plugin '{package}' is mutually exclusive with '{other_package}' (version {version} installed). Uninstalling '{other_package}'..." ) - uninstall_packages([other_package]) + try: + uninstall_packages([other_package]) + except Exception: + logging.error("Failed to uninstall packages", exc_info=True) + sys.exit(1) with importlib.resources.path("paddlex", hpip_links_file) as f: version = get_dep_version(package) - if version is None: - install_packages([package], pip_install_opts=["--find-links", str(f)]) - else: - response = input( - f"The high-performance inference plugin is already installed (version {repr(version)}). Do you want to reinstall it? (y/n):" - ) - if response.lower() in ["y", "yes"]: - uninstall_packages([package]) + try: + if version is None: install_packages( - [package], - pip_install_opts=[ - "--find-links", - str(f), - ], + [package], pip_install_opts=["--find-links", str(f)] ) else: - return + response = input( + f"The high-performance inference plugin is already installed (version {repr(version)}). Do you want to reinstall it? (y/n):" + ) + if response.lower() in ["y", "yes"]: + uninstall_packages([package]) + install_packages( + [package], + pip_install_opts=[ + "--find-links", + str(f), + ], + ) + else: + return + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + + logging.info("Successfully installed the high-performance inference plugin") if not is_paddle2onnx_plugin_available(): logging.info( "The Paddle2ONNX plugin is not available. It is recommended to run `paddlex --install paddle2onnx` to install the Paddle2ONNX plugin to use the full functionality of high-performance inference." ) + def _install_genai_deps(plugin_types): + fd_plugin_types = [] + not_fd_plugin_types = [] + for plugin_type in plugin_types: + if "fastdeploy" in plugin_type: + fd_plugin_types.append(plugin_type) + else: + not_fd_plugin_types.append(plugin_type) + if fd_plugin_types: + if not is_dep_available("paddlepaddle"): + sys.exit("Please install PaddlePaddle first.") + import paddle.device + + if not paddle.device.is_compiled_with_cuda(): + sys.exit("Currently, only the GPU version of FastDeploy is supported.") + cap = paddle.device.cuda.get_device_capability() + if cap in ((8, 0), (9, 0)): + index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-80_90/" + elif cap in ((8, 6), (8, 9)): + index_url = "https://www.paddlepaddle.org.cn/packages/stable/fastdeploy-gpu-86_89/" + else: + sys.exit( + f"The compute capability of the GPU is {cap[0]}.{cap[1]}, which is not supported. The supported compute capabilities are 8.0, 8.6, 8.9, and 9.0." 
+ ) + try: + install_packages( + [get_genai_fastdeploy_spec("gpu")], + pip_install_opts=["--extra-index-url", index_url], + ) + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + + reqs = [] + for plugin_type in not_fd_plugin_types: + try: + r = get_genai_dep_specs(plugin_type) + except ValueError: + logging.error("Invalid generative AI plugin type: %s", plugin_type) + sys.exit(2) + reqs += r + try: + install_packages(reqs, constraints="required") + except Exception: + logging.error("Installation failed", exc_info=True) + sys.exit(1) + + logging.info( + "Successfully installed the generative AI plugin" + + ("s" if len(plugin_types) > 1 else "") + ) + # Enable debug info os.environ["PADDLE_PDX_DEBUG"] = "True" # Disable eager initialization @@ -322,10 +396,10 @@ def _install_hpi_deps(device_type): hpi_plugins = list(filter(lambda name: name.startswith("hpi-"), plugins)) if hpi_plugins: - for i in hpi_plugins: - plugins.remove(i) + for p in hpi_plugins: + plugins.remove(p) if plugins: - logging.error("`hpi` cannot be used together with other plugins.") + logging.error("`hpi-xxx` cannot be used together with other plugins.") sys.exit(2) if len(hpi_plugins) > 1 or len(hpi_plugins[0].split("-")) != 2: logging.error( @@ -338,10 +412,29 @@ def _install_hpi_deps(device_type): _install_hpi_deps(device_type=device_type) return + genai_plugins = list(filter(lambda name: name.startswith("genai-"), plugins)) + if genai_plugins: + for p in genai_plugins: + plugins.remove(p) + if plugins: + logging.error("`genai-xxx` cannot be used together with other plugins.") + sys.exit(2) + genai_plugin_types = [p[len("genai-") :] for p in genai_plugins] + _install_genai_deps(genai_plugin_types) + return + + all_repo_names = get_all_supported_repo_names() + unknown_plugins = [] + for p in plugins: + if p not in all_repo_names: + unknown_plugins.append(p) + if unknown_plugins: + logging.error("Unknown plugins: %s", unknown_plugins) + sys.exit(2) if plugins: repo_names = plugins elif len(plugins) == 0: - repo_names = get_all_supported_repo_names() + repo_names = all_repo_names setup( repo_names=repo_names, no_deps=args.no_deps, @@ -374,19 +467,31 @@ def pipeline_predict( def serve(pipeline, *, device, use_hpip, hpi_config, host, port): - from .inference.serving.basic_serving import create_pipeline_app, run_server + try: + from .inference.serving.basic_serving import create_pipeline_app, run_server + except RuntimeError: + logging.error("Failed to load the serving module", exc_info=True) + sys.exit(1) pipeline_config = load_pipeline_config(pipeline) - pipeline = create_pipeline( - config=pipeline_config, device=device, use_hpip=use_hpip, hpi_config=hpi_config - ) + try: + pipeline = create_pipeline( + config=pipeline_config, + device=device, + use_hpip=use_hpip, + hpi_config=hpi_config, + ) + except Exception: + logging.error("Failed to create the pipeline", exc_info=True) + sys.exit(1) app = create_pipeline_app(pipeline, pipeline_config) run_server(app, host=host, port=port) # TODO: Move to another module def paddle_to_onnx(paddle_model_dir, onnx_model_dir, *, opset_version): - require_paddle2onnx_plugin() + if not is_paddle2onnx_plugin_available(): + sys.exit("Please install the Paddle2ONNX plugin first.") ONNX_MODEL_FILENAME = f"{MODEL_FILE_PREFIX}.onnx" CONFIG_FILENAME = f"{MODEL_FILE_PREFIX}.yml" @@ -448,6 +553,9 @@ def _copy_additional_files(input_dir, output_dir): shutil.copy(src_path, dst_path) logging.info(f"Copied {src_path} to {dst_path}") + if not paddle_model_dir: + 
sys.exit("PaddlePaddle model directory must be specified") + paddle_model_dir = Path(paddle_model_dir) if not onnx_model_dir: onnx_model_dir = paddle_model_dir @@ -476,7 +584,6 @@ def main(): if args.install is not None: install(args) - return elif args.serve: serve( args.pipeline, @@ -486,14 +593,12 @@ def main(): host=args.host, port=args.port, ) - return elif args.paddle2onnx: paddle_to_onnx( args.paddle_model_dir, args.onnx_model_dir, opset_version=args.opset_version, ) - return else: if args.get_pipeline_config is not None: interactive_get_pipeline(args.get_pipeline_config, args.save_path) @@ -506,13 +611,16 @@ def main(): pipeline_args_dict[arg_name] = getattr(args, arg_name) else: logging.warning(f"Argument {arg_name} is missing in args") - pipeline_predict( - args.pipeline, - args.input, - args.device, - args.save_path, - use_hpip=args.use_hpip or None, - hpi_config=args.hpi_config, - **pipeline_args_dict, - ) - return + try: + pipeline_predict( + args.pipeline, + args.input, + args.device, + args.save_path, + use_hpip=args.use_hpip or None, + hpi_config=args.hpi_config, + **pipeline_args_dict, + ) + except Exception: + logging.error("Pipeline prediction failed", exc_info=True) + sys.exit(1) diff --git a/paddlex/utils/deps.py b/paddlex/utils/deps.py index 6463898a94..09e1d0002a 100644 --- a/paddlex/utils/deps.py +++ b/paddlex/utils/deps.py @@ -29,6 +29,8 @@ ) _COLLECTIVE_EXTRA_NAMES = {"base", "plugins", "all"} +_SUPPORTED_GENAI_ENGINE_BACKENDS = ["fastdeploy-server", "vllm-server", "sglang-server"] + class DependencyError(Exception): pass @@ -63,18 +65,21 @@ def _get_extras(): EXTRAS = _get_extras() -def _get_dep_specs(): +def _get_base_dep_specs(required_only=False): dep_specs = defaultdict(list) for dep_spec in importlib.metadata.requires("paddlex"): extra_name, dep_spec = _get_extra_name_and_remove_extra_marker(dep_spec) - if extra_name is None or extra_name == "all": + if (required_only and extra_name is None) or ( + not required_only and (extra_name is None or extra_name == "base") + ): dep_spec = dep_spec.rstrip() req = Requirement(dep_spec) dep_specs[req.name].append(dep_spec) return dep_specs -DEP_SPECS = _get_dep_specs() +BASE_DEP_SPECS = _get_base_dep_specs() +REQUIRED_DEP_SPECS = _get_base_dep_specs(required_only=True) def get_dep_version(dep): @@ -85,33 +90,32 @@ def get_dep_version(dep): @lru_cache() -def is_dep_available(dep, /, check_version=None): - # Currently for several special deps we check if the import packages exist. - if dep in ("paddlepaddle", "paddle-custom-device", "ultra-infer") and check_version: +def is_dep_available(dep, /, check_version=False): + if ( + dep in ("paddlepaddle", "paddle-custom-device", "ultra-infer", "fastdeploy") + and check_version + ): raise ValueError( - "Currently, `check_version` is not allowed to be `True` for `paddlepaddle`, `paddle-custom-device`, and `ultra-infer`." + "`check_version` is not allowed to be `True` for `paddlepaddle`, `paddle-custom-device`, `ultra-infer`, and `fastdeploy`." ) + # Currently for several special deps we check if the import packages exist. 
if dep == "paddlepaddle": return importlib.util.find_spec("paddle") is not None elif dep == "paddle-custom-device": return importlib.util.find_spec("paddle_custom_device") is not None elif dep == "ultra-infer": return importlib.util.find_spec("ultra_infer") is not None - else: - if dep != "paddle2onnx" and dep not in DEP_SPECS: - raise ValueError("Unknown dependency") - if check_version is None: - if dep == "paddle2onnx": - check_version = True - else: - check_version = False + elif dep == "fastdeploy": + return importlib.util.find_spec("fastdeploy") is not None version = get_dep_version(dep) if version is None: return False if check_version: - if dep == "paddle2onnx": - return Version(version) in Requirement(get_paddle2onnx_spec()).specifier - for dep_spec in DEP_SPECS[dep]: + if dep not in BASE_DEP_SPECS: + raise ValueError( + f"Currently, `check_version=True` is supported only for base dependencies." + ) + for dep_spec in BASE_DEP_SPECS[dep]: if Version(version) in Requirement(dep_spec).specifier: return True else: @@ -252,5 +256,66 @@ def require_paddle2onnx_plugin(): ) -def get_paddle2onnx_spec(): - return "paddle2onnx == 2.0.2rc3" +def get_paddle2onnx_dep_specs(): + dep_specs = [] + for item in EXTRAS["paddle2onnx"].values(): + dep_specs += item + return dep_specs + + +def is_genai_engine_plugin_available(backend="any"): + if backend != "any" and backend not in _SUPPORTED_GENAI_ENGINE_BACKENDS: + raise ValueError(f"Unknown backend type: {backend}") + if backend == "any": + for be in _SUPPORTED_GENAI_ENGINE_BACKENDS: + if is_genai_engine_plugin_available(be): + return True + return False + else: + if "fastdeploy" in backend: + return is_dep_available("fastdeploy") + elif is_extra_available(f"genai-{backend}"): + return True + return False + + +def require_genai_engine_plugin(backend="any"): + if not is_genai_engine_plugin_available(backend): + if backend == "any": + prefix = "The generative AI engine plugins are" + else: + prefix = f"The generative AI {repr(backend)} engine plugin is" + raise RuntimeError(f"{prefix} not available. Please install it properly.") + + +def is_genai_client_plugin_available(): + return is_extra_available("genai-client") + + +def require_genai_client_plugin(): + if not is_genai_client_plugin_available(): + raise RuntimeError( + "The generative AI client plugin is not available. Please install it properly." 
+ ) + + +def get_genai_fastdeploy_spec(device_type): + SUPPORTED_DEVICE_TYPES = ("gpu",) + if device_type not in SUPPORTED_DEVICE_TYPES: + raise ValueError(f"Unsupported device type: {device_type}") + if device_type == "gpu": + return "fastdeploy-gpu == 2.0.3" + else: + raise AssertionError + + +def get_genai_dep_specs(type): + if type != "client" and type not in _SUPPORTED_GENAI_ENGINE_BACKENDS: + raise ValueError(f"Invalid type: {type}") + if "fastdeploy" in type: + raise ValueError(f"{repr(type)} is not supported") + + dep_specs = [] + for item in EXTRAS[f"genai-{type}"].values(): + dep_specs += item + return dep_specs diff --git a/paddlex/utils/install.py b/paddlex/utils/install.py index 89e1153e4f..3df89ffc77 100644 --- a/paddlex/utils/install.py +++ b/paddlex/utils/install.py @@ -23,35 +23,47 @@ def install_packages_from_requirements_file( - requirements_file_path, pip_install_opts=None + requirements_file_path, + pip_install_opts=None, + constraints="base", ): - from .deps import DEP_SPECS + from .deps import BASE_DEP_SPECS, REQUIRED_DEP_SPECS - # TODO: Precompute or cache the constraints - with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: - for reqs in DEP_SPECS.values(): - for req in reqs: - req = Requirement(req) - if req.marker and not req.marker.evaluate(): - continue - if req.url: - req = f"{req.name}@{req.url}" - else: - req = f"{req.name}{req.specifier}" - f.write(req + "\n") - constraints_file_path = f.name + if constraints not in ("base", "required", "none"): + raise ValueError(f"Invalid constraints setting: {constraints}") args = [ sys.executable, "-m", "pip", "install", - "-c", - constraints_file_path, *(pip_install_opts or []), "-r", requirements_file_path, ] + + if constraints == "base": + dep_specs = BASE_DEP_SPECS + elif constraints == "required": + dep_specs = REQUIRED_DEP_SPECS + else: + dep_specs = None + if dep_specs: + # TODO: Precompute or cache the constraints + with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: + for reqs in dep_specs.values(): + for req in reqs: + req = Requirement(req) + if req.marker and not req.marker.evaluate(): + continue + if req.url: + req = f"{req.name}@{req.url}" + else: + req = f"{req.name}{req.specifier}" + f.write(req + "\n") + constraints_file_path = f.name + args.extend(["-c", constraints_file_path]) + logging.debug("Command: %s", args) try: @@ -60,14 +72,16 @@ def install_packages_from_requirements_file( os.unlink(constraints_file_path) -def install_packages(requirements, pip_install_opts=None): +def install_packages(requirements, pip_install_opts=None, constraints="base"): with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: for req in requirements: f.write(req + "\n") reqs_file_path = f.name try: return install_packages_from_requirements_file( - reqs_file_path, pip_install_opts=pip_install_opts + reqs_file_path, + pip_install_opts=pip_install_opts, + constraints=constraints, ) finally: os.unlink(reqs_file_path) diff --git a/setup.py b/setup.py index b8d4f19700..ba29e3b791 100644 --- a/setup.py +++ b/setup.py @@ -20,9 +20,9 @@ from setuptools import find_packages, setup -DEP_SPECS = { +BASE_DEP_SPECS = { "aiohttp": ">= 3.9", - "aistudio_sdk": ">=0.3.5", + "aistudio-sdk": ">=0.3.5", "bce-python-sdk": ">= 0.9", "beautifulsoup4": "", "chardet": "", @@ -31,12 +31,10 @@ "decord": "== 0.6.0; (platform_machine == 'x86_64' or platform_machine == 'AMD64') and sys_platform != 'darwin'", "einops": "", "faiss-cpu": "", - "fastapi": ">= 0.110", "filelock": "", - "filetype": 
">= 1.2", "ftfy": "", "GPUtil": ">= 1.4", - "huggingface_hub": "", + "huggingface-hub": "", "imagesize": "", "Jinja2": "", "joblib": "", @@ -67,24 +65,22 @@ "ruamel.yaml": "", "scikit-image": "", "scikit-learn": "", + "sentencepiece": "", "shapely": "", "soundfile": "", - "starlette": ">= 0.36", "tiktoken": "", "tokenizers": ">= 0.19", "tqdm": "", "typing-extensions": "", "ujson": "", - "uvicorn": ">= 0.16", - "yarl": ">= 1.9", } REQUIRED_DEPS = [ - "aistudio_sdk", + "aistudio-sdk", "chardet", "colorlog", "filelock", - "huggingface_hub", + "huggingface-hub", "modelscope", "numpy", "packaging", @@ -121,6 +117,7 @@ # For the same reason as in `cv` "pypdfium2", "regex", + "sentencepiece", "tiktoken", ], "ie": [ @@ -176,6 +173,7 @@ "pypdfium2", "regex", "scikit-learn", + "sentencepiece", "shapely", "tiktoken", "tokenizers", @@ -199,14 +197,37 @@ ], }, "plugins": { + "genai-client": [ + "openai >= 1.63", + ], + "genai-sglang-server": [ + "einops", + "sglang [all] == 0.4.10.post2", + "torch == 2.7.1", + "transformers", + "xformers", + ], + "genai-vllm-server": [ + "einops", + "torch == 2.7.1", + "transformers", + "uvloop", + "vllm == 0.10.0", + "xformers", + ], + "paddle2onnx": [ + "paddle2onnx == 2.0.2rc3", + ], "serving": [ - "aiohttp", - "bce-python-sdk", - "fastapi", - "filetype", - "starlette", - "uvicorn", - "yarl", + "aiohttp >= 3.9", + "bce-python-sdk >= 0.9", + "fastapi >= 0.110", + "filetype >= 1.2", + "opencv-contrib-python == 4.10.0.84", + "pypdfium2 >= 4", + "starlette >= 0.36", + "uvicorn >= 0.16", + "yarl >= 1.9", ], }, } @@ -215,7 +236,7 @@ def _get_dep_specs(deps): dep_specs = [] for dep in deps: - val = DEP_SPECS[dep] + val = BASE_DEP_SPECS[dep] if not isinstance(val, list): val = [val] for v in val: @@ -243,16 +264,17 @@ def dependencies(): def extras(): dic = {} - all_dep_specs = set() - for group_name, group in EXTRAS.items(): - group_dep_specs = set() - for extra_name, extra_deps in group.items(): - extra_dep_specs = _get_dep_specs(extra_deps) - dic[extra_name] = _sort_dep_specs(extra_dep_specs) - group_dep_specs.update(extra_dep_specs) - dic[group_name] = _sort_dep_specs(group_dep_specs) - all_dep_specs.update(group_dep_specs) - dic["all"] = _sort_dep_specs(all_dep_specs) + + base_dep_specs = set() + for extra_name, extra_deps in EXTRAS["base"].items(): + extra_dep_specs = _get_dep_specs(extra_deps) + dic[extra_name] = _sort_dep_specs(extra_dep_specs) + base_dep_specs.update(extra_dep_specs) + dic["base"] = _sort_dep_specs(base_dep_specs) + + for extra_name, extra_dep_specs in EXTRAS["plugins"].items(): + dic[extra_name] = _sort_dep_specs(extra_dep_specs) + return dic @@ -295,19 +317,23 @@ def _recursively_find(pattern, exts=None): for p in itertools.chain( _recursively_find("paddlex/configs/*", exts=[".yml", ".yaml"]), ): - if Path(p).suffix in (".pyc", ".pyo"): - continue pkg_data.append(Path(p).relative_to("paddlex").as_posix()) pipeline_config = [ Path(p).relative_to("paddlex").as_posix() for p in glob.glob("paddlex/pipelines/*.yaml") ] - pkg_data.append("inference/pipelines/ppchatocrv3/ch_prompt.yaml") pkg_data.extend(pipeline_config) + pkg_data.append("inference/pipelines/ppchatocrv3/ch_prompt.yaml") pkg_data.append(".version") pkg_data.append("hpip_links.html") pkg_data.append("hpip_links_cu12.html") pkg_data.append("inference/utils/hpi_model_info_collection.json") + genai_chat_templates = [ + Path(p).relative_to("paddlex").as_posix() + for p in glob.glob("paddlex/inference/genai/chat_templates/*.jinja") + ] + pkg_data.extend(genai_chat_templates) + 
pkg_data.extend("inference/genai/models/") ops_file_dir = "paddlex/ops" ops_file_types = ["h", "hpp", "cpp", "cc", "cu"] return pkgs, { @@ -334,6 +360,10 @@ def _recursively_find(pattern, exts=None): entry_points={ "console_scripts": [ "paddlex = paddlex.__main__:console_entry", + "paddlex_genai_server = paddlex.inference.genai.server:run_genai_server", + ], + "vllm.general_plugins": [ + "register_paddlex_genai_models = paddlex.inference.genai.backends.vllm:register_models" ], }, # PyPI package information @@ -342,7 +372,6 @@ def _recursively_find(pattern, exts=None): "Intended Audience :: Developers", "Intended Audience :: Education", "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", @@ -355,6 +384,7 @@ def _recursively_find(pattern, exts=None): "Topic :: Software Development :: Libraries", "Topic :: Software Development :: Libraries :: Python Modules", ], - license="Apache 2.0", + license="Apache-2.0", + license_files=["LICENSE", "THIRD_PARTY_LICENSES/*/LICENSE"], keywords=["paddlepaddle"], )
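For reference, the `_genai_client_process` helper added to `DocVLMPredictor` issues a standard OpenAI-compatible chat completion in which the image travels as a base64 data URL next to the text query. The sketch below reproduces that request with the `openai` package (the only dependency of the new `genai-client` extra); the server URL, API key, model name, and prompt are placeholders for whatever OpenAI-compatible backend is being served, and the sampling settings mirror the non-FastDeploy branch of the helper.

# Sketch only: mirrors the payload built by _genai_client_process.
# base_url, api_key, model name, and the "OCR:" prompt are placeholders.
import base64

from openai import OpenAI

client = OpenAI(base_url="http://127.0.0.1:8080/v1", api_key="EMPTY")

with open("sample.jpg", "rb") as f:
    image_url = "data:image/jpeg;base64," + base64.b64encode(f.read()).decode("ascii")

chat_completion = client.chat.completions.create(
    model="PaddleOCR-VL",
    messages=[
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": image_url}},
                {"type": "text", "text": "OCR:"},
            ],
        }
    ],
    temperature=0,  # FastDeploy servers get temperature=1, top_p=0 instead
    max_completion_tokens=8192,  # default applied for PaddleOCR-VL when max_new_tokens is unset
    extra_body={"skip_special_tokens": False},  # forwarded only to fastdeploy/vllm/sglang servers
)
print(chat_completion.choices[0].message.content)

Both branches amount to greedy-style decoding, which keeps OCR outputs reproducible across the supported server backends.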
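Separately, the relationship between `smart_resize`, the `image_grid_thw` returned by `SiglipImageProcessor`, and the number of `<|image_pad|>` placeholders expanded by `PPOCRVLProcessor.preprocess` can be checked with a short worked example. The 448x448 input below is an arbitrary illustration; the constants are the defaults added in `_siglip.py`.

# Worked example (sketch) with the defaults patch_size=14, merge_size=2,
# temporal_patch_size=1. 448 is divisible by factor=28 and 448*448 lies in
# [28*28*130, 28*28*1280], so smart_resize leaves the image size unchanged.
patch_size, merge_size = 14, 2
height = width = 448

grid_t = 1                     # temporal_patch_size == 1 for still images
grid_h = height // patch_size  # 32
grid_w = width // patch_size   # 32

num_patches = grid_t * grid_h * grid_w                      # 1024 rows in pixel_values
num_image_tokens = num_patches // merge_size // merge_size  # 256 <|image_pad|> tokens
print(num_patches, num_image_tokens)  # 1024 256

This is the same arithmetic `preprocess` applies when it replaces the single `<|image_pad|>` token emitted by the chat template with one placeholder per merged patch.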