Adding aic-hw-version Compile Options Support (#528)

abukhoy · web-flow · commit faab24548845 · 2025-09-05T11:58:16.000+05:30
This pull request introduces support for compile-time options via
keyword arguments (`kwargs`), including the `aic-hw-version` parameter,
which now accepts values `"ai100"` or `"ai200"`. If no value is
provided, the default is `"ai100"`, representing the AI100 hardware.

These enhancements allow users to tailor the `compile` API to better
suit their specific requirements.

### Example Usage:
```python
from QEfficient import QEFFAutoModelForCausalLM
from transformers import AutoTokenizer

model_name = "gpt2"
model = QEFFAutoModelForCausalLM.from_pretrained(model_name, num_hidden_layers=2)

model.compile(prefill_seq_len=128, ctx_len=256, num_cores=16, num_devices=1, **{'aic-hw-version': 'ai100'})

tokenizer = AutoTokenizer.from_pretrained(model_name)
model.generate(prompts=["Hi there!!"], tokenizer=tokenizer)
```

> **Note:** Previously, the default value for `aic-hw-version` was
`"2.0"`, which implicitly referred to AI100. This value is now
deprecated and replaced with the explicit `"ai100"` identifier.

---------

Signed-off-by: Abukhoyer Shaik <abukhoye@qti.qualcomm.com>
diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py
@@ -146,14 +146,20 @@ def compile(self, *args, **kwargs) -> Path:
             :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``.
             :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``.
             :compiler_options: Pass any compiler option as input.
-            Following flag can be passed in compiler_options to enable QNN Compilation path.
-                :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.``
-                :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed``
-            for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
-                - aic_num_cores=16 -> -aic-num-cores=16
-                - convert_to_fp16=True -> -convert-to-fp16
+
+                The following flags can be passed in compiler_options to enable the QNN compilation path.
+                    :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False`` if not passed.
+                    :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None`` if not passed.
+
+                For the QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below:
+
+                    - aic_num_cores=16 -> -aic-num-cores=16
+                    - convert_to_fp16=True -> -convert-to-fp16
+                    - aic_hw_version=ai100 -> -aic-hw-version=ai100
+                    - aic_hw_version=ai200 -> -aic-hw-version=ai200
 
         ``QEFFAutoModelForCausalLM`` Args:
+
             :full_batch_size (int): Full batch size to allocate cache lines.
             :batch_size (int): Batch size to compile for. ``Defaults to 1``.
             :prefill_seq_len (int): Prefill sequence length to compile for. Prompt will be chunked according to this length.
@@ -311,8 +317,12 @@ def _compile(
             :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.``
             :compiler_options: Pass any compiler option as input.
                 Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+
                 - aic_num_cores=16 -> -aic-num-cores=16
                 - convert_to_fp16=True -> -convert-to-fp16
+                - aic_hw_version=ai100 -> -aic-hw-version=ai100
+                - aic_hw_version=ai200 -> -aic-hw-version=ai200
+
                 For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored.
         """
         if onnx_path is None and self.onnx_path is None:
@@ -344,7 +354,13 @@ def _compile(
 
             return self.qpc_path
 
-        command = constants.COMPILER + [f"-m={onnx_path}"]
+        command = (
+            constants.COMPILER
+            + [
+                f"-aic-hw-version={compiler_options.pop('aic_hw_version', compiler_options.pop('aic-hw-version', constants.DEFAULT_AIC_HW_VERSION))}"
+            ]
+            + [f"-m={onnx_path}"]
+        )
 
         if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None):
             command.append(f"-mdp-load-partition-config={mdp_ts_json_path}")
diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py
@@ -143,8 +143,12 @@ def main(
         :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
         :trust_remote_code (bool): Trust remote code execution. ``Defaults to False.``
         :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
-                -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1
-                -qpc_crc=True -> -qpc-crc
+
+            - `allocator_dealloc_delay=1` → `-allocator-dealloc-delay=1`
+            - `qpc_crc=True` → `-qpc-crc`
+            - `aic_hw_version=ai100` → `-aic-hw-version=ai100`
+            - `aic_hw_version=ai200` → `-aic-hw-version=ai200`
+
 
     .. code-block:: bash
 
diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py
@@ -13,6 +13,7 @@
 from typing import List, Optional, Tuple
 
 from QEfficient.compile.qnn_compiler import compile as qnn_compile
+from QEfficient.utils import constants
 from QEfficient.utils._utils import load_json, load_yaml
 from QEfficient.utils.logging_utils import logger
 
@@ -77,7 +78,7 @@ def compile_kv_model_on_cloud_ai_100(
         "/opt/qti-aic/exec/qaic-exec",
         f"-m={onnx_path}",
         "-aic-hw",
-        "-aic-hw-version=2.0",
+        f"-aic-hw-version={kwargs.pop('aic_hw_version', kwargs.pop('aic-hw-version', constants.DEFAULT_AIC_HW_VERSION))}",
         f"-network-specialization-config={specializations_json}",
         "-convert-to-fp16",
         "-retained-state",
@@ -167,6 +168,10 @@ def compile(
         :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.``
         :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.``
         :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.``
+        :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below:
+
+            - `aic_hw_version=ai100` → `-aic-hw-version=ai100`
+            - `aic_hw_version=ai200` → `-aic-hw-version=ai200`
 
     Returns:
         :str: Path to compiled ``qpc`` package.
diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py
@@ -287,12 +287,20 @@ def compile(
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
             :compiler_options (dict, optional): Additional compiler options.
+
                 For QAIC Compiler: Extra arguments for qaic-exec can be passed.
                     :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
                     :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+
+                    Params are converted to flags as below:
+
+                    - aic_hw_version=ai100 -> -aic-hw-version=ai100
+                    - aic_hw_version=ai200 -> -aic-hw-version=ai200
+
                 For QNN Compiler: Following arguments can be passed.
                     :enable_qnn (bool): Enables QNN Compilation.
                     :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
+
         Returns:
             :str: Path of the compiled ``qpc`` package.
         """
@@ -1701,13 +1709,19 @@ def compile(
             :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model.
             :prefill_only (bool): if ``True`` compile for prefill only and if ``False`` compile for decode only. Defaults to None, which compiles for both ``prefill and ``decode``.
             :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``.
+
                 For QAIC Compiler: Extra arguments for qaic-exec can be passed.
                     :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``.
                     :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
                     :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.``
+
                     Params are converted to flags as below:
+
                     - aic_num_cores=16 -> -aic-num-cores=16
                     - convert_to_fp16=True -> -convert-to-fp16
+                    - aic_hw_version=ai100 -> -aic-hw-version=ai100
+                    - aic_hw_version=ai200 -> -aic-hw-version=ai200
+
                 For QNN Compiler: Following arguments can be passed.
                     :enable_qnn (bool): Enables QNN Compilation.
                     :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file.
@@ -1960,6 +1974,10 @@ def compile(
             :num_cores (int): Number of cores used to compile the model.
             :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``.
             :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``.
+            :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``.
+
+                - aic_hw_version=ai100 -> -aic-hw-version=ai100
+                - aic_hw_version=ai200 -> -aic-hw-version=ai200
 
             Other args are not yet implemented for AutoModelForSpeechSeq2Seq
         Returns:
diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py
@@ -85,7 +85,8 @@ def get_models_dir():
 ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99
 ONNX_EXPORT_OPSET = 13
 
-COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"]
+COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw"]
+DEFAULT_AIC_HW_VERSION = "ai100"
 
 # InternVL constants
 # Fixing the feature size with reference to OpenGVLab/InternVL2_5-1B, OpenGVLab/InternVL2_5-38B and OpenGVLab/InternVL2_5-78B
diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md
@@ -116,7 +116,20 @@ To disable MQ, just pass single soc like below, below step will compile the mode
 ```bash
 python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first
 ```
+### Device Selection for Inference
 
+You can choose which device to run your inference on. By default, it will run on **AI 100 Core**.
+
+To specify a different device, use the `aic-hw-version` option:
+```
+aic-hw-version = 'ai100'  # Default
+aic-hw-version = 'ai200'  # To run on AI 200 Core
+```
+
+
+```bash
+python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --aic-hw-version ai100
+```
 ### Continuous Batching
 
 Users can compile a model utilizing the continuous batching feature by specifying full_batch_size <full_batch_size_value> in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way.