diff --git a/QEfficient/base/modeling_qeff.py b/QEfficient/base/modeling_qeff.py index 6c9f88d9f..6ecbf0fc0 100644 --- a/QEfficient/base/modeling_qeff.py +++ b/QEfficient/base/modeling_qeff.py @@ -146,14 +146,20 @@ def compile(self, *args, **kwargs) -> Path: :mxfp6_matmul (bool): Use MXFP6 to compress weights for MatMul nodes to run faster on device. ``Defaults to False``. :mxint8_kv_cache (bool): Use MXINT8 to compress KV-cache on device to access and update KV-cache faster. ``Defaults to False``. :compiler_options: Pass any compiler option as input. - Following flag can be passed in compiler_options to enable QNN Compilation path. - :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.`` - :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed`` - for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: - - aic_num_cores=16 -> -aic-num-cores=16 - - convert_to_fp16=True -> -convert-to-fp16 + + Following flag can be passed in compiler_options to enable QNN Compilation path. + :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False. if not passed.`` + :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None. if not passed`` + + for QAIC compilation path, any flag that is supported by ``qaic-exec`` can be passed. Params are converted to flags as below: + + - aic_num_cores=16 -> -aic-num-cores=16 + - convert_to_fp16=True -> -convert-to-fp16 + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 ``QEFFAutoModelForCausalLM`` Args: + :full_batch_size (int): Full batch size to allocate cache lines. :batch_size (int): Batch size to compile for. ``Defaults to 1``. :prefill_seq_len (int): Prefill sequence length to compile for. Prompt will be chunked according to this length. @@ -311,8 +317,12 @@ def _compile( :qnn_config (str): Path of QNN Config parameters file. 
Any extra parameters for QNN compilation can be passed via this file. ``Defaults to None.`` :compiler_options: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: + - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 + For QNN Compilation path, when enable_qnn is set to True, any parameter passed in compiler_options will be ignored. """ if onnx_path is None and self.onnx_path is None: @@ -344,7 +354,13 @@ def _compile( return self.qpc_path - command = constants.COMPILER + [f"-m={onnx_path}"] + command = ( + constants.COMPILER + + [ + f"-aic-hw-version={compiler_options.pop('aic_hw_version', compiler_options.pop('aic-hw-version', constants.DEFAULT_AIC_HW_VERSION))}" + ] + + [f"-m={onnx_path}"] + ) if mdp_ts_json_path := compiler_options.pop("mdp_load_partition_config", None): command.append(f"-mdp-load-partition-config={mdp_ts_json_path}") diff --git a/QEfficient/cloud/infer.py b/QEfficient/cloud/infer.py index 1c620ad7d..95310586a 100644 --- a/QEfficient/cloud/infer.py +++ b/QEfficient/cloud/infer.py @@ -143,8 +143,12 @@ def main( :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` :trust_remote_code (bool): Trust remote code execution. ``Defaults to False.`` :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: - -allocator_dealloc_delay=1 -> -allocator-dealloc-delay=1 - -qpc_crc=True -> -qpc-crc + + - `allocator_dealloc_delay=1` → `-allocator-dealloc-delay=1` + - `qpc_crc=True` → `-qpc-crc` + - `aic_hw_version=ai100` → `-aic-hw-version=ai100` + - `aic_hw_version=ai200` → `-aic-hw-version=ai200` + .. 
code-block:: bash diff --git a/QEfficient/compile/compile_helper.py b/QEfficient/compile/compile_helper.py index 56177cce9..3c9d370e3 100644 --- a/QEfficient/compile/compile_helper.py +++ b/QEfficient/compile/compile_helper.py @@ -13,6 +13,7 @@ from typing import List, Optional, Tuple from QEfficient.compile.qnn_compiler import compile as qnn_compile +from QEfficient.utils import constants from QEfficient.utils._utils import load_json, load_yaml from QEfficient.utils.logging_utils import logger @@ -77,7 +78,7 @@ def compile_kv_model_on_cloud_ai_100( "/opt/qti-aic/exec/qaic-exec", f"-m={onnx_path}", "-aic-hw", - "-aic-hw-version=2.0", + f"-aic-hw-version={kwargs.pop('aic_hw_version', kwargs.pop('aic-hw-version', constants.DEFAULT_AIC_HW_VERSION))}", f"-network-specialization-config={specializations_json}", "-convert-to-fp16", "-retained-state", @@ -167,6 +168,10 @@ def compile( :allow_mxint8_mdp_io (bool): Allows MXINT8 compression of MDP IO traffic ``Defaults to False.`` :enable_qnn (bool): Enables QNN Compilation. ``Defaults to False.`` :qnn_config (str): Path of QNN Config parameters file. ``Defaults to None.`` + :kwargs: Pass any compiler option as input. Any flag that is supported by `qaic-exec` can be passed. Params are converted to flags as below: + + - `aic_hw_version=ai100` → `-aic-hw-version=ai100` + - `aic_hw_version=ai200` → `-aic-hw-version=ai200` Returns: :str: Path to compiled ``qpc`` package. diff --git a/QEfficient/transformers/models/modeling_auto.py b/QEfficient/transformers/models/modeling_auto.py index b3d27f3a5..42898381d 100644 --- a/QEfficient/transformers/models/modeling_auto.py +++ b/QEfficient/transformers/models/modeling_auto.py @@ -287,12 +287,20 @@ def compile( :num_cores (int): Number of cores used to compile the model. :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. :compiler_options (dict, optional): Additional compiler options. 
+ For QAIC Compiler: Extra arguments for qaic-exec can be passed. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` + + Params are converted to flags as below: + + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 + For QNN Compiler: Following arguments can be passed. :enable_qnn (bool): Enables QNN Compilation. :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. + Returns: :str: Path of the compiled ``qpc`` package. """ @@ -1701,13 +1709,19 @@ def compile( :num_speculative_tokens (int, optional): Number of speculative tokens to take as input for Speculative Decoding Target Language Model. :prefill_only (bool): if ``True`` compile for prefill only and if ``False`` compile for decode only. Defaults to None, which compiles for both ``prefill and ``decode``. :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``. + For QAIC Compiler: Extra arguments for qaic-exec can be passed. :mos (int, optional): Effort level to reduce on-chip memory. Defaults to -1, meaning no effort. ``Defaults to -1``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. :allow_mxint8_mdp_io (bool, optional): Allows MXINT8 compression of MDP IO traffic. ``Defaults to False.`` + Params are converted to flags as below: + - aic_num_cores=16 -> -aic-num-cores=16 - convert_to_fp16=True -> -convert-to-fp16 + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 + For QNN Compiler: Following arguments can be passed. :enable_qnn (bool): Enables QNN Compilation. :qnn_config (str): Path of QNN Config parameters file. Any extra parameters for QNN compilation can be passed via this file. 
@@ -1960,6 +1974,10 @@ def compile( :num_cores (int): Number of cores used to compile the model. :mxfp6_matmul (bool, optional): Whether to use ``mxfp6`` compression for weights. ``Defaults to False``. :aic_enable_depth_first (bool, optional): Enables DFS with default memory size. ``Defaults to False``. + :compiler_options (dict, optional): Additional compiler options. ``Defaults to None``. + + - aic_hw_version=ai100 -> -aic-hw-version=ai100 + - aic_hw_version=ai200 -> -aic-hw-version=ai200 Other args are not yet implemented for AutoModelForSpeechSeq2Seq Returns: diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index cc52658c6..92d0b32f2 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -85,7 +85,8 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99 ONNX_EXPORT_OPSET = 13 -COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw", "-aic-hw-version=2.0"] +COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw"] +DEFAULT_AIC_HW_VERSION = "ai100" # InternVL constants # Fixing the feature size with reference to OpenGVLab/InternVL2_5-1B, OpenGVLab/InternVL2_5-38B and OpenGVLab/InternVL2_5-78B diff --git a/docs/source/quick_start.md b/docs/source/quick_start.md index 233fb491a..0aa294e36 100644 --- a/docs/source/quick_start.md +++ b/docs/source/quick_start.md @@ -116,7 +116,20 @@ To disable MQ, just pass single soc like below, below step will compile the mode ```bash python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device-group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first ``` +### Device Selection for Inference +You can choose which device to run your inference on. By default, it will run on **AI 100 Core**. 
+ +To specify a different device, use the `aic_hw_version` option: +``` +aic_hw_version = 'ai100' # Default +aic_hw_version = 'ai200' # To run on AI 200 Core +``` + + +```bash +python -m QEfficient.cloud.infer --model_name gpt2 --batch_size 1 --prompt_len 32 --ctx_len 128 --mxfp6 --num_cores 16 --device_group [0] --prompt "My name is" --mos 1 --aic_enable_depth_first --aic_hw_version ai100 +``` ### Continuous Batching Users can compile a model utilizing the continuous batching feature by specifying full_batch_size in the infer and compiler APIs. If full_batch_size is not provided, the model will be compiled in the regular way.