Add restrictions on export formats for activation-quantized models (#480)

n1ck-guo · web-flow · commit af5650875a40 · 2025-03-28T11:46:34.000+08:00
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
@@ -293,7 +293,7 @@ def __init__(
             "act_sym",
             "act_dynamic",
             "act_data_type",
-            "super_bits", 
+            "super_bits",
             "super_group_size"
         ]
 
@@ -428,8 +428,26 @@ def quantize_and_save(self, output_dir: str = "tmp_autoround", format: str = "au
                 logger.error(f"Unsupported format {format_}, please choose from {supported_formats}")
                 exit(-1)
 
+        # Act-quant (act_bits <= 8) limits export: fp8 activations -> auto_round only, else fake only
+        if self.act_bits <= 8:
+            if "fp8" not in self.act_data_type:
+                if len(formats) > 1 or "fake" not in formats:
+                    logger.warning(
+                        f"Currently only support to export auto_round format quantized model"
+                        " with fp8 dtype activation for activation quantization."
+                        " Change format to fake and save."
+                        )
+                    formats = ["fake"]
+            else:
+                if len(formats) > 1 or "auto_round" not in formats:
+                    logger.warning(
+                        f"Currently only support to export auto_round format for W{self.bits}AFP8 model,"
+                        " change format to auto_round"
+                    )
+                    formats = ["auto_round"]
+
         # If multiple formats are specified, enforce inplace=False
-        if len(format) > 1:
+        if len(formats) > 1:
             inplace = False
         inplace = kwargs.get("inplace", inplace)
         kwargs.pop("inplace", None)
@@ -1502,6 +1520,24 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
         Returns:
             object: The compressed model object.
         """
+        # Act-quant (act_bits <= 8) limits export: fp8 activations -> auto_round only, else fake only
+        if self.act_bits <= 8:
+            if "fp8" not in self.act_data_type:
+                if format != "fake":
+                    logger.warning(
+                        f"Currently only support to export auto_round format quantized model"
+                        " with fp8 dtype activation for activation quantization."
+                        " Change format to fake and save."
+                        )
+                    format = "fake"
+            else:
+                if format != "auto_round":
+                    logger.warning(
+                        f"Currently only support to export auto_round format for W{self.bits}AFP8 model,"
+                        " change format to auto_round"
+                    )
+                    format = "auto_round"
+
         if self.low_cpu_mem_usage:
             self.model = self.model.to('cpu')