Skip to content

Commit e4528e9

Browse files
authored
save processor automatically (#372)
1 parent 3ac377b commit e4528e9

File tree

5 files changed

+44
-28
lines changed

5 files changed

+44
-28
lines changed

auto_round/__main__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,6 @@ def run_lmms():
5353
lmms_eval(args)
5454

5555
def switch():
56-
# if "--lmms" in sys.argv:
57-
# sys.argv.remove("--lmms")
58-
# run_lmms()
5956
if "--mllm" in sys.argv:
6057
sys.argv.remove("--mllm")
6158
run_mllm()

auto_round/autoround.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1263,6 +1263,9 @@ def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **k
12631263
self.model.save_pretrained(output_dir)
12641264
if self.tokenizer is not None:
12651265
self.tokenizer.save_pretrained(output_dir)
1266+
processor = kwargs.get("processor", None)
1267+
if processor is not None:
1268+
processor.save_pretrained(output_dir)
12661269
return
12671270

12681271
from auto_round.export import EXPORT_FORMAT

auto_round/mllm/autoround_mllm.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ def _only_text_test(model, tokenizer, device):
3838
tokenizer.padding_side = 'left'
3939
if tokenizer.pad_token is None:
4040
tokenizer.pad_token = tokenizer.eos_token
41-
if device != model.device.type:
41+
if device.split(':')[0] != model.device.type:
4242
model = model.to(device)
4343
inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model.device)
4444
model(**inputs)
@@ -150,19 +150,20 @@ def __init__(
150150
self.to_quant_block_names = to_quant_block_names
151151
self.extra_data_dir = extra_data_dir
152152
self.quant_nontext_module = quant_nontext_module
153+
self.processor = processor
153154
self.image_processor = image_processor
154155
self.template = template if template is not None else model.config.model_type
155156
if not isinstance(dataset, torch.utils.data.DataLoader):
156157
self.template = get_template(
157158
self.template, model=model, tokenizer=tokenizer, processor=processor, image_processor=image_processor)
158-
159-
dataset = self.template.default_dataset if dataset is None else dataset
159+
dataset = self.template.default_dataset if dataset is None else dataset
160160

161161
from ..calib_dataset import CALIB_DATASETS
162162
from .mllm_dataset import MLLM_DATASET
163163
if isinstance(dataset, str):
164164
if quant_nontext_module or \
165-
(dataset in CALIB_DATASETS.keys() and not _only_text_test(model, tokenizer, device)):
165+
(dataset in CALIB_DATASETS.keys() and not \
166+
_only_text_test(model, tokenizer, device)):
166167
if quant_nontext_module:
167168
logger.warning(f"Text only dataset cannot be used for calibrating non-text modules,"
168169
"switching to liuhaotian/llava_conv_58k")
@@ -372,4 +373,20 @@ def calib(self, nsamples, bs):
372373
m = m.to("meta")
373374
# torch.cuda.empty_cache()
374375

376+
def save_quantized(self, output_dir=None, format="auto_round", inplace=True, **kwargs):
377+
"""Save the quantized model to the specified output directory in the specified format.
378+
379+
Args:
380+
output_dir (str, optional): The directory to save the quantized model. Defaults to None.
381+
format (str, optional): The format in which to save the model. Defaults to "auto_round".
382+
inplace (bool, optional): Whether to modify the model in place. Defaults to True.
383+
**kwargs: Additional keyword arguments specific to the export format.
375384
385+
Returns:
386+
object: The compressed model object.
387+
"""
388+
if self.processor is not None and not hasattr(self.processor, "chat_template"):
389+
self.processor.chat_template = None
390+
compressed_model = super().save_quantized(
391+
output_dir=output_dir, format=format, inplace=inplace, processor=self.processor, **kwargs)
392+
return compressed_model

auto_round/mllm/template.py

Lines changed: 19 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -118,24 +118,25 @@ def _register_template(
118118

119119
def load_template(path: str):
120120
"""Load template information from a json file."""
121-
data = json.load(open(path, "r"))
122-
if "model_type" not in data:
123-
data["model_type"] = "user_define"
124-
if "replace_tokens" in data and data["replace_tokens"] is not None:
125-
assert len(data["replace_tokens"]) % 2 == 0, \
126-
"the format of replace_tokens should be [old_tag1, replace_tag1, old_tag2, replace_tag2]"
127-
temp = []
128-
for i in range(0, len(data["replace_tokens"]), 2):
129-
temp.append((data["replace_tokens"][i], data["replace_tokens"][i + 1]))
130-
data["replace_tokens"] = temp
131-
if "processor" in data:
132-
assert data["processor"] in PROCESSORS.keys(), \
133-
"{} is not supported, current support: {}".format(data["processor"], ",".join(PROCESSORS.keys()))
134-
data["processor"] = PROCESSORS[data["processor"]]
135-
template = _register_template(
136-
**data
137-
)
138-
return template
121+
with open(path, "r") as file:
122+
data = json.load(file)
123+
if "model_type" not in data:
124+
data["model_type"] = "user_define"
125+
if "replace_tokens" in data and data["replace_tokens"] is not None:
126+
assert len(data["replace_tokens"]) % 2 == 0, \
127+
"the format of replace_tokens should be [old_tag1, replace_tag1, old_tag2, replace_tag2]"
128+
temp = []
129+
for i in range(0, len(data["replace_tokens"]), 2):
130+
temp.append((data["replace_tokens"][i], data["replace_tokens"][i + 1]))
131+
data["replace_tokens"] = temp
132+
if "processor" in data:
133+
assert data["processor"] in PROCESSORS.keys(), \
134+
"{} is not supported, current support: {}".format(data["processor"], ",".join(PROCESSORS.keys()))
135+
data["processor"] = PROCESSORS[data["processor"]]
136+
template = _register_template(
137+
**data
138+
)
139+
return template
139140

140141

141142
def _load_preset_template():

auto_round/script/mllm.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,13 +418,11 @@ def tune(args):
418418
inplace = False if len(format_list) > 1 else True
419419
for format_ in format_list:
420420
eval_folder = f'{export_dir}-{format_}'
421-
if processor is not None and not hasattr(processor, "chat_template"):
422-
processor.chat_template = None
423421
safe_serialization = True
424422
if "phi3_v" in model_type:
425423
safe_serialization = False
426424
autoround.save_quantized(
427-
eval_folder, format=format_, inplace=inplace, processor=processor, safe_serialization=safe_serialization)
425+
eval_folder, format=format_, inplace=inplace, safe_serialization=safe_serialization)
428426

429427

430428
def eval(args):

0 commit comments

Comments (0)