
Commit 2ffadfd

fix gpu ut (#376)

1 parent: ecc17be

File tree

3 files changed: +72 −13 lines


auto_round/mllm/autoround_mllm.py

Lines changed: 2 additions & 0 deletions

@@ -19,6 +19,7 @@
 from ..utils import (
     logger,
+    detect_device,
     to_device,
     to_dtype,
     get_multimodal_block_names,
@@ -34,6 +35,7 @@
 def _only_text_test(model, tokenizer, device):
     """Test if the model whether can use text-only datasets."""
     try:
+        device = detect_device(device)
         text = ["only text", "test"]
         tokenizer.padding_side = 'left'
         if tokenizer.pad_token is None:
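The point of this change: `_only_text_test` previously moved inputs to whatever device string the caller passed, so a loose spec such as "auto" could fail on the GPU runner. Resolving the argument through `detect_device` first yields a concrete device before any tensors move. As a rough illustration, a device-resolution helper of this kind typically looks like the sketch below (hypothetical; the real `detect_device` in `auto_round.utils` may support more backends and edge cases):

    import torch

    def detect_device(device=None):
        """Resolve a loose device spec (None, "auto", an int index, or a name)
        to a concrete device string."""
        if device is None or device == "auto":
            # Hypothetical fallback: prefer the first available accelerator.
            return "cuda" if torch.cuda.is_available() else "cpu"
        if isinstance(device, int):
            # Treat a bare integer as a CUDA device index when CUDA is present.
            return f"cuda:{device}" if torch.cuda.is_available() else "cpu"
        return str(device)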

test_cuda/test_support_vlms.py

Lines changed: 5 additions & 1 deletion

@@ -74,6 +74,7 @@ def test_qwen2(self):
             generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
         )
         print(output_text[0])
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_phi3(self):
         model_path = "/models/Phi-3.5-vision-instruct/"
@@ -129,8 +130,8 @@ def test_phi3(self):
         response = processor.batch_decode(generate_ids,
                                           skip_special_tokens=True,
                                           clean_up_tokenization_spaces=False)[0]
-
         print(response)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_llava(self):
         model_path = "/models/llava-v1.5-7b/"
@@ -166,6 +167,7 @@ class DataArgs:
 
         output = model.generate(inputs['input_ids'].to(model.device), images=image_input.unsqueeze(0).half(), max_new_tokens=50)
         print(tokenizer.batch_decode(output))
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_llama(self):
         model_path = "/models/Llama-3.2-11B-Vision-Instruct/"
@@ -204,6 +206,7 @@ def test_llama(self):
 
         output = model.generate(**inputs, max_new_tokens=50)
         print(processor.decode(output[0]))
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
     def test_cogvlm(self):
         model_path = "/models/cogvlm2-llama3-chat-19B/"
@@ -257,6 +260,7 @@ def test_cogvlm(self):
         response = tokenizer.decode(outputs[0])
         response = response.split("<|end_of_text|>")[0]
         print(response)
+        shutil.rmtree(quantized_model_path, ignore_errors=True)
 
 if __name__ == "__main__":
     unittest.main()
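Each test above now deletes its quantized checkpoint with `shutil.rmtree(quantized_model_path, ignore_errors=True)` after decoding, so repeated CI runs do not accumulate multi-gigabyte artifacts on the CUDA machine. One caveat with a trailing rmtree is that it is skipped when an earlier assertion or generation step raises; unittest's `addCleanup` runs in either case. A minimal sketch of that alternative (class and directory names are illustrative, not from this commit):

    import shutil
    import unittest

    class TestSupportVLMsCleanup(unittest.TestCase):
        def test_qwen2(self):
            quantized_model_path = "./saved"  # assumed output directory
            # Registered up front, so the directory is removed even if the
            # quantization or generation code below raises.
            self.addCleanup(shutil.rmtree, quantized_model_path, ignore_errors=True)
            # ... quantize, run inference, and assert as in the tests above ...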

test_cuda/test_vlms.py

Lines changed: 65 additions & 12 deletions

@@ -3,10 +3,12 @@
 import sys
 import unittest
 import re
+import os
 
 sys.path.insert(0, "..")
 
 from PIL import Image
+from auto_round import AutoRoundConfig
 import requests
 
 
@@ -38,7 +40,7 @@ def tearDownClass(self):
         # assert (
         #     res == """<s> There is a girl who likes adventure, and she is looking for a partner to go on a treasure hunt. She has found a map that leads to a hidden treasure, but she needs a partner to help her decipher the clues and find the treasure. You""")
 
-    def inference(self, quantized_model_dir):
+    def qwen_inference(self, quantized_model_dir):
         from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(quantized_model_dir)
         processor = AutoProcessor.from_pretrained(quantized_model_dir, trust_remote_code=True)
@@ -104,19 +106,68 @@ def test_vlm_tune(self):
 
         quantized_model_path = self.save_dir
         autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False)
-        self.inference(quantized_model_path)
+        self.qwen_inference(quantized_model_path)
         shutil.rmtree(self.save_dir, ignore_errors=True)
         autoround.save_quantized(quantized_model_path, format='auto_gptq', inplace=False)
-        self.inference(quantized_model_path)
+        self.qwen_inference(quantized_model_path)
         shutil.rmtree(self.save_dir, ignore_errors=True)
 
+    def phi3_infernece(self, quantized_model_dir):
+        from transformers import AutoModelForCausalLM, AutoProcessor
+        quantized_model_path = os.path.join(quantized_model_dir, "Phi-3.5-vision-instruct-w4g128-auto_round")
+        res = os.system(f"cp /models/Phi-3.5-vision-instruct/*.py {quantized_model_path}")
+        model = AutoModelForCausalLM.from_pretrained(
+            quantized_model_path,
+            device_map="auto",
+            trust_remote_code=True,
+            torch_dtype="auto"
+        )
+        processor = AutoProcessor.from_pretrained(quantized_model_path,
+                                                  trust_remote_code=True,
+                                                  num_crops=4
+                                                  )
+
+        image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+        content = "Describe this image."
+        messages = [
+            {"role": "user",
+             "content": "<|image_1|>\n"+content},
+        ]
+
+        prompt = processor.tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        image_inputs = Image.open(requests.get(image_url, stream=True).raw)
+        inputs = processor(prompt, image_inputs, return_tensors="pt").to(model.device)
+
+        generation_args = {
+            "max_new_tokens": 1000,
+            "temperature": 0.0,
+            "do_sample": False,
+        }
+
+        generate_ids = model.generate(**inputs,
+                                      eos_token_id=processor.tokenizer.eos_token_id,
+                                      **generation_args
+                                      )
+
+        # remove input tokens
+        generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
+        response = processor.batch_decode(generate_ids,
+                                          skip_special_tokens=True,
+                                          clean_up_tokenization_spaces=False)[0]
+
+        print(response)
+
     def test_quant_not_text(self):
         from auto_round import AutoRoundMLLM
-        from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer
+        from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer
 
         ## load the model
-        model_name = "/models/Qwen2-VL-2B-Instruct"
-        model = Qwen2VLForConditionalGeneration.from_pretrained(
+        model_name = "/models/Phi-3.5-vision-instruct"
+        model = AutoModelForCausalLM.from_pretrained(
             model_name, trust_remote_code=True)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
         processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
@@ -127,19 +178,21 @@ def test_quant_not_text(self):
             bits=bits, group_size=group_size, sym=sym, iters=1, nsamples=1,quant_nontext_module=True)
         autoround.quantize()
 
-        quantized_model_path = "./saved"
-        autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False)
-        self.inference(quantized_model_path)
+        quantized_model_path = "./saved/Phi-3.5-vision-instruct-w4g128-auto_round"
+        autoround.save_quantized(quantized_model_path, format='auto_round', inplace=False, safe_serialization=False)
+        self.phi3_infernece("./saved")
         shutil.rmtree("./saved", ignore_errors=True)
 
     def test_quant_not_text_fp_layers(self):
         import os
         python_path = sys.executable
         absolute_path = os.path.abspath(self.save_dir)
         res = os.system(
-            f"cd .. && {python_path} -m auto_round --mllm --model /models/Qwen2-VL-2B-Instruct --fp_layers model.layers.27,visual.blocks.29 --quant_nontext_module --iters 1 --nsamples 1 --output_dir {absolute_path}")
-        self.inference(os.path.join(absolute_path,"Qwen2-VL-2B-Instruct-w4g128-auto_round"))
-        shutil.rmtree(os.path.join(absolute_path,"Qwen2-VL-2B-Instruct-w4g128-auto_round"), ignore_errors=True)
+            f"cd .. && {python_path} -m auto_round --mllm --model /models/Phi-3.5-vision-instruct "
+            f"--fp_layers model.layers.27,model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16 "
+            f"--quant_nontext_module --iters 1 --nsamples 1 --output_dir {absolute_path}")
+        self.phi3_infernece(absolute_path)
+        shutil.rmtree(absolute_path, ignore_errors=True)
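Two loading details in this file are easy to miss. The new top-level `from auto_round import AutoRoundConfig` import makes the auto_round quantization format known to transformers before any quantized checkpoint is opened. And the `cp /models/Phi-3.5-vision-instruct/*.py {quantized_model_path}` call in `phi3_infernece` copies Phi-3.5-vision's `trust_remote_code` modeling files next to the quantized weights, which the quantized export evidently does not include. A shell-free sketch of that same copy (paths taken from the test above):

    import glob
    import os
    import shutil

    src = "/models/Phi-3.5-vision-instruct"
    dst = "./saved/Phi-3.5-vision-instruct-w4g128-auto_round"
    for py_file in glob.glob(os.path.join(src, "*.py")):
        # Place the remote-code files beside the quantized weights so that
        # from_pretrained(..., trust_remote_code=True) can import them.
        shutil.copy(py_file, dst)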
