
Commit d2cbed9

Temporarily close qxk api for new release (#478)
* Temporarily close qxk api for new release
1 parent 050594f commit d2cbed9

6 files changed: +27 −25 lines changed


README.md

Lines changed: 2 additions & 2 deletions

@@ -157,8 +157,8 @@ autoround = AutoRound(model, tokenizer, bits=bits, group_size=group_size, sym=sy
 ## the best accuracy, 3X slower, low_gpu_mem_usage could save ~20G but ~30% slower
 # autoround = AutoRound(model, tokenizer, nsamples=512, iters=1000, low_gpu_mem_usage=True, bits=bits, group_size=group_size, sym=sym)
 
-## fast and low memory, 2-3X speedup, slight accuracy drop at W4G128
-# autoround = AutoRound(model, tokenizer, nsamples=128, iters=200, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym )
+## 2-3X speedup, slight accuracy drop at W4G128
+# autoround = AutoRound(model, tokenizer, nsamples=128, iters=50, lr=5e-3, seqlen=512, batch_size=4, bits=bits, group_size=group_size, sym=sym )
 
 autoround.quantize()
 output_dir = "./tmp_autoround"
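For orientation, the tuned recipe above drops straight into the standard AutoRound flow. A minimal sketch, assuming a causal LM and tokenizer are already loaded; bits and group_size follow the W4G128 comment, while the sym value is an illustrative placeholder:

# Minimal sketch of the updated fast recipe; `model` and `tokenizer` are
# assumed to be loaded already (not shown here).
from auto_round import AutoRound

bits, group_size, sym = 4, 128, False  # sym=False is an illustrative choice
autoround = AutoRound(
    model, tokenizer, nsamples=128, iters=50, lr=5e-3, seqlen=512,
    batch_size=4, bits=bits, group_size=group_size, sym=sym,
)
autoround.quantize()
output_dir = "./tmp_autoround"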

auto_round/export/export_to_gguf/config.py

Lines changed: 18 additions & 18 deletions

@@ -18,24 +18,24 @@
 
 GGUF_CONFIG["gguf:q4_1"] = {"bits": 4, "act_bits": 16, "group_size": 32, "asym": True, "data_type": "int"}
 
-GGUF_CONFIG["gguf:q4_k_s"] = {
-    "bits": 4,
-    "act_bits": 16,
-    "super_group_size": 8,
-    "super_bits": 6,
-    "group_size": 32,
-    "asym": True,
-    "data_type": "int_asym_dq"
-}
+# GGUF_CONFIG["gguf:q4_k_s"] = {
+#     "bits": 4,
+#     "act_bits": 16,
+#     "super_group_size": 8,
+#     "super_bits": 6,
+#     "group_size": 32,
+#     "asym": True,
+#     "data_type": "int_asym_dq"
+# }
 
-GGUF_CONFIG["gguf:q2_k_s"] = {
-    "bits": 2,
-    "act_bits": 16,
-    "super_group_size": 16,
-    "super_bits": 4,
-    "group_size": 16,
-    "asym": True,
-    "data_type": "int_asym_dq"
-}
+# GGUF_CONFIG["gguf:q2_k_s"] = {
+#     "bits": 2,
+#     "act_bits": 16,
+#     "super_group_size": 16,
+#     "super_bits": 4,
+#     "group_size": 16,
+#     "asym": True,
+#     "data_type": "int_asym_dq"
+# }
 
 GGUF_CONFIG["gguf:q8_0"] = {"bits": 8, "act_bits": 16, "group_size": 32, "asym": False, "data_type": "int"}

auto_round/export/export_to_gguf/export.py

Lines changed: 3 additions & 2 deletions

@@ -31,8 +31,8 @@
     "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
     "q4_0": gguf.LlamaFileType.MOSTLY_Q4_0,
     "q4_1": gguf.LlamaFileType.MOSTLY_Q4_1,
-    "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S,
-    "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S,
+    # "q4_k_s": gguf.LlamaFileType.MOSTLY_Q4_K_S,
+    # "q2_k_s": gguf.LlamaFileType.MOSTLY_Q2_K_S,
     "q8_0": gguf.LlamaFileType.MOSTLY_Q8_0,
     "auto": gguf.LlamaFileType.GUESSED,
 }
@@ -66,6 +66,7 @@ def save_quantized_as_gguf(output_dir, backend="gguf:q4_0", **kwargs):
     model_name = model_name[-1]
 
     output_type = backend.split(":")[-1]
+    assert output_type.lower() in FTYPE_MAP, f"{output_type} is not supported"
     output_type = FTYPE_MAP.get(output_type.lower())
 
 
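The added assert turns an unsupported backend into an immediate, readable failure instead of FTYPE_MAP.get() silently returning None and breaking further downstream. A conceptual replay of the check; placeholder values stand in for the gguf file types, and in the real function this runs after the model kwargs are unpacked:

# Conceptual replay of the new guard (FTYPE_MAP keys taken from the diff).
FTYPE_MAP = {"q8_0": ..., "q4_0": ..., "q4_1": ..., "auto": ...}  # k-quants removed

backend = "gguf:q4_k_s"
output_type = backend.split(":")[-1]
assert output_type.lower() in FTYPE_MAP, f"{output_type} is not supported"
# -> AssertionError: q4_k_s is not supported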

test/test_autoround_acc.py

Lines changed: 1 addition & 1 deletion

@@ -67,7 +67,7 @@ def test_default_acc(self):
         out1 = model_tmp(inp)
 
         assert out0[0].equal(out1[0])
-        self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=1e-04))
+        self.assertTrue(isclose(float(out0[0][0][0][0]), -0.021002087742090225, rel_tol=5e-04))
 
 
 if __name__ == "__main__":
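The relaxed rel_tol widens the accepted band around the reference logit from roughly 0.01% to 0.05% of its magnitude. A quick arithmetic check of what math.isclose now admits:

# With rel_tol=5e-4, a value within ~0.05% of the reference still passes,
# where the old 1e-4 required ~0.01%.
from math import isclose

ref = -0.021002087742090225
assert isclose(ref * (1 + 4e-4), ref, rel_tol=5e-4)      # within the new band
assert not isclose(ref * (1 + 6e-4), ref, rel_tol=5e-4)  # still rejected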

test/test_autoround_export_to_itrex.py

Lines changed: 2 additions & 1 deletion

@@ -75,10 +75,11 @@ def test_autoround_int_quant(self):
         optq_2 = round(model, self.tokenizer, device="cpu", nsamples=20, seqlen=10)
         q_model, layer_config2 = optq_2.quantize()
         compressed_model = pack_model(model=q_model, layer_config=layer_config2, inplace=False)
+        compressed_model = compressed_model.to(torch.float32)
         out4 = q_model(self.lm_input)
         out5 = compressed_model(self.lm_input)
         self.assertTrue(torch.all(out1[0] == out6[0]))
-        self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=1e-3)))
+        self.assertTrue(torch.all(torch.isclose(out4[0], out5[0], atol=5e-3)))
 
     def test_config(self):
         from auto_round.export.export_to_itrex import QuantConfig
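The added cast keeps both forward passes in the same dtype before the element-wise comparison; otherwise a half-precision packed model can miss the tolerance purely from representation rounding. A self-contained illustration of the effect being guarded against (the tensors are stand-ins, not the test's real outputs):

# An fp16 round-trip alone can shift values by ~1e-3, so both models are
# compared in float32 with the loosened atol=5e-3.
import torch

out_ref = torch.randn(2, 4)                            # stand-in for q_model output
out_cmp = out_ref.to(torch.float16).to(torch.float32)  # simulated fp16 packing loss
assert torch.all(torch.isclose(out_ref, out_cmp, atol=5e-3))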

test/test_basic_usage.py

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ def test_auto_round_cmd(self):
         assert False, "cmd line test fail, please have a check"
 
         res = os.system(
-            f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --eval_task_by_task --tasks piqa,openbookqa --bs 32"
+            f"cd .. && {python_path} -m auto_round --model 'facebook/opt-125m' --iter 1 --nsamples 1 --eval_task_by_task --tasks piqa,openbookqa --bs 32"
         )
         if res > 0 or res == -1:
             assert False, "cmd line test fail, please have a check"
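Capping the run with --iter 1 --nsamples 1 turns this command into a cheap smoke test of the CLI path rather than a real quantization run. A standalone sketch of the same invocation, with the repository's working-directory handling stripped:

# Standalone replay of the updated smoke test; the tiny iter/nsamples budget
# keeps the tuning loop trivial so only the CLI wiring is exercised.
import os
import sys

python_path = sys.executable
res = os.system(
    f"{python_path} -m auto_round --model 'facebook/opt-125m' "
    "--iter 1 --nsamples 1 --eval_task_by_task --tasks piqa,openbookqa --bs 32"
)
assert res == 0, "cmd line test fail, please have a check"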
