
Commit 5e29275

cleanup checks for GIL control, GIL=0, and python >= 3.13.3t (#1743)

Authored by Qubitium

* add proper checks for GIL control, GIL=0, and python 3.13.3t
* ruff
* ruff fix
* ruff fix
* rename
* recommend users to upgrade to 3.13.3t

Signed-off-by: Qubitium <[email protected]>

1 parent ffec2bc · commit 5e29275

File tree: 7 files changed, +55 −32 lines

gptqmodel/models/auto.py
Lines changed: 1 addition & 1 deletion

@@ -90,6 +90,7 @@
 from .definitions.gpt_bigcode import GPTBigCodeGPTQ  # noqa: E402
 from .definitions.gpt_neo import GPTNeoGPTQ  # noqa: E402
 from .definitions.gpt_neox import GPTNeoXGPTQ  # noqa: E402
+from .definitions.gpt_oss import GPTOSSGPTQ  # noqa: E402
 from .definitions.gptj import GPTJGPTQ  # noqa: E402
 from .definitions.granite import GraniteGPTQ  # noqa: E402
 from .definitions.grinmoe import GrinMOEGPTQ  # noqa: E402
@@ -130,7 +131,6 @@
 from .definitions.telechat2 import TeleChat2GPTQ
 from .definitions.xverse import XverseGPTQ  # noqa: E402
 from .definitions.yi import YiGPTQ  # noqa: E402
-from .definitions.gpt_oss import GPTOSSGPTQ  # noqa: E402
 
 # make quants and inference more determinisitc
 torch.manual_seed(787)

gptqmodel/models/definitions/gpt_oss.py
Lines changed: 16 additions & 13 deletions

@@ -13,12 +13,14 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .._const import EXPERT_INDEX_PLACEHOLDER
-from ..base import BaseGPTQModel
 import torch
 import torch.nn.functional as F
 from torch import nn
 
+from .._const import EXPERT_INDEX_PLACEHOLDER
+from ..base import BaseGPTQModel
+
+
 class GptOssExpertsNew(nn.Module):
     def __init__(self, config, ori_experts=None):
         super().__init__()
@@ -29,7 +31,7 @@ def __init__(self, config, ori_experts=None):
         self.alpha = 1.702
         self.limit = 7.0
         self.quantizing = False
-
+
         self.gate_up = nn.ModuleList([
             nn.Linear(self.hidden_size, 2 * self.expert_dim, dtype=config.dtype)
             for _ in range(self.num_experts)
@@ -90,27 +92,27 @@ def forward(self, hidden_states: torch.Tensor, router_indices=None, routing_weig
             expert_mask = (router_indices == expert_idx).any(dim=-1)  # (num_tokens,)
             if not expert_mask.any():
                 continue
-
+
             expert_tokens = hidden_states[expert_mask]  # (selected_tokens, hidden_size)
-
+
             gate_up_output = self.gate_up[expert_idx](expert_tokens)  # (selected_tokens, 2*expert_dim)
             gate, up = gate_up_output[..., ::2], gate_up_output[..., 1::2]
-
+
             gate = gate.clamp(min=None, max=self.limit)
             up = up.clamp(min=-self.limit, max=self.limit)
             glu = gate * torch.sigmoid(gate * self.alpha)
-
+
             expert_output = self.down[expert_idx]((up + 1) * glu)  # (selected_tokens, hidden_size)
-
+
             expert_weights = routing_weights[expert_mask, expert_idx].unsqueeze(-1)  # (selected_tokens, 1)
-
+
             final_output[expert_mask] += expert_output * expert_weights
-
+
         if seq_len > 1:
             final_output = final_output.view(batch_size, seq_len, self.hidden_size)
         else:
             final_output = final_output.view(batch_size, self.hidden_size)
-
+
         return final_output
 
 class GptOssTopKRouterNew(nn.Module):
@@ -164,10 +166,11 @@ def after_model_load(self, model, load_quantized_model=False):
         return model
 
 import os
-from transformers.integrations.hub_kernels import use_kernel_forward_from_hub
 from concurrent.futures import ThreadPoolExecutor
 from functools import partial
+
 import transformers.models.gpt_oss.modeling_gpt_oss as gpt_oss_modeling
+from transformers.integrations.hub_kernels import use_kernel_forward_from_hub
 
 @use_kernel_forward_from_hub("MegaBlocksMoeMLP")
 class GptOssMLPNew(nn.Module):
@@ -189,7 +192,7 @@ def process_module(name, module, model, config):
        parent, child = name.rsplit(".", maxsplit=1)
        parent = model.get_submodule(parent)
        setattr(parent, child, new_module)
-
+
    with ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
        process_fn = partial(process_module, model=model, config=model.config)
        list(executor.map(lambda x: process_fn(x[0], x[1]), model.named_modules()))

gptqmodel/nn_modules/qlinear/tritonv2.py
Lines changed: 2 additions & 2 deletions

@@ -23,12 +23,12 @@
 from ...models._const import DEVICE, PLATFORM
 from ...utils.backend import BACKEND
 from ...utils.logger import setup_logger
-from ...utils.python import has_gil
+from ...utils.python import has_gil_disabled
 from .torch import TorchQuantLinear
 
 try:
     # TODO: triton is not compatible with free threading
-    if not has_gil():
+    if not has_gil_disabled():
         raise Exception("GIL is disabled so Triton is not (yet) compatible.")
 
     import triton

gptqmodel/utils/__init__.py
Lines changed: 15 additions & 3 deletions

@@ -16,12 +16,24 @@
 
 from .backend import BACKEND
 from .logger import setup_logger
-from .python import has_gil, log_gil_required
+from .python import gte_python_3_13_3, has_gil_control, has_gil_disabled, log_gil_requirements_for
+
+log = setup_logger()
 
 # TODO: datasets is not compatible with free threading
-if has_gil():
+if has_gil_disabled():
+    log.info("Python GIL is disabled and GPTQModel will auto enable multi-gpu quant acceleration for MoE models plus multi-cpu accelerated packing.")
     from .perplexity import Perplexity
 else:
-    log_gil_required("utils/Perplexity")
+    if has_gil_control():
+        log.warn(
+            "Python >= 3.13T (free-threading) version detected but GIL is not disabled due to manual override or `regex` package compatibility which can be ignored. Please disable GIL via env `PYTHON_GIL=0`.")
+
+    log.warn(
+        "Python GIL is enabled: Multi-gpu quant acceleration for MoE models is sub-optimal and multi-core accelerated cpu packing is also disabled. We recommend Python >= 3.13.3t with Pytorch > 2.8 for mult-gpu quantization and multi-cpu packing with env `PYTHON_GIL=0`.")
+
+    log_gil_requirements_for("utils/Perplexity")
+
+
 
 from .vram import get_vram
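
For reference, a minimal sketch (not part of the commit) of inspecting the interpreter state that this import-time branch keys on; it assumes a CPython 3.13 free-threading (3.13t) build launched with `PYTHON_GIL=0`:

    import sys

    # sys._is_gil_enabled() exists only on free-threading-capable (3.13t) builds.
    if hasattr(sys, "_is_gil_enabled"):
        print("GIL enabled:", sys._is_gil_enabled())  # False when started with PYTHON_GIL=0
    else:
        print("Regular build: the GIL is always on")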

gptqmodel/utils/model.py
Lines changed: 2 additions & 2 deletions

@@ -55,7 +55,7 @@
 from ..nn_modules.qlinear.ipex import HAS_IPEX, IPEXQuantLinear
 from ..quantization import FORMAT, QuantizeConfig
 from ..quantization.config import FORMAT_FIELD_JSON, QUANT_METHOD, dynamic_get
-from . import has_gil
+from . import has_gil_disabled
 from .backend import BACKEND
 from .importer import select_quant_linear
 from .logger import setup_logger
@@ -642,7 +642,7 @@ def pack_model(
     names = list(qModules.keys())
     lock = threading.Lock()
 
-    if not has_gil():
+    if has_gil_disabled():
         from device_smi import Device
         cpu = Device("cpu")
         max_packers = cpu.count * cpu.cores

gptqmodel/utils/python.py
Lines changed: 15 additions & 6 deletions

@@ -1,16 +1,25 @@
+import platform
 import sys
 
 from gptqmodel.utils.logger import setup_logger
+from packaging.version import Version
 
 log = setup_logger()
 
+# Check if GIL (global interpreter lock) is controllable in this Python build.
+# Starting from python 3.13 it is possible to disable GIL
+def has_gil_control():
+    return hasattr(sys, '_is_gil_enabled')
+
 # Check if GIL (global interpreter lock) is enabled.
 # Starting from python 3.13 it is possible to disable GIL
-def has_gil():
-    if hasattr(sys, '_is_gil_enabled'):
-        return sys._is_gil_enabled()
+def has_gil_disabled():
+    return has_gil_control() and not sys._is_gil_enabled()
 
-    return True
+# Check For Python > 3.13.3
+def gte_python_3_13_3():
+    return Version(platform.python_version()) >= Version("3.13.3")
 
-def log_gil_required(feature: str):
-    log.warn.once(f"Feature `{feature}` requires python GIL. Feature is currently skipped/disabled.")
+# torch compile requires GIL=1 or python 3.13.3t with GIL=0
+def log_gil_requirements_for(feature: str):
+    log.warn.once(f"Feature `{feature}` requires python GIL or Python > 3.13.3T (T for Threading-Free edition of Python) plus Torch 2.8. Feature is currently skipped/disabled.")
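
A short usage sketch of the helpers above (illustrative only; assumes gptqmodel is installed so the module imports as shown):

    from gptqmodel.utils.python import (
        gte_python_3_13_3,
        has_gil_control,
        has_gil_disabled,
    )

    # On CPython 3.13.3t started with PYTHON_GIL=0 all three checks are expected to be True;
    # on a standard GIL build, has_gil_control() and has_gil_disabled() both return False.
    print(has_gil_control(), has_gil_disabled(), gte_python_3_13_3())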

gptqmodel/utils/torch.py
Lines changed: 4 additions & 5 deletions

@@ -23,7 +23,7 @@
 from torch.cpu import StreamContext
 
 from ..utils.logger import setup_logger
-from . import has_gil, log_gil_required
+from . import gte_python_3_13_3, has_gil_disabled, log_gil_requirements_for
 
 # pytorch 2.6.0 fixes many compilation errors
 TORCH_HAS_COMPILE = version.parse(torch.__version__).release >= version.Version('2.6').release
@@ -70,10 +70,9 @@ class BalanceStrategy(str, Enum):
     pass
 
 def torch_compile(module: Union[torch.nn.Module, Callable], backend:str ="inductor", mode: str = None, fullgraph=False):
-    # requires torch >2.8 for proper torch.compile
-    # torch compile broken for free threading
-    if not has_gil():
-        log_gil_required("Torch Compile")
+    # requires torch >2.8 for proper torch.compile + Python 3.13.3t (freethreading)
+    if has_gil_disabled() and not gte_python_3_13_3():
+        log_gil_requirements_for("Torch Compile")
         return module
 
 #from ..models.base import PYTORCH_MIN_VERSION_WITH_COMPILE
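
A hedged usage sketch of the gate above: on a free-threaded interpreter older than 3.13.3, `torch_compile` now returns the module uncompiled instead of attempting compilation (the function name and module path come from this diff; the compile path itself falls outside the shown hunk):

    import torch
    from gptqmodel.utils.torch import torch_compile

    layer = torch.nn.Linear(16, 16)
    # Returned unchanged when has_gil_disabled() and not gte_python_3_13_3();
    # otherwise the function proceeds toward torch.compile (not shown in this hunk).
    compiled = torch_compile(layer, backend="inductor")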
