10 | 10 | from collections.abc import Sequence
11 | 11 | from concurrent.futures import ThreadPoolExecutor
12 | 12 | from dataclasses import asdict
| 13 | +from enum import Enum |
13 | 14 | from functools import partial
14 | 15 | from multiprocessing.reduction import ForkingPickler
15 | 16 | from queue import Queue
16 | | -from typing import Any, Dict, List, Optional |
| 17 | +from typing import Any, Dict, List, Optional, Union |
17 | 18 |
18 | 19 | import numpy as np
19 | 20 | import torch
@@ -105,6 +106,39 @@ def update_parallel_config(cfg: TurbomindEngineConfig):
105 | 106 | cfg.devices = cfg.devices or list(range(cfg.device_num))
106 | 107 |
107 | 108 |
| 109 | +# Borrowed from xgrammar's TokenizerInfo.VocabType |
| 110 | +class VocabType(Enum): |
| 111 | + """The type of the vocabulary. |
| 112 | + |
| 113 | + Used in TokenizerInfo. XGrammar supports three types of |
| 114 | + vocabularies: RAW, BYTE_FALLBACK, BYTE_LEVEL. |
| 115 | + """ |
| 116 | + |
| 117 | + RAW = 0 |
| 118 | + """The vocabulary is in the raw format. |
| 119 | + |
| 120 | + The tokens in the vocabulary are kept in their original form without any processing. This kind of tokenizer includes |
| 121 | + the tiktoken tokenizer, e.g. microsoft/Phi-3-small-8k-instruct, Qwen/Qwen-7B-Chat, etc. |
| 122 | + """ |
| 123 | + |
| 124 | + BYTE_FALLBACK = 1 |
| 125 | + """The vocabulary used in the byte fallback BPE tokenizer. |
| 126 | + |
| 127 | + The tokens are encoded through the byte-fallback conversion. E.g. "\u001b" -> "<0x1B>", " apple" -> "▁apple". This |
| 128 | + kind of tokenizer includes meta-llama/Llama-2-7b-chat, microsoft/Phi-3.5-mini-instruct, etc. |
| 129 | + """ |
| 130 | + |
| 131 | + BYTE_LEVEL = 2 |
| 132 | + """The vocabulary used in the byte level BPE tokenizer. |
| 133 | + |
| 134 | + The tokens are encoded through the byte-to-unicode conversion, as in |
| 135 | + https://github.com/huggingface/transformers/blob/87be06ca77166e6a6215eee5a990ab9f07238a18/src/transformers/models/gpt2/tokenization_gpt2.py#L38-L59 |
| 136 | + |
| 137 | + This kind of tokenizer includes meta-llama/Meta-Llama-3-8B-Instruct, |
| 138 | + meta-llama/Meta-Llama-3.1-8B-Instruct, etc. |
| 139 | + """ |
| 140 | + |
| 141 | + |
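
For readers unfamiliar with the distinction the enum draws, here is an illustrative check (not part of the patch) of how these formats surface in a tokenizer's vocabulary. It assumes `transformers` is installed and the example checkpoints are accessible; the expected outputs in the comments are indicative only.

```python
from transformers import AutoTokenizer

# Byte-fallback vocab (e.g. Llama-2 style): raw bytes appear as "<0xNN>" entries
# and in-token spaces appear as the "▁" metasymbol.
bf_vocab = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-chat-hf').get_vocab()
print('<0x0A>' in bf_vocab, '▁apple' in bf_vocab)   # expected: True True

# Byte-level vocab (e.g. GPT-2 / Llama-3 style): bytes are remapped to printable
# unicode, so a leading space shows up as "Ġ" and there are no "<0xNN>" tokens.
bl_vocab = AutoTokenizer.from_pretrained('gpt2').get_vocab()
print('<0x0A>' in bl_vocab, 'Ġapple' in bl_vocab)   # expected: False True

# A RAW vocab (e.g. a tiktoken-based tokenizer such as Qwen/Qwen-7B-Chat) keeps
# tokens in their original form without either convention.
```

The same `'<0x0A>' in vocab` membership test is what the sentencepiece branch of the new helper uses below to tell BYTE_FALLBACK from RAW.
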
108 | 142 | class TurboMind:
109 | 143 | """LMDeploy's inference engine.
110 | 144 |
@@ -163,18 +197,177 @@ def __init__(self,
163 | 197 | self.session_len = self.config.session_len
164 | 198 |
165 | 199 | if decode_grammar is not None:
166 | | - tokenizer_info = _xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=decode_grammar_vocab_size) |
| 200 | + tokenizer_info = self._get_xgrammar_tokenizer_info(tokenizer, vocab_size=decode_grammar_vocab_size) |
167 | 201 | compiler = _xgr.GrammarCompiler(tokenizer_info, max_threads=decode_grammar_threads)
168 | 202 |
169 | 203 | if decode_grammar_type == 'json_schema':
170 | 204 | grammar = compiler.compile_json_schema(decode_grammar, **decode_grammar_extra)
171 | 205 | elif decode_grammar_type == 'regex':
172 | | - grammar = compiler.from_regex(decode_grammar) |
| 206 | + grammar = compiler.compile_regex(decode_grammar) |
173 | 207 | else:
174 | 208 | assert False, f'Decode grammar type {decode_grammar_type} should be in ["json_schema", "regex"]'
175 | 209 |
176 | 210 | self.grammar = grammar
177 | 211 |
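
For reference, the `compile_regex` correction above matches xgrammar's public `GrammarCompiler` API. A minimal standalone sketch of the same two call paths follows; the model name, vocab size, schema, and regex are placeholders, and it assumes `xgrammar` and `transformers` are installed.

```python
import xgrammar as xgr
from transformers import AutoTokenizer

# Placeholder model; any HF model whose lm_head size matches vocab_size works.
tok = AutoTokenizer.from_pretrained('meta-llama/Meta-Llama-3-8B-Instruct')
tok_info = xgr.TokenizerInfo.from_huggingface(tok, vocab_size=128256)
compiler = xgr.GrammarCompiler(tok_info, max_threads=4)

# Mirrors the 'json_schema' branch above.
schema = '{"type": "object", "properties": {"name": {"type": "string"}}, "required": ["name"]}'
json_grammar = compiler.compile_json_schema(schema)

# Mirrors the corrected 'regex' branch above.
date_grammar = compiler.compile_regex(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')
```
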
| 212 | + # Borrowed from xgrammar's TokenizerInfo.from_huggingface |
| 213 | + def _get_xgrammar_tokenizer_info( |
| 214 | + self, |
| 215 | + tokenizer: 'PreTrainedTokenizerBase', # noqa: F821 |
| 216 | + *, |
| 217 | + vocab_size: Optional[int] = None, |
| 218 | + stop_token_ids: Optional[Union[List[int], int]] = None, |
| 219 | + ) -> 'TokenizerInfo': # noqa: F821 |
| 220 | + """Construct the tokenizer info from the huggingface tokenizer. This |
| 221 | + constructor supports various tokenizer backends, including the |
| 222 | + huggingface fast tokenizer and tiktoken tokenizer. Necessary |
| 223 | + information is automatically detected from the tokenizer. |
| 224 | +
|
| 225 | + The vocab_size parameter is introduced to handle the misalignment between the model's |
| 226 | + vocab_size and the tokenizer's vocabulary size. User should pass the model's vocab_size |
| 227 | + (could be defined in the model config) here. See docs of vocab_size for more details. |
| 228 | +
|
| 229 | + The stop token ids is by default the eos_token_id of the tokenizer. If there are other |
| 230 | + stop tokens, you can specify them manually. |
| 231 | +
|
| 232 | + Parameters |
| 233 | + ---------- |
| 234 | + tokenizer : PreTrainedTokenizerBase |
| 235 | + The huggingface tokenizer. |
| 236 | +
|
| 237 | + vocab_size : Optional[int], default: None |
| 238 | + The vocabulary size **defined by the model** (**not the tokenizer**). This equals to the |
| 239 | + vocab dimension of the model's lm_head. This is the size of the token mask. |
| 240 | +
|
| 241 | + It can be: |
| 242 | +
|
| 243 | + 1. the same as the tokenizer's vocabulary size. This is the most common case. |
| 244 | + 2. larger than the tokenizer's vocabulary size. This happens when the model has padding |
| 245 | + to lm_head, possibly due to aligning lm_head to the power of 2. |
| 246 | + E.g. Phi-3 and Deepseek-V2. |
| 247 | + 3. smaller than the tokenizer's vocabulary size. This happens when the tokenizer has |
| 248 | + some added tokens that will not supported by the model. E.g. |
| 249 | + Llama-3.2 Vision and Molmo-72B-0924 has padded `<|image|>` tokens, but they will not |
| 250 | + be considered in lm_head or generated by the model. |
| 251 | +
|
| 252 | + model_vocab_size need to be provided for case 2 and 3. If not provided, it will be |
| 253 | + set to the tokenizer's vocabulary size. |
| 254 | +
|
| 255 | + stop_token_ids : Optional[List[int]], default: None |
| 256 | + The stop token ids. If not provided, the eos_token_id of the tokenizer will be used. |
| 257 | +
|
| 258 | + Returns |
| 259 | + ------- |
| 260 | + tokenizer_info : TokenizerInfo |
| 261 | + The tokenizer info. |
| 262 | + """ |
| 263 | + from transformers import PreTrainedTokenizerFast |
| 264 | + |
| 265 | + if isinstance(stop_token_ids, int): |
| 266 | + stop_token_ids = [stop_token_ids] |
| 267 | + if isinstance(stop_token_ids, list) and len(stop_token_ids) == 0: |
| 268 | + raise ValueError('stop_token_ids cannot be empty') |
| 269 | + |
| 270 | + try: |
| 271 | + vocab_dict = tokenizer.get_vocab() |
| 272 | + except AttributeError as e: |
| 273 | + msg = (f'Cannot get the vocabulary of the tokenizer {type(tokenizer)}. The tokenizer ' |
| 274 | + 'should have a get_vocab method.') |
| 275 | + raise ValueError(msg) from e |
| 276 | + |
| 277 | + # Some tokenizers don't have token ids 0, 1, or 2, so max_id can be larger than the |
| 278 | + # number of tokens. |
| 279 | + max_id = max(vocab_dict.values()) |
| 280 | + tokenizer_vocab_size = max(len(vocab_dict), max_id + 1) |
| 281 | + |
| 282 | + vocab_size = vocab_size or tokenizer_vocab_size |
| 283 | + |
| 284 | + # maintain tokenizer's indexing |
| 285 | + encoded_vocab = [''] * vocab_size |
| 286 | + for token, idx in vocab_dict.items(): |
| 287 | + if idx < vocab_size: |
| 288 | + encoded_vocab[idx] = token |
| 289 | + |
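
The misalignment cases described in the docstring can be observed directly. Below is a hypothetical check (not part of the patch, assumes `transformers` is installed) that mirrors the `max_id`/`vocab_size` computation above for Phi-3, one of the padded-lm_head examples the docstring cites.

```python
from transformers import AutoConfig, AutoTokenizer

model_id = 'microsoft/Phi-3-mini-4k-instruct'  # example only
config = AutoConfig.from_pretrained(model_id)
vocab = AutoTokenizer.from_pretrained(model_id).get_vocab()

# Same computation as above: some ids may be unused, so take the max of both.
tokenizer_vocab_size = max(len(vocab), max(vocab.values()) + 1)

# For a padded lm_head (case 2), the model's vocab_size is the larger of the two,
# and it is the value that should be passed in as `vocab_size`.
print(config.vocab_size, tokenizer_vocab_size)
```
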
| 290 | + if isinstance(tokenizer, PreTrainedTokenizerFast): |
| 291 | + # huggingface fast tokenizer |
| 292 | + # - the vocabulary is directly obtained from tokenizer.get_vocab() |
| 293 | + # (tokenizer.backend_tokenizer.to_str() may not contain the full vocab, special |
| 294 | + # tokens may be omitted) |
| 295 | + # - the vocab size is obtained from len(tokenizer.get_vocab()) or provided by user |
| 296 | + # - the vocab type and add_prefix_space are obtained from |
| 297 | + # tokenizer.backend_tokenizer.to_str() |
| 298 | + # - stop token id is provided by user, or auto detected. |
| 299 | + backend_str = tokenizer.backend_tokenizer.to_str() |
| 300 | + if stop_token_ids is None: |
| 301 | + if hasattr(tokenizer, 'eos_token_id') and tokenizer.eos_token_id is not None: |
| 302 | + stop_token_ids = [tokenizer.eos_token_id] |
| 303 | + else: |
| 304 | + logger.warning('When constructing TokenizerInfo from a huggingface tokenizer, ' |
| 305 | + 'stop_token_ids is neither provided by user nor found from the tokenizer. ' |
| 306 | + 'It will be automatically detected.') |
| 307 | + metadata = json.loads(_xgr.TokenizerInfo._detect_metadata_from_hf(backend_str)) |
| 308 | + return _xgr.TokenizerInfo( |
| 309 | + encoded_vocab, |
| 310 | + vocab_type=metadata['vocab_type'], |
| 311 | + vocab_size=vocab_size, |
| 312 | + stop_token_ids=stop_token_ids, |
| 313 | + add_prefix_space=metadata['add_prefix_space'], |
| 314 | + ) |
| 315 | + |
| 316 | + elif _xgr.TokenizerInfo._is_tiktoken_tokenizer(tokenizer): |
| 317 | + # tiktoken tokenizer |
| 318 | + # e.g. Phi-3-small-8k-instruct, Qwen-7B-Chat, stablelm-2-12b-chat (previously) |
| 319 | + if stop_token_ids is None: |
| 320 | + if hasattr(tokenizer, 'eos_token_id') and tokenizer.eos_token_id is not None: |
| 321 | + stop_token_ids = [tokenizer.eos_token_id] |
| 322 | + else: |
| 323 | + logger.warning('When constructing TokenizerInfo from a huggingface tokenizer, ' |
| 324 | + 'stop_token_ids is neither provided by user nor found from the tokenizer. ' |
| 325 | + 'It will be automatically detected.') |
| 326 | + return _xgr.TokenizerInfo( |
| 327 | + encoded_vocab, |
| 328 | + VocabType.RAW, |
| 329 | + vocab_size=vocab_size, |
| 330 | + stop_token_ids=stop_token_ids, |
| 331 | + add_prefix_space=False, |
| 332 | + ) |
| 333 | + |
| 334 | + elif _xgr.TokenizerInfo._is_sentencepiece_tokenizer(tokenizer): |
| 335 | + # sentencepiece tokenizer |
| 336 | + # e.g. Chatglm3-6b |
| 337 | + if hasattr(tokenizer, 'sp_model'): |
| 338 | + sp_model = tokenizer.sp_model |
| 339 | + elif hasattr(tokenizer, 'tokenizer') and hasattr(tokenizer.tokenizer, 'sp_model'): |
| 340 | + sp_model = tokenizer.tokenizer.sp_model |
| 341 | + |
| 342 | + if stop_token_ids is None: |
| 343 | + if hasattr(tokenizer, 'eos_token_id') and tokenizer.eos_token_id is not None: |
| 344 | + stop_token_ids = [tokenizer.eos_token_id] |
| 345 | + else: |
| 346 | + eos_id = sp_model.eos_id() |
| 347 | + if eos_id != -1: |
| 348 | + stop_token_ids = [eos_id] |
| 349 | + else: |
| 350 | + logger.warning('When constructing TokenizerInfo from a huggingface tokenizer, ' |
| 351 | + 'stop_token_ids is neither provided by user nor found from the tokenizer. ' |
| 352 | + 'It will be automatically detected.') |
| 353 | + # detect vocab_type of tokenizer |
| 354 | + if '<0x0A>' in vocab_dict: |
| 355 | + vocab_type = VocabType.BYTE_FALLBACK |
| 356 | + else: |
| 357 | + vocab_type = VocabType.RAW |
| 358 | + |
| 359 | + return _xgr.TokenizerInfo( |
| 360 | + encoded_vocab, |
| 361 | + vocab_type=vocab_type, |
| 362 | + vocab_size=vocab_size, |
| 363 | + stop_token_ids=stop_token_ids, |
| 364 | + add_prefix_space=True, |
| 365 | + ) |
| 366 | + |
| 367 | + else: |
| 368 | + # TODO(yixin): unsupported tokenizer |
| 369 | + raise ValueError(f'Unsupported tokenizer type: {type(tokenizer)}') |
| 370 | + |
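
As a companion to the fast-tokenizer branch above, the serialized backend state it inspects can be examined directly. A small sketch (illustrative only; assumes `transformers` with any fast tokenizer available) of what `tokenizer.backend_tokenizer.to_str()` exposes for vocab-type detection:

```python
import json

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('gpt2')  # any fast tokenizer serves as an example
backend = json.loads(tok.backend_tokenizer.to_str())

# A byte-level BPE tokenizer advertises a ByteLevel pre-tokenizer in its serialized
# state, which is the kind of signal metadata detection keys off; a byte-fallback
# tokenizer instead carries a ByteFallback step in its decoder.
print(backend['model']['type'])          # e.g. 'BPE'
print(backend['pre_tokenizer']['type'])  # e.g. 'ByteLevel'
```
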
178 | 371 | def _check_unloaded_tm_params(self):
179 | 372 | tm_params = self._tm_model.tm_params
180 | 373 | if len(tm_params) > 0: