Commit f93c1b1

feat: enable grammar init in turbomind

1 parent 5b846a6

6 files changed: +67 −33 lines

lmdeploy/turbomind/turbomind.py

Lines changed: 24 additions & 2 deletions

@@ -32,6 +32,7 @@
 lmdeploy_dir = osp.split(lmdeploy.__file__)[0]
 sys.path.append(osp.join(lmdeploy_dir, 'lib'))
 import _turbomind as _tm  # noqa: E402
+import _xgrammar as _xgr  # noqa: E402

 logger = get_logger('lmdeploy')

@@ -125,6 +126,11 @@ def __init__(self,
                  model_name: str = None,
                  chat_template_name: str = None,
                  engine_config: TurbomindEngineConfig = None,
+                 decode_grammar: Optional[str] = None,
+                 decode_grammar_type: str = 'json_schema',
+                 decode_grammar_threads: int = 4,
+                 decode_grammar_vocab_size: Optional[int] = None,
+                 decode_grammar_extra: Dict[str, Any] = {},
                  **kwargs):
         self.model_name = model_name
         self.chat_template_name = chat_template_name
@@ -156,12 +162,25 @@ def __init__(self,

         self.session_len = self.config.session_len

+        if decode_grammar is not None:
+            tokenizer_info = _xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=decode_grammar_vocab_size)
+            compiler = _xgr.GrammarCompiler(tokenizer_info, max_threads=decode_grammar_threads)
+
+            if decode_grammar_type == 'json_schema':
+                grammar = compiler.compile_json_schema(decode_grammar, **decode_grammar_extra)
+            elif decode_grammar_type == 'regex':
+                grammar = compiler.compile_regex(decode_grammar)
+            else:
+                assert False, f'Decode grammar type {decode_grammar_type} should be in ["json_schema", "regex"]'
+
+            self.grammar = grammar
+
     def _check_unloaded_tm_params(self):
         tm_params = self._tm_model.tm_params
         if len(tm_params) > 0:
             uninitialized = list(tm_params.keys())
             logger.warning('the model may not be loaded successfully '
-                           f'with {len(tm_params)} uninitialized params:\n{uninitialized}')
+                           f'with {len(tm_params)} uninitialized params:\n{uninitialized}')  # noqa: E231

@@ -252,7 +271,7 @@ def _postprocess_config(self, tm_config: TurbomindModelConfig, engine_config: TurbomindEngineConfig)
         # pack `self.config` and `self.engine_config` into a dict
         self.config_dict = self.config.to_dict()
         self.config_dict.update(dict(engine_config=asdict(self.engine_config)))
-        logger.info(f'turbomind model config:\n\n'
+        logger.info(f'turbomind model config:\n\n'  # noqa: E231
                     f'{json.dumps(self.config_dict, indent=2)}')

     def _from_hf(self, model_path: str, engine_config: TurbomindEngineConfig):
@@ -550,6 +569,9 @@ def model_inst(self)

     def _create_model_instance(self, device_id):
         model_inst = self.tm_model.model_comm.create_model_instance(device_id)
+        if hasattr(self.tm_model, 'grammar'):
+            model_inst.set_grammar(self.tm_model.grammar)
+
         return model_inst

     def _get_extra_output_processors(self, outputs: Dict[str, torch.Tensor], gen_config: GenerationConfig,

src/turbomind/engine/model_request.cc

Lines changed: 7 additions & 2 deletions

@@ -127,8 +127,8 @@ auto ModelRequest::Forward(InputParam param, std::function<void()> cb) -> OutputParam
     r->output_ids      = outputs_->at("output_ids");
     r->sequence_length = outputs_->at("sequence_length");

-    if (compiled_grammar_) {
-        r->matcher = std::make_shared<xgrammar::GrammarMatcher>(*compiled_grammar_);
+    if (grammar_) {
+        r->matcher = std::make_shared<xgrammar::GrammarMatcher>(*grammar_);
     }

     // Keep a weak reference for canceling the request
@@ -139,4 +139,9 @@
     return OutputParam{outputs_, state, metrics};
 }

+void ModelRequest::setGrammar(std::shared_ptr<xgrammar::CompiledGrammar> grammar)
+{
+    grammar_ = grammar;
+}
+
 }  // namespace turbomind
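Note that the matcher is built per request from the shared CompiledGrammar: compilation is expensive and reusable, while matcher state advances token by token and cannot be shared. What that per-request matcher does during decoding is sketched below against the upstream xgrammar Python API (the helper function is illustrative, not part of this commit):

    import torch
    import xgrammar as xgr  # upstream pip package mirroring the C++ API used here

    def constrained_decode_step(matcher: xgr.GrammarMatcher, logits: torch.Tensor,
                                vocab_size: int) -> int:
        # Zero out every token the grammar cannot accept at this position.
        bitmask = xgr.allocate_token_bitmask(1, vocab_size)
        matcher.fill_next_token_bitmask(bitmask)
        xgr.apply_token_bitmask_inplace(logits, bitmask.to(logits.device))
        # Sample (greedily here) and advance the matcher state.
        token_id = int(torch.argmax(logits, dim=-1))
        matcher.accept_token(token_id)
        return token_id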

src/turbomind/engine/model_request.h

Lines changed: 2 additions & 1 deletion

@@ -40,6 +40,7 @@ class ModelRequest {
     };

     OutputParam Forward(InputParam param, std::function<void()> cb);
+    void setGrammar(std::shared_ptr<xgrammar::CompiledGrammar> grammar);

 protected:
     Gateway* const gateway_;
@@ -56,7 +57,7 @@

     std::shared_ptr<TensorMap> inputs_;
     std::shared_ptr<TensorMap> outputs_;
-    std::shared_ptr<xgrammar::CompiledGrammar> compiled_grammar_;
+    std::shared_ptr<xgrammar::CompiledGrammar> grammar_;
 };

 }  // namespace turbomind

src/turbomind/python/bind.cpp

Lines changed: 20 additions & 27 deletions

@@ -588,6 +588,12 @@ PYBIND11_MODULE(_turbomind, m)
              py::call_guard<py::gil_scoped_release>(),
              "device_id"_a,
              "tags"_a)
+        .def("set_grammar",
+             &LlamaTritonModel::setGrammar,
+             py::call_guard<py::gil_scoped_release>(),
+             "grammar"_a)
         .def("__str__", &LlamaTritonModel::toString)
         .def("__repr__", &LlamaTritonModel::toString)
         .def("get_tensor_para_size", &LlamaTritonModel::getTensorParaSize)
@@ -697,31 +703,18 @@ PYBIND11_MODULE(_xgrammar, m)
             return TokenizerInfo::DeserializeJSON(str, CommonEncodedVocabType(encoded_vocab));
         });

-    py::class_<Grammar> pyGrammar(m, "Grammar");
-    pyGrammar
-        .def("to_string", &Grammar::ToString)
-
-        .def_static("from_ebnf", &Grammar::FromEBNF)
-
-        .def_static("from_json_schema",
-                    &Grammar::FromJSONSchema,
-                    py::arg("schema"),
-                    py::arg("any_whitespace"),
-                    py::arg("indent")     = py::none(),
-                    py::arg("separators") = py::none(),
-                    py::arg("strict_mode"),
-                    py::arg("print_converted_ebnf"),
-                    py::call_guard<py::gil_scoped_release>())
-
-        .def_static("from_regex", &Grammar::FromRegex, py::call_guard<py::gil_scoped_release>())
-
-        .def_static("builtin_json_grammar", &Grammar::BuiltinJSONGrammar)
-
-        .def_static("union", &Grammar::Union, py::call_guard<py::gil_scoped_release>())
-
-        .def_static("concat", &Grammar::Concat, py::call_guard<py::gil_scoped_release>())
-
-        .def("serialize_json", &Grammar::SerializeJSON)
-
-        .def_static("deserialize_json", &Grammar::DeserializeJSON);
+    py::class_<GrammarCompiler> pyGrammarCompiler(m, "GrammarCompiler");
+    pyGrammarCompiler
+        // Named ctor args so Python can pass max_threads= as a keyword;
+        // defaults assumed to match xgrammar's GrammarCompiler.
+        .def(py::init<const TokenizerInfo&, int, bool, int64_t>(),
+             py::arg("tokenizer_info"),
+             py::arg("max_threads")      = 8,
+             py::arg("cache_enabled")    = true,
+             py::arg("max_memory_bytes") = -1)
+        .def("compile_json_schema",
+             &GrammarCompiler::CompileJSONSchema,
+             py::call_guard<py::gil_scoped_release>(),
+             py::arg("schema"),
+             py::arg("any_whitespace") = false,
+             py::arg("indent")         = py::none(),
+             py::arg("separators")     = py::none(),
+             py::arg("strict_mode")    = true)
+        .def("compile_regex",
+             &GrammarCompiler::CompileRegex,
+             py::call_guard<py::gil_scoped_release>(),
+             py::arg("regex"));
 }
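With the standalone Grammar class dropped, the _xgrammar module's surface shrinks to what the engine actually calls: build a TokenizerInfo, construct a GrammarCompiler, compile a schema or a regex. A quick sketch of the resulting API (tokenizer id illustrative; constructor keyword defaults as assumed above):

    import _xgrammar as _xgr
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained('internlm/internlm2_5-7b-chat')  # illustrative
    info = _xgr.TokenizerInfo.from_huggingface(tokenizer, vocab_size=None)
    compiler = _xgr.GrammarCompiler(info, max_threads=4)

    compiled_schema = compiler.compile_json_schema('{"type": "string"}')
    compiled_regex = compiler.compile_regex(r'[0-9]{4}-[0-9]{2}-[0-9]{2}')  # e.g. ISO dates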

src/turbomind/triton_backend/llama/LlamaTritonModel.cc

Lines changed: 9 additions & 1 deletion

@@ -454,8 +454,12 @@ std::unique_ptr<ModelRequest> LlamaTritonModel::createModelInstance(int device_id)
 {
     FT_CHECK(engines_[device_id] != nullptr);

-    return std::make_unique<ModelRequest>(
+    auto model_inst = std::make_unique<ModelRequest>(
         gateway_.get(), dtype_, engine_param_.session_len, model_param_.vocab_size, model_param_.hidden_units);
+    if (grammar_) {
+        model_inst->setGrammar(grammar_);
+    }
+    return model_inst;
 }

 void LlamaTritonModel::createSharedWeights(int device_id, int rank)
@@ -666,4 +670,8 @@ int LlamaTritonModel::getPipelineParaSize()
     return 1;
 }

+void LlamaTritonModel::setGrammar(const xgrammar::CompiledGrammar& grammar)
+{
+    grammar_ = std::make_shared<xgrammar::CompiledGrammar>(grammar);
+}
+
 }  // namespace turbomind
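setGrammar copies the CompiledGrammar into a shared_ptr, so one compiled grammar is shared across all instances and requests while each request still builds its own matcher. Condensed from the pieces above, the hand-off this commit wires up (Python-side names taken from turbomind.py):

    # One grammar, compiled once (TurboMind.__init__):
    compiled = compiler.compile_json_schema(schema)
    # Attached when an instance is created (_create_model_instance):
    model_inst = model_comm.create_model_instance(device_id)
    model_inst.set_grammar(compiled)
    # Per request, ModelRequest::Forward() then builds a fresh
    # xgrammar::GrammarMatcher from the shared grammar.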

src/turbomind/triton_backend/llama/LlamaTritonModel.h

Lines changed: 5 additions & 0 deletions

@@ -24,6 +24,8 @@
 #include <string>
 #include <unordered_map>

+#include <xgrammar/xgrammar.h>
+
 #include "src/turbomind/comm/device_comm.h"

 #include "src/turbomind/engine/gateway.h"
@@ -59,6 +61,8 @@ class LlamaTritonModel {

     void wakeup(int device_id, const std::vector<std::string>& tags);

+    void setGrammar(const xgrammar::CompiledGrammar& grammar);
+
     std::string toString();

     int getTensorParaSize();
@@ -96,6 +100,7 @@ class LlamaTritonModel {

     std::string model_name_;
     std::string model_dir_;
+    std::shared_ptr<xgrammar::CompiledGrammar> grammar_;
 };

 }  // namespace turbomind
