15 changes: 15 additions & 0 deletions common/config_models.py
@@ -271,6 +271,21 @@ class ModelConfig(BaseConfigModel):
"Enables vision support if the model supports it. (default: False)"
),
)
reasoning: bool = Field(
False,
description=(
"Enable the reasoning parser (default: False).\n"
"Split response message into reasoning_content and content fields."
),
)
reasoning_start_token: str = Field(
"<think>",
description=("Start token for the reasoning parser (default: <think>)."),
)
reasoning_end_token: str = Field(
"</think>",
description=("End token for the reasoning parser (default: </think>)."),
)

_metadata: Metadata = PrivateAttr(Metadata())
model_config = ConfigDict(protected_namespaces=())
10 changes: 10 additions & 0 deletions config_sample.yml
@@ -139,6 +139,16 @@ model:
  # Enables vision support if the model supports it. (default: False)
  vision: false

  # Enable the reasoning parser (default: False).
  # Only enable this for reasoning models (e.g. the deepseek-r1 series).
  reasoning: false

  # The start token for reasoning content (default: "<think>")
  reasoning_start_token: "<think>"

  # The end token for reasoning content (default: "</think>")
  reasoning_end_token: "</think>"

# Options for draft models (speculative decoding)
# This will use more VRAM!
draft_model:
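For orientation, here is a minimal client-side sketch of what the split response could look like once reasoning is enabled in the config above. This is a hedged example: the host, port, auth handling, and model name are assumptions for illustration, not part of this change; it only relies on the OpenAI-compatible /v1/chat/completions route these files implement.

import requests

# Assumed local tabbyAPI instance; adjust host, port, and auth headers for your setup.
resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "model": "my-reasoning-model",  # hypothetical model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
    },
)
message = resp.json()["choices"][0]["message"]
print(message.get("reasoning_content"))  # text found between the reasoning tokens
print(message["content"])                # remaining content with the tokens stripped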
1 change: 1 addition & 0 deletions endpoints/OAI/types/chat_completion.py
@@ -31,6 +31,7 @@ class ChatCompletionMessagePart(BaseModel):
class ChatCompletionMessage(BaseModel):
    role: str = "user"
    content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None
    reasoning_content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None
    tool_calls_json: SkipJsonSchema[Optional[str]] = None

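As a quick illustration of the type change, a sketch of how the extended message can now be populated and serialized; the field values are invented.

from endpoints.OAI.types.chat_completion import ChatCompletionMessage

msg = ChatCompletionMessage(
    role="assistant",
    reasoning_content="Adding 2 and 2 gives 4.",  # hypothetical reasoning text
    content="The answer is 4.",
)
print(msg.model_dump_json())  # now includes the reasoning_content field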
68 changes: 62 additions & 6 deletions endpoints/OAI/utils/chat_completion.py
@@ -17,6 +17,7 @@
    handle_request_error,
    request_disconnect_loop,
)
from common.tabby_config import config
from common.utils import unwrap
from endpoints.OAI.types.chat_completion import (
    ChatCompletionLogprobs,
@@ -33,6 +34,25 @@
from endpoints.OAI.utils.tools import ToolCallProcessor


def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
    """Split the text into the reasoning content (between the configured
    reasoning tokens) and the remaining content. Only used in non-streaming mode."""
    if (
        config.model.reasoning_start_token not in text
        and config.model.reasoning_end_token not in text
    ):
        return None, text
    elif config.model.reasoning_start_token in text:
        start_reasoning = text.split(config.model.reasoning_start_token)[1]
        reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0]
        content = start_reasoning.split(config.model.reasoning_end_token)[1]
        return reasoning_content.strip(), content.strip()
    else:
        reasoning_content = text.split(config.model.reasoning_end_token)[0]
        content = text.split(config.model.reasoning_end_token)[1]
        return reasoning_content.strip(), content.strip()
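
# Illustrative behaviour of the helper above, assuming the default
# "<think>"/"</think>" tokens (inputs are invented, not real generations):
#   _extract_think_content("<think>2 + 2 = 4</think>The answer is 4.")
#       -> ("2 + 2 = 4", "The answer is 4.")
#   _extract_think_content("No reasoning tokens here.")
#       -> (None, "No reasoning tokens here.")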


def _create_response(
    request_id: str, generations: List[dict], model_name: Optional[str]
):
@@ -43,9 +63,16 @@ def _create_response(

    choices = []
    for index, generation in enumerate(generations):
        message = ChatCompletionMessage(
            role="assistant", content=unwrap(generation.get("text"), "")
        )
        if config.model.reasoning:
            raw_content = unwrap(generation.get("text"), "")
            reasoning_content, content = _extract_think_content(raw_content)
            message = ChatCompletionMessage(
                role="assistant", reasoning_content=reasoning_content, content=content
            )
        else:
            message = ChatCompletionMessage(
                role="assistant", content=unwrap(generation.get("text"), "")
            )

        tool_calls = generation["tool_calls"]
        if tool_calls:
@@ -110,6 +137,7 @@ def _create_stream_chunk(
    generation: Optional[dict] = None,
    model_name: Optional[str] = None,
    is_usage_chunk: bool = False,
    is_reasoning_chunk: bool = False,
):
    """Create a chat completion stream chunk from the provided text."""

@@ -144,8 +172,14 @@ choices.append(choice)
        choices.append(choice)

    else:
        message = ChatCompletionMessage(
            role="assistant", content=unwrap(generation.get("text"), "")
        message = (
            ChatCompletionMessage(
                role="assistant", reasoning_content=unwrap(generation.get("text"), "")
            )
            if is_reasoning_chunk
            else ChatCompletionMessage(
                role="assistant", content=unwrap(generation.get("text"), "")
            )
        )

        logprob_response = None
@@ -337,6 +371,8 @@ async def stream_generate_chat_completion(
        # We need to keep track of the text generated so we can resume the tool calls
        current_generation_text = ""

        is_reasoning_chunk = config.model.reasoning

        # Consumer loop
        while True:
            if disconnect_task.done():
@@ -364,8 +400,28 @@
            if isinstance(generation, Exception):
                raise generation

            if (
                unwrap(generation.get("text"), "") == config.model.reasoning_start_token
                and config.model.reasoning
            ):
                # Update reasoning chunk flag
                is_reasoning_chunk = True
                # And skip this token
                continue
            if (
                unwrap(generation.get("text"), "") == config.model.reasoning_end_token
                and config.model.reasoning
            ):
                # Update reasoning chunk flag
                is_reasoning_chunk = False
                # And skip this token
                continue

            response = _create_stream_chunk(
                request.state.id, generation, model_path.name
                request.state.id,
                generation,
                model_path.name,
                is_reasoning_chunk=is_reasoning_chunk,
            )
            yield response.model_dump_json()

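To round out the streaming path, here is a hedged client-side sketch of how the separated deltas could be consumed. It assumes the endpoint emits OpenAI-style "data:" SSE lines terminated by a [DONE] marker and that each choice's delta carries either reasoning_content or content, which is consistent with the chunk construction above but not verified here; the host, port, and model name are placeholders.

import json

import requests

# Assumed local tabbyAPI instance; adjust host, port, and auth headers for your setup.
with requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "model": "my-reasoning-model",  # hypothetical model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload.strip() == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[reasoning]", delta["reasoning_content"])
        elif delta.get("content"):
            print(delta["content"], end="", flush=True)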