15 changes: 15 additions & 0 deletions common/config_models.py
@@ -271,6 +271,21 @@ class ModelConfig(BaseConfigModel):
"Enables vision support if the model supports it. (default: False)"
),
)
reasoning: bool = Field(
False,
description=(
"Enable the reasoning parser (default: False).\n"
"Split response message into reasoning_content and content fields."
),
)
reasoning_start_token: str = Field(
"<think>",
description=("Start token for the reasoning parser (default: <think>)."),
)
reasoning_end_token: str = Field(
"</think>",
description=("End token for the reasoning parser (default: </think>)."),
)

_metadata: Metadata = PrivateAttr(Metadata())
model_config = ConfigDict(protected_namespaces=())
10 changes: 10 additions & 0 deletions config_sample.yml
@@ -139,6 +139,16 @@ model:
  # Enables vision support if the model supports it. (default: False)
  vision: false

  # Enable the reasoning parser (default: False).
  # Only enable this for reasoning models (e.g. the deepseek-r1 series).
  reasoning: false

  # The start token for reasoning content (default: "<think>")
  reasoning_start_token: "<think>"

  # The end token for reasoning content (default: "</think>")
  reasoning_end_token: "</think>"

# Options for draft models (speculative decoding)
# This will use more VRAM!
draft_model:
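For orientation, here is a minimal client-side sketch of what the split response could look like once reasoning is enabled in the config above. This is a hedged example: the host, port, auth handling, and model name are assumptions for illustration, not part of this change; it only relies on the OpenAI-compatible /v1/chat/completions route these files implement.

import requests

# Assumed local tabbyAPI instance; adjust host, port, and auth headers for your setup.
resp = requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "model": "my-reasoning-model",  # hypothetical model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
    },
)
message = resp.json()["choices"][0]["message"]
print(message.get("reasoning_content"))  # text found between the reasoning tokens
print(message["content"])                # remaining content with the tokens stripped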
1 change: 1 addition & 0 deletions endpoints/OAI/types/chat_completion.py
@@ -31,6 +31,7 @@ class ChatCompletionMessagePart(BaseModel):
class ChatCompletionMessage(BaseModel):
    role: str = "user"
    content: Optional[Union[str, List[ChatCompletionMessagePart]]] = None
    reasoning_content: Optional[str] = None
    tool_calls: Optional[List[ToolCall]] = None
    tool_calls_json: SkipJsonSchema[Optional[str]] = None

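As a quick illustration of the type change, a sketch of how the extended message can now be populated and serialized; the field values are invented.

from endpoints.OAI.types.chat_completion import ChatCompletionMessage

msg = ChatCompletionMessage(
    role="assistant",
    reasoning_content="Adding 2 and 2 gives 4.",  # hypothetical reasoning text
    content="The answer is 4.",
)
print(msg.model_dump_json())  # now includes the reasoning_content field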
68 changes: 62 additions & 6 deletions endpoints/OAI/utils/chat_completion.py
@@ -17,6 +17,7 @@
    handle_request_error,
    request_disconnect_loop,
)
from common.tabby_config import config
from common.utils import unwrap
from endpoints.OAI.types.chat_completion import (
    ChatCompletionLogprobs,
@@ -33,6 +34,25 @@
from endpoints.OAI.utils.tools import ToolCallProcessor


def _extract_think_content(text: str) -> tuple[Optional[str], Optional[str]]:
    """Split the text into the reasoning content (between the configured
    reasoning tokens) and the remaining content. Only used in non-streaming mode."""
    if (
        config.model.reasoning_start_token not in text
        and config.model.reasoning_end_token not in text
    ):
        return None, text
    elif config.model.reasoning_start_token in text:
        start_reasoning = text.split(config.model.reasoning_start_token)[1]
        reasoning_content = start_reasoning.split(config.model.reasoning_end_token)[0]
        content = start_reasoning.split(config.model.reasoning_end_token)[1]
        return reasoning_content.strip(), content.strip()
    else:
        reasoning_content = text.split(config.model.reasoning_end_token)[0]
        content = text.split(config.model.reasoning_end_token)[1]
        return reasoning_content.strip(), content.strip()
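
# Illustrative behaviour of the helper above, assuming the default
# "<think>"/"</think>" tokens (inputs are invented, not real generations):
#   _extract_think_content("<think>2 + 2 = 4</think>The answer is 4.")
#       -> ("2 + 2 = 4", "The answer is 4.")
#   _extract_think_content("No reasoning tokens here.")
#       -> (None, "No reasoning tokens here.")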


def _create_response(
    request_id: str, generations: List[dict], model_name: Optional[str]
):
@@ -43,9 +63,16 @@ def _create_response(

    choices = []
    for index, generation in enumerate(generations):
        message = ChatCompletionMessage(
            role="assistant", content=unwrap(generation.get("text"), "")
        )
        if config.model.reasoning:
            raw_content = unwrap(generation.get("text"), "")
            reasoning_content, content = _extract_think_content(raw_content)
            message = ChatCompletionMessage(
                role="assistant", reasoning_content=reasoning_content, content=content
            )
        else:
            message = ChatCompletionMessage(
                role="assistant", content=unwrap(generation.get("text"), "")
            )

        tool_calls = generation["tool_calls"]
        if tool_calls:
@@ -110,6 +137,7 @@ def _create_stream_chunk(
    generation: Optional[dict] = None,
    model_name: Optional[str] = None,
    is_usage_chunk: bool = False,
    is_reasoning_chunk: bool = False,
):
    """Create a chat completion stream chunk from the provided text."""

@@ -144,8 +172,14 @@ choices.append(choice)
        choices.append(choice)

    else:
        message = ChatCompletionMessage(
            role="assistant", content=unwrap(generation.get("text"), "")
        message = (
            ChatCompletionMessage(
                role="assistant", reasoning_content=unwrap(generation.get("text"), "")
            )
            if is_reasoning_chunk
            else ChatCompletionMessage(
                role="assistant", content=unwrap(generation.get("text"), "")
            )
        )

        logprob_response = None
@@ -337,6 +371,8 @@ async def stream_generate_chat_completion(
        # We need to keep track of the text generated so we can resume the tool calls
        current_generation_text = ""

        is_reasoning_chunk = config.model.reasoning

        # Consumer loop
        while True:
            if disconnect_task.done():
@@ -364,8 +400,28 @@
            if isinstance(generation, Exception):
                raise generation

            if (
                unwrap(generation.get("text"), "") == config.model.reasoning_start_token
                and config.model.reasoning
            ):
                # Update reasoning chunk flag
                is_reasoning_chunk = True
                # And skip this token
                continue
            if (
                unwrap(generation.get("text"), "") == config.model.reasoning_end_token
                and config.model.reasoning
            ):
                # Update reasoning chunk flag
                is_reasoning_chunk = False
                # And skip this token
                continue

            response = _create_stream_chunk(
                request.state.id, generation, model_path.name
                request.state.id,
                generation,
                model_path.name,
                is_reasoning_chunk=is_reasoning_chunk,
            )
            yield response.model_dump_json()

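To round out the streaming path, here is a hedged client-side sketch of how the separated deltas could be consumed. It assumes the endpoint emits OpenAI-style "data:" SSE lines terminated by a [DONE] marker and that each choice's delta carries either reasoning_content or content, which is consistent with the chunk construction above but not verified here; the host, port, and model name are placeholders.

import json

import requests

# Assumed local tabbyAPI instance; adjust host, port, and auth headers for your setup.
with requests.post(
    "http://127.0.0.1:5000/v1/chat/completions",
    json={
        "model": "my-reasoning-model",  # hypothetical model name
        "messages": [{"role": "user", "content": "What is 2 + 2?"}],
        "stream": True,
    },
    stream=True,
) as resp:
    for line in resp.iter_lines():
        if not line or not line.startswith(b"data: "):
            continue
        payload = line[len(b"data: "):]
        if payload.strip() == b"[DONE]":
            break
        delta = json.loads(payload)["choices"][0]["delta"]
        if delta.get("reasoning_content"):
            print("[reasoning]", delta["reasoning_content"])
        elif delta.get("content"):
            print(delta["content"], end="", flush=True)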