
Commit 1a55fc2

fix: enable openai guided decoding function for turbomind

1 parent 297effb

File tree

3 files changed: +15 -17 lines changed

  lmdeploy/messages.py
  lmdeploy/serve/openai/api_server.py
  lmdeploy/serve/proxy/proxy.py

lmdeploy/messages.py

Lines changed: 1 addition & 1 deletion
@@ -63,7 +63,7 @@ class GenerationConfig:
         around special tokens. The behavior of Fast tokenizers is to have
         this to False. This is setup to True in slow tokenizers.
     logprobs (int): Number of log probabilities to return per output token.
-    response_format (Dict): Only pytorch backend support formatting
+    response_format (Dict): Generate responses according to given formatting.
         response. Examples:
         {
         "type": "json_schema",

lmdeploy/serve/openai/api_server.py

Lines changed: 11 additions & 13 deletions
@@ -129,17 +129,17 @@ def create_error_response(status: HTTPStatus, message: str, error_type='invalid_
 async def check_request(request) -> Optional[JSONResponse]:
     """Check if a request is valid."""
     if hasattr(request, 'model') and request.model not in get_model_list():
-        return create_error_response(HTTPStatus.NOT_FOUND, f'The model `{request.model}` does not exist.')
+        return create_error_response(HTTPStatus.NOT_FOUND, f'The model {request.model!r} does not exist.')
     if hasattr(request, 'n') and request.n <= 0:
-        return create_error_response(HTTPStatus.BAD_REQUEST, f'The n `{request.n}` must be a positive int.')
+        return create_error_response(HTTPStatus.BAD_REQUEST, f'The n {request.n!r} must be a positive int.')
     if hasattr(request, 'top_p') and not (request.top_p > 0 and request.top_p <= 1):
-        return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p `{request.top_p}` must be in (0, 1].')
+        return create_error_response(HTTPStatus.BAD_REQUEST, f'The top_p {request.top_p!r} must be in (0, 1].')
     if hasattr(request, 'top_k') and request.top_k < 0:
         return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     f'The top_k `{request.top_k}` cannot be a negative integer.')
+                                     f'The top_k {request.top_k!r} cannot be a negative integer.')
     if hasattr(request, 'temperature') and not (request.temperature <= 2 and request.temperature >= 0):
         return create_error_response(HTTPStatus.BAD_REQUEST,
-                                     f'The temperature `{request.temperature}` must be in [0, 2]')
+                                     f'The temperature {request.temperature!r} must be in [0, 2]')
     return


@@ -315,8 +315,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         1.0 means no penalty
     - stop (str | List[str] | None): To stop generating further
         tokens. Only accept stop words that's encoded to one token idex.
-    - response_format (Dict | None): Only pytorch backend support formatting
-        response. Examples: `{"type": "json_schema", "json_schema": {"name":
+    - response_format (Dict | None): To generate response according to given
+        schema. Examples: `{"type": "json_schema", "json_schema": {"name":
         "test","schema": {"properties": {"name": {"type": "string"}},
         "required": ["name"], "type": "object"}}}`
         or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`

@@ -365,7 +365,7 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     if error_check_ret is not None:
         return error_check_ret
     if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
-        return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
+        return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

     model_name = request.model
     adapter_name = None

@@ -385,8 +385,6 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
     gen_logprobs = request.top_logprobs
     response_format = None
     if request.response_format and request.response_format.type != 'text':
-        if VariableInterface.async_engine.backend != 'pytorch':
-            return create_error_response(HTTPStatus.BAD_REQUEST, 'only pytorch backend can use response_format now')
         response_format = request.response_format.model_dump()

     if request.logit_bias is not None:

@@ -717,7 +715,7 @@ async def completions_v1(request: CompletionRequest, raw_request: Request = None
     if error_check_ret is not None:
         return error_check_ret
     if VariableInterface.async_engine.id2step.get(request.session_id, 0) != 0:
-        return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id `{request.session_id}` is occupied.')
+        return create_error_response(HTTPStatus.BAD_REQUEST, f'The session_id {request.session_id!r} is occupied.')

     model_name = request.model
     adapter_name = None

@@ -1325,8 +1323,8 @@ def serve(model_path: str,
     VariableInterface.proxy_url = proxy_url
     VariableInterface.api_server_url = f'{http_or_https}://{server_name}:{server_port}'  # noqa
     for i in range(3):
-        print(f'HINT: Please open \033[93m\033[1m{http_or_https}://'
-              f'{server_name}:{server_port}\033[0m in a browser for detailed api'
+        print(f'HINT: Please open \033[93m\033[1m{http_or_https}://'  # noqa: E231
+              f'{server_name}:{server_port}\033[0m in a browser for detailed api'  # noqa: E231
              ' usage!!!')
     uvicorn.run(app=app,
                 host=server_name,
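
With the backend check removed, the server accepts response_format requests on the turbomind backend as well. A hedged sketch of exercising the OpenAI-compatible endpoint follows; it assumes an api_server already running (localhost:23333 is used here) and a placeholder model name, which must match an entry returned by /v1/models.

from openai import OpenAI

client = OpenAI(base_url='http://localhost:23333/v1', api_key='none')
resp = client.chat.completions.create(
    model='internlm2',  # placeholder; use a name listed by /v1/models
    messages=[{'role': 'user', 'content': 'Give me a short user profile.'}],
    # Same json_schema example as in the docstring above.
    response_format={
        'type': 'json_schema',
        'json_schema': {
            'name': 'test',
            'schema': {
                'properties': {'name': {'type': 'string'}},
                'required': ['name'],
                'type': 'object',
            },
        },
    },
)
print(resp.choices[0].message.content)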

lmdeploy/serve/proxy/proxy.py

Lines changed: 3 additions & 3 deletions
@@ -312,7 +312,7 @@ async def check_request_model(self, model_name) -> Optional[JSONResponse]:
         """Check if a request is valid."""
         if model_name in self.model_list:
             return
-        ret = create_error_response(HTTPStatus.NOT_FOUND, f'The model `{model_name}` does not exist.')
+        ret = create_error_response(HTTPStatus.NOT_FOUND, f'The model {model_name!r} does not exist.')
         return ret

     def handle_unavailable_model(self, model_name):

@@ -538,8 +538,8 @@ async def chat_completions_v1(request: ChatCompletionRequest, raw_request: Reque
         1.0 means no penalty
     - stop (str | List[str] | None): To stop generating further
         tokens. Only accept stop words that's encoded to one token idex.
-    - response_format (Dict | None): Only pytorch backend support formatting
-        response. Examples: `{"type": "json_schema", "json_schema": {"name":
+    - response_format (Dict | None): To generate response according to given
+        schema. Examples: `{"type": "json_schema", "json_schema": {"name":
         "test","schema": {"properties": {"name": {"type": "string"}},
         "required": ["name"], "type": "object"}}}`
         or `{"type": "regex_schema", "regex_schema": "call me [A-Za-z]{1,10}"}`
