diff --git a/README.md b/README.md
index 2a644fd..0425c81 100644
--- a/README.md
+++ b/README.md
@@ -359,6 +359,13 @@ response = openai.ChatCompletion.create(
     stream=False
 )
 print(response.choices[0].message.content)
+
+# creates an embedding vector representing the input text
+response = openai.Embedding.create(
+    model="text-embedding-ada-002",
+    input="你好"
+)
+print(response.data[0].embedding)
 ```
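A small usage sketch for the README addition above (not part of the diff): it assumes the FastAPI server from this repo is running locally at `http://localhost:8000/v1`, exactly as the surrounding README examples do, and uses numpy only for the dot product. Because the endpoint L2-normalizes each vector, cosine similarity reduces to a dot product.

```python
# Minimal sketch: compare two embeddings returned by the new /v1/embeddings endpoint.
import numpy as np
import openai

openai.api_base = "http://localhost:8000/v1"
openai.api_key = "none"

# One request with two inputs; the server returns one embedding entry per input,
# each already normalized to unit L2 length.
response = openai.Embedding.create(
    model="text-embedding-ada-002",
    input=["你好", "Hello"]
)
vectors = [np.array(item.embedding) for item in response.data]

# Unit-length vectors: the dot product equals cosine similarity.
print("cosine similarity:", float(np.dot(vectors[0], vectors[1])))
```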
diff --git a/README_CN.md b/README_CN.md
index af4d8f9..e158780 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -362,6 +362,13 @@ response = openai.ChatCompletion.create(
     stream=False
 )
 print(response.choices[0].message.content)
+
+# 创建输入文本的嵌入向量
+response = openai.Embedding.create(
+    model="text-embedding-ada-002",
+    input="你好"
+)
+print(response.data[0].embedding)
 ```
diff --git a/README_JA.md b/README_JA.md
index f178493..7808392 100644
--- a/README_JA.md
+++ b/README_JA.md
@@ -342,7 +342,7 @@ import openai
 openai.api_base = "http://localhost:8000/v1"
 openai.api_key = "none"
 
-# create a request activating streaming response
+# ストリーミングレスポンスをアクティブ化するリクエストを作成してください。
 for chunk in openai.ChatCompletion.create(
     model="Qwen-7B",
     messages=[
@@ -353,7 +353,7 @@ for chunk in openai.ChatCompletion.create(
     if hasattr(chunk.choices[0].delta, "content"):
         print(chunk.choices[0].delta.content, end="", flush=True)
 
-# create a request not activating streaming response
+# ストリーミングレスポンスをアクティブ化しないリクエストを作成してください。
 response = openai.ChatCompletion.create(
     model="Qwen-7B",
     messages=[
@@ -362,6 +362,13 @@ response = openai.ChatCompletion.create(
     stream=False
 )
 print(response.choices[0].message.content)
+
+# 入力テキストを表す埋め込みベクトルを作成します
+response = openai.Embedding.create(
+    model="text-embedding-ada-002",
+    input="你好"
+)
+print(response.data[0].embedding)
 ```
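Besides raw strings, the server-side change below also accepts pre-tokenized input (a list of token ids, or a list of such lists) and converts it back to text with tiktoken before re-encoding it with the model's own tokenizer. A sketch of that round trip, using the tiktoken package directly; the encoding is looked up from the `text-embedding-ada-002` model id used in the examples above:

```python
# Sketch of the tiktoken round trip the /v1/embeddings handler performs
# when the request body contains token ids instead of raw strings.
import tiktoken

encoding = tiktoken.encoding_for_model("text-embedding-ada-002")

token_ids = encoding.encode("你好")   # a client may send these ids as `input`
text = encoding.decode(token_ids)     # the server decodes them back to "你好"

print(token_ids)
print(text)
```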
diff --git a/openai_api.py b/openai_api.py
index da105f3..82abb6f 100644
--- a/openai_api.py
+++ b/openai_api.py
@@ -5,6 +5,7 @@
 
 from argparse import ArgumentParser
 import time
+import tiktoken
 import torch
 import uvicorn
 from pydantic import BaseModel, Field
@@ -15,6 +16,7 @@
 from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
 from transformers.generation import GenerationConfig
 from sse_starlette.sse import ServerSentEvent, EventSourceResponse
+import torch.nn.functional as F
 
 
 @asynccontextmanager
@@ -51,6 +53,12 @@ class ModelList(BaseModel):
     data: List[ModelCard] = []
 
 
+class UsageInfo(BaseModel):
+    prompt_tokens: int = 0
+    total_tokens: int = 0
+    completion_tokens: Optional[int] = 0
+
+
 class ChatMessage(BaseModel):
     role: Literal["user", "assistant", "system"]
     content: str
@@ -89,6 +97,20 @@ class ChatCompletionResponse(BaseModel):
     created: Optional[int] = Field(default_factory=lambda: int(time.time()))
 
 
+class EmbeddingsRequest(BaseModel):
+    model: Optional[str] = None
+    engine: Optional[str] = None
+    input: Union[str, List[Any]]
+    user: Optional[str] = None
+
+
+class EmbeddingsResponse(BaseModel):
+    object: str = "list"
+    data: List[Dict[str, Any]]
+    model: str
+    usage: UsageInfo
+
+
 @app.get("/v1/models", response_model=ModelList)
 async def list_models():
     global model_args
@@ -133,6 +155,66 @@ async def create_chat_completion(request: ChatCompletionRequest):
     return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion")
 
 
+@app.post("/v1/embeddings", response_model=EmbeddingsResponse)
+async def create_embeddings(request: EmbeddingsRequest):
+    global model, tokenizer
+    input = request.input
+
+    # Decode inputs with different encodings into strings for subsequent model encoding.
+    if isinstance(input, str):
+        input = [input]
+    elif isinstance(input, list):
+        if isinstance(input[0], int):
+            decoding = tiktoken.model.encoding_for_model(request.model)
+            input = [decoding.decode(input)]
+        elif isinstance(input[0], list):
+            decoding = tiktoken.model.encoding_for_model(request.model)
+            input = [decoding.decode(text) for text in input]
+
+    embedding_data = []
+    total_tokens = 0
+    batch_size = 4
+    batches = [
+        input[i: min(i + batch_size, len(input))]
+        for i in range(0, len(input), batch_size)
+    ]
+
+    # Multi-input batch processing
+    for num_batch, batch in enumerate(batches):
+        embedding = []
+        token_num = 0
+
+        for text in batch:
+            input_ids = tokenizer.encode(text, return_tensors="pt").to(
+                model.device
+            )
+            model_output = model(input_ids, output_hidden_states=True)
+            data = model_output.hidden_states[-1][0]
+            data = F.normalize(torch.mean(data, dim=0), p=2, dim=0)
+            embedding.append(data.tolist())
+            token_num += len(input_ids[0])
+
+        embedding_data += [
+            {
+                "object": "embedding",
+                "embedding": emb,
+                "index": num_batch * batch_size + i,
+            }
+            for i, emb in enumerate(embedding)
+        ]
+        total_tokens += token_num
+
+    return EmbeddingsResponse(
+        data=embedding_data,
+        model=request.model,
+        usage=UsageInfo(
+            prompt_tokens=total_tokens,
+            total_tokens=total_tokens,
+            completion_tokens=None,
+        ),
+    ).dict(exclude_none=True)
+
+
 async def predict(query: str, history: List[List[str]], model_id: str):
     global model, tokenizer
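The core of `create_embeddings` is the pooling step: for each input, the last hidden state has shape `(seq_len, hidden_size)`; mean-pooling over the sequence dimension and L2-normalizing yields one unit-length vector per text. A standalone illustration with a random tensor (the sizes are illustrative only, not taken from any particular checkpoint):

```python
# Isolated sketch of the pooling used in create_embeddings above.
# In the endpoint, `last_hidden` comes from
# model(input_ids, output_hidden_states=True).hidden_states[-1][0].
import torch
import torch.nn.functional as F

seq_len, hidden_size = 7, 4096                  # illustrative shapes
last_hidden = torch.randn(seq_len, hidden_size)

pooled = torch.mean(last_hidden, dim=0)         # (hidden_size,): average over tokens
embedding = F.normalize(pooled, p=2, dim=0)     # unit L2 norm, so dot product == cosine

print(embedding.shape)                          # torch.Size([4096])
print(torch.linalg.norm(embedding))             # ~1.0
```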