Binary file removed llm_bench/.load_test.py.swp
Binary file not shown.
58 changes: 41 additions & 17 deletions llm_bench/load_test.py
@@ -25,6 +25,8 @@
 
 
 def add_custom_metric(name, value, length_value=0):
+    if name == "total_latency":
+        value = value * 1000
     events.request.fire(
         request_type="METRIC",
         name=name,
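A note on units (my reading of the diff, not stated in the PR): the helper now scales `total_latency` by 1000 internally, while the call site in `generate_text` further down still passes `dur_total * 1000`. If `dur_total` is in seconds, as that call site suggests, the metric ends up scaled twice, seconds to microseconds. A minimal sketch of the two-step scaling, with an illustrative value:

```python
# Sketch only: dur_total assumed to be in seconds, per the existing
# `add_custom_metric("total_latency", dur_total * 1000)` call site.
dur_total = 1.5                  # seconds (illustrative value)

value = dur_total * 1000         # call site converts to ms: 1500.0
name = "total_latency"
if name == "total_latency":      # new branch inside add_custom_metric
    value = value * 1000         # scaled again: 1_500_000.0 (microseconds)

print(value)                     # the metric is fired with this value
```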
@@ -273,23 +275,45 @@ def format_payload(self, prompt, max_tokens, images):
     def parse_output_json(self, data, prompt):
         usage = data.get("usage", None)
 
-        assert len(data["choices"]) == 1, f"Too many choices {len(data['choices'])}"
-        choice = data["choices"][0]
-        if self.parsed_options.chat:
-            if self.parsed_options.stream:
-                text = choice["delta"].get("content", "")
-            else:
-                text = choice["message"]["content"]
-        else:
-            text = choice["text"]
-
-        logprobs = choice.get("logprobs", None)
-        return ChunkMetadata(
-            text=text,
-            logprob_tokens=len(logprobs["tokens"]) if logprobs else None,
-            usage_tokens=usage["completion_tokens"] if usage else None,
-            prompt_usage_tokens=usage.get("prompt_tokens", None) if usage else None,
-        )
+        if self.parsed_options.n > 1:
+            texts = []
+            for choice in data["choices"]:
+                if self.parsed_options.chat:
+                    if self.parsed_options.stream:
+                        text = choice["delta"].get("content", "")
+                    else:
+                        text = choice["message"]["content"]
+                else:
+                    text = choice["text"]
+                texts.append(text)
+
+            combined_text = "\n".join(texts)
+
+            logprobs = data["choices"][0].get("logprobs", None)
+
+            return ChunkMetadata(
+                text=combined_text,
+                logprob_tokens=len(logprobs["tokens"]) if logprobs else None,
+                usage_tokens=usage["completion_tokens"] if usage else None,
+                prompt_usage_tokens=usage.get("prompt_tokens", None) if usage else None,
+            )
+        else:
+            choice = data["choices"][0]
+            if self.parsed_options.chat:
+                if self.parsed_options.stream:
+                    text = choice["delta"].get("content", "")
+                else:
+                    text = choice["message"]["content"]
+            else:
+                text = choice["text"]
+
+            logprobs = choice.get("logprobs", None)
+            return ChunkMetadata(
+                text=text,
+                logprob_tokens=len(logprobs["tokens"]) if logprobs else None,
+                usage_tokens=usage["completion_tokens"] if usage else None,
+                prompt_usage_tokens=usage.get("prompt_tokens", None) if usage else None,
+            )
 
 
 class FireworksProvider(OpenAIProvider):
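For context, here is a minimal sketch of what the new `n > 1` branch produces, using a hypothetical OpenAI-style response payload (the values are illustrative, not from the PR). The texts of all choices are joined with newlines into one ChunkMetadata, logprobs are read from the first choice only, and the usage counters cover the whole request:

```python
# Hypothetical non-chat, non-streaming response with n=2 choices.
data = {
    "choices": [
        {"text": "first sampled completion", "logprobs": None},
        {"text": "second sampled completion", "logprobs": None},
    ],
    "usage": {"completion_tokens": 128, "prompt_tokens": 16},
}

# The n > 1 branch, inlined from the diff for the non-chat case:
texts = [choice["text"] for choice in data["choices"]]
combined_text = "\n".join(texts)               # both completions, newline-joined
logprobs = data["choices"][0].get("logprobs")  # only choice 0 is inspected
usage = data["usage"]

print(combined_text)
print(usage["completion_tokens"])  # 128: completion tokens across all choices
```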
Expand Down Expand Up @@ -823,8 +847,8 @@ def generate_text(self):
add_custom_metric("time_to_first_token", dur_first_token * 1000)
add_custom_metric("total_latency", dur_total * 1000)
if num_tokens:
if num_tokens != max_tokens:
print(f"WARNING: wrong number of tokens: {num_tokens}, expected {max_tokens}")
if num_tokens != max_tokens * self.environment.parsed_options.n:
print(f"WARNING: wrong number of tokens: {num_tokens}, expected {max_tokens * self.environment.parsed_options.n} (max_tokens={max_tokens}, n={self.environment.parsed_options.n})")
add_custom_metric("num_tokens", num_tokens)
add_custom_metric("latency_per_token", dur_generation / num_tokens * 1000, num_tokens)
add_custom_metric(
Expand Down
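The arithmetic behind the updated check is straightforward: with n samples per request, a fully generated response should carry max_tokens completion tokens per choice, so max_tokens * n in total. A quick sketch with illustrative values:

```python
max_tokens = 256   # per-choice generation budget (illustrative value)
n = 4              # choices sampled per request (illustrative value)

expected = max_tokens * n          # 1024 completion tokens per request
num_tokens = 1000                  # e.g. reported by the response's usage field

if num_tokens != expected:
    print(f"WARNING: wrong number of tokens: {num_tokens}, "
          f"expected {expected} (max_tokens={max_tokens}, n={n})")
```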