
Commit b5a0806

Added possibility for passed model to have multiple outputs
1 parent d31c3aa commit b5a0806

4 files changed: +59 -41 lines changed

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp

Lines changed: 8 additions & 7 deletions
@@ -592,9 +592,9 @@ std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
     return ppp.build();
 }
 
-std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
-    const auto kStartOutputKVCacheLayers = 1u;
-    for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
+std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model,
+                                                     std::size_t start_idx_in_output) {
+    for (std::size_t i = start_idx_in_output; i < model->outputs().size(); ++i) {
         auto kvout = model->output(i);
         auto kvrslt = kvout.get_node();
         auto kvcat = kvrslt->inputs()[0].get_source_output().get_node();
@@ -1083,7 +1083,8 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
         }
     }
 
-    m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim};
+    m_kvcache_desc =
+        KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim, false, model->outputs().size()};
     LOG_DEBUG("Make prefill model with static shapes");
     if (use_chunk_prefill) {
         reshape_to_static(prefill_model,
@@ -1104,7 +1105,7 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
         reshape_sliced_head_to_static(lm_head_model, axes.batch);
     }
 
-    LOG_DEBUG("5.1, decompose GroupQueryAttention OP");
+    LOG_DEBUG("Decompose GroupQueryAttention OP");
     decompose_GQA(prefill_model, true);
     decompose_GQA(kvcache_model, false);
 
@@ -1127,11 +1128,11 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
     } else {
         LOG_DEBUG("Don't remove input key/values from prefill model.");
         LOG_DEBUG("Ask prefill model to output key/values for prefill chunk size tokens.");
-        prefill_model = redirect_new_kv_to_output(prefill_model);
+        prefill_model = redirect_new_kv_to_output(prefill_model, m_kvcache_desc.start_idx_in_outputs);
     }
 
     LOG_DEBUG("Optimize kvcache model to output key/values for new token.");
-    kvcache_model = redirect_new_kv_to_output(kvcache_model);
+    kvcache_model = redirect_new_kv_to_output(kvcache_model, m_kvcache_desc.start_idx_in_outputs);
     LOG_DEBUG("Converting KV-cache in kvcache model to FP16.");
     kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
     LOG_DEBUG("Converting KV-cache in prefill model to FP16.");

src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp

Lines changed: 5 additions & 0 deletions
@@ -25,6 +25,11 @@ class LLMCompiledModel : public ov::npuw::ICompiledModel {
         uint32_t num_stored_tokens = 0u;
         uint32_t dim = 0u;
         bool v_tensors_transposed = false;
+        // TODO: As of now, KV-cache blocks that are converted from states
+        // to model I/O are appended at the end of the original model's
+        // I/O, so it is safe to loop over the KV-cache I/O blocks using a
+        // single start offset.
+        std::size_t start_idx_in_outputs = 0u;
     };
 
     LLMCompiledModel(const std::shared_ptr<ov::Model>& model,
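KVCacheDesc is aggregate-initialized at the call site, so appending start_idx_in_outputs after v_tensors_transposed is why the .cpp above now has to spell out `false` explicitly before the new value. A condensed sketch of the field order (the first field's name is an assumption; the others appear in the diff):

#include <cstddef>
#include <cstdint>

struct KVCacheDesc {
    uint32_t max_prompt_size = 0u;          // assumed name: receives max_prompt_len
    uint32_t total_size = 0u;               // max_prompt_len + min_response_len
    uint32_t num_stored_tokens = 0u;
    uint32_t dim = 0u;                      // receives seq_len_dim
    bool v_tensors_transposed = false;
    std::size_t start_idx_in_outputs = 0u;  // original model's output count
};

// Aggregate initialization fills fields in declaration order, mirroring
// the updated call site in llm_compiled_model.cpp:
// KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u,
//             seq_len_dim, false, model->outputs().size()};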

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp

Lines changed: 27 additions & 32 deletions
@@ -180,8 +180,6 @@ void pad_position_ids(const ov::SoPtr<ov::ITensor>& padded_position_ids, const ov::SoPtr<ov::ITensor>& position_ids)
 
 constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1;
 
-constexpr std::size_t kStartOutputKVCacheLayers = 1;
-
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
@@ -192,6 +190,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
     }
     for (const auto& output_port : m_npuw_llm_compiled_model->outputs()) {
         init_tensor(output_port);
+        m_out_tensors[output_port] = get_tensor(output_port);
     }
 
     auto input_ids_port = find_port_by_name(compiled_model->m_prefill_compiled->inputs(), layer_names::input_ids);
@@ -203,8 +202,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
         m_input_ids_name = layer_names::inputs_embeds;
     }
 
-    m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
-    m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
+    m_kvcache_request = m_npuw_llm_compiled_model->m_kvcache_compiled->create_infer_request();
+    m_prefill_request = m_npuw_llm_compiled_model->m_prefill_compiled->create_infer_request();
 
     for (const auto& input_port : m_prefill_request->get_compiled_model()->inputs()) {
         m_prefill_in_ports.emplace(input_port.get_any_name(), input_port);
@@ -230,7 +229,6 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
         m_lm_head_request = compiled_model->m_lm_head_compiled->create_infer_request();
         OPENVINO_ASSERT(m_lm_head_request);
         const ov::Output<const ov::Node> lm_head_embed_port = m_lm_head_request->get_inputs()[0];
-        m_lm_head_logits_port = m_lm_head_request->get_outputs()[0];
         m_prefill_request->set_tensor(m_prefill_out_ports.at(layer_names::output_embeds),
                                       m_lm_head_request->get_tensor(lm_head_embed_port));
         m_kvcache_request->set_tensor(m_kvcache_out_ports.at(layer_names::output_embeds),
@@ -274,16 +272,13 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     const auto& kvcache_compiled = m_kvcache_request->get_compiled_model();
     // FIXME: Find only the outputs that match by name and copy them, after checking that such inputs exist
-    for (std::size_t i = kStartOutputKVCacheLayers; i < kvcache_compiled->outputs().size(); ++i) {
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < kvcache_compiled->outputs().size(); ++i) {
         const auto& output_name = kvcache_compiled->outputs()[i].get_any_name();
         auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name));
 
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values);
-        if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        NPUW_ASSERT(m_kvcache_in_ports.find(input_name) != m_kvcache_in_ports.end());
+
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
 
         const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
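The output-to-input name mapping above is a plain string substitution. A self-contained illustration of the same std::regex_replace call, using the conventional present/past_key_values KV layer naming (the concrete layer name is illustrative):

#include <iostream>
#include <regex>
#include <string>

int main() {
    // Same substitution as in copy_kvcache(): derive the input (past) layer
    // name from the output (present) layer name.
    const std::string output_name = "present.0.key";
    const std::string input_name =
        std::regex_replace(output_name, std::regex("present"), "past_key_values");
    std::cout << input_name << std::endl;  // prints: past_key_values.0.key
    return 0;
}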
@@ -353,14 +348,10 @@ void ov::npuw::LLMInferRequest::update_kvcache_for(
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     auto& compiled = request->get_compiled_model();
     // FIXME: Find only the outputs that match by name and copy them, after checking that such inputs exist
-    for (std::size_t i = kStartOutputKVCacheLayers; i < compiled->outputs().size(); ++i) {
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < compiled->outputs().size(); ++i) {
         const auto& output_name = compiled->outputs()[i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values);
-        if (in_ports.find(input_name) == in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        OPENVINO_ASSERT(in_ports.find(input_name) != in_ports.end());
         auto dst_tensor = request->get_tensor(in_ports.at(input_name));
         const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
                                  ? 3u
@@ -378,14 +369,11 @@ void ov::npuw::LLMInferRequest::update_kvcache_for(
 void ov::npuw::LLMInferRequest::clear_chunk_prefill_kv_cache() {
     const auto& prefill_compiled = m_prefill_request->get_compiled_model();
 
-    for (std::size_t i = kStartOutputKVCacheLayers; i < prefill_compiled->outputs().size(); ++i) {
+    auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < prefill_compiled->outputs().size(); ++i) {
         const auto& output_name = prefill_compiled->outputs()[i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
-        if (m_prefill_in_ports.find(input_name) == m_prefill_in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        OPENVINO_ASSERT(m_prefill_in_ports.find(input_name) != m_prefill_in_ports.end());
 
         auto chunk_prefill_kvcache_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
@@ -504,6 +492,17 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input_ids,
     LOG_DEBUG("Done");
 }
 
+void ov::npuw::LLMInferRequest::update_out_tensors_from(std::shared_ptr<ov::IAsyncInferRequest> request) {
+    auto orig_outputs = m_npuw_llm_compiled_model->outputs();
+    auto request_outputs = request->get_outputs();
+    // FIXME: We rely here on a strong assumption that all outputs are
+    //        ordered the same way between m_npuw_llm_compiled_model and
+    //        the model of the passed request.
+    for (std::size_t idx = 0; idx < orig_outputs.size(); ++idx) {
+        m_out_tensors[orig_outputs[idx]] = request->get_tensor(request_outputs[idx]);
+    }
+}
+
 void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
                                               ov::SoPtr<ov::ITensor> attention_mask,
                                               ov::SoPtr<ov::ITensor> position_ids) {
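The FIXME above flags a positional-ordering assumption between the two output lists. A hedged sketch of a name-based alternative (hypothetical, not in the commit; it relies on the std::hash specialization this commit adds to llm_infer_request.hpp so that ov::Output<const ov::Node> can key an unordered_map):

#include <memory>
#include <unordered_map>
#include <vector>

#include "openvino/runtime/iasync_infer_request.hpp"

// Hypothetical variant of update_out_tensors_from(): match outputs by name
// rather than by position, which would hold even if the compiled model and
// the request's model ordered their outputs differently.
void update_out_tensors_by_name(const std::vector<ov::Output<const ov::Node>>& orig_outputs,
                                const std::shared_ptr<ov::IAsyncInferRequest>& request,
                                std::unordered_map<ov::Output<const ov::Node>, ov::SoPtr<ov::ITensor>>& out_tensors) {
    for (const auto& orig_out : orig_outputs) {
        for (const auto& req_out : request->get_outputs()) {
            if (req_out.get_names().count(orig_out.get_any_name()) != 0) {
                out_tensors[orig_out] = request->get_tensor(req_out);
                break;
            }
        }
    }
}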
@@ -531,9 +530,9 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
     if (m_lm_head_request) {
         LOG_DEBUG("Calling inference for LM head model.");
         m_lm_head_request->infer();
-        m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
+        update_out_tensors_from(m_lm_head_request);
     } else {
-        m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
+        update_out_tensors_from(m_prefill_request);
     }
 
     m_generate_initialized = false;
@@ -594,13 +593,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         m_lm_head_request->wait();
         LOG_DEBUG("Calling inference for LM head model -- done.");
 
-        m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
+        update_out_tensors_from(m_lm_head_request);
     } else {
         if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
             update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1);
         }
 
-        m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
+        update_out_tensors_from(m_kvcache_request);
     }
 
     LOG_DEBUG("Done");
@@ -630,9 +629,5 @@ void ov::npuw::LLMInferRequest::infer() {
 }
 
 ov::SoPtr<ov::ITensor> ov::npuw::LLMInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
-    // NB: If asked for logits...
-    if (port == get_outputs()[0]) {
-        return m_logits;
-    }
-    return ov::ISyncInferRequest::get_tensor(port);
+    return m_out_tensors.at(port);
 }
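Net effect of the m_out_tensors map: get_tensor() now serves every output of the original model, not only logits. A caller-side sketch (the model path is hypothetical; input setup is elided):

#include <iostream>

#include "openvino/openvino.hpp"

int main() {
    ov::Core core;
    auto compiled = core.compile_model("llm.xml", "NPU");  // hypothetical model path
    auto request = compiled.create_infer_request();
    // ... set input_ids / attention_mask / position_ids here ...
    request.infer();
    // Previously only outputs()[0] (logits) was backed by a real tensor;
    // now every original output can be read back.
    for (const auto& out : compiled.outputs()) {
        std::cout << out.get_any_name() << " -> " << request.get_tensor(out).get_shape() << std::endl;
    }
    return 0;
}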

src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp

Lines changed: 19 additions & 2 deletions
@@ -10,6 +10,22 @@
 #include "openvino/core/descriptor/output.hpp"
 #include "openvino/runtime/isync_infer_request.hpp"
 
+template <>
+struct std::hash<ov::Output<const ov::Node>> {
+    std::size_t operator()(const ov::Output<const ov::Node>& port) const {
+        using std::hash;
+        using std::size_t;
+        using std::string;
+
+        // Compute individual hash values for the port index, the node
+        // pointer and the name, and combine them using XOR and bit
+        // shifting:
+        return ((hash<std::size_t>()(port.get_index()) ^ (hash<const ov::Node*>()(port.get_node()) << 1)) >> 1) ^
+               (hash<std::string>()(port.get_any_name()) << 1);
+    }
+};
+
 namespace ov {
 namespace npuw {
 
@@ -60,6 +76,8 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
                        ov::SoPtr<ov::ITensor> attention_mask,
                        ov::SoPtr<ov::ITensor> position_ids);
 
+    void update_out_tensors_from(std::shared_ptr<ov::IAsyncInferRequest> request);
+
     void infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
                        ov::SoPtr<ov::ITensor> attention_mask,
                        ov::SoPtr<ov::ITensor> position_ids);
@@ -73,16 +91,15 @@ class LLMInferRequest final : public ov::ISyncInferRequest {
     // This infer request is optional, so can be null.
     std::shared_ptr<ov::IAsyncInferRequest> m_lm_head_request;
     std::shared_ptr<LLMCompiledModel> m_npuw_llm_compiled_model;
-    ov::SoPtr<ov::ITensor> m_logits;
 
     std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_in_ports;
     std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_out_ports;
     std::unordered_map<std::string, ov::Output<const ov::Node>> m_kvcache_in_ports;
     std::unordered_map<std::string, ov::Output<const ov::Node>> m_kvcache_out_ports;
-    ov::Output<const ov::Node> m_lm_head_logits_port;
 
     // NB: It can be either input_ids (LLM) or inputs_embeds (VLM)
     std::string m_input_ids_name;
+    std::unordered_map<ov::Output<const ov::Node>, ov::SoPtr<ov::ITensor>> m_out_tensors;
 
     bool m_generate_initialized = false;
 };
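The std::hash specialization above exists so that ov::Output<const ov::Node> can key the new m_out_tensors map. A standalone, compilable illustration of the same hash-combine pattern on a simplified port type (Port is a stand-in, not an OpenVINO type):

#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>

struct Port {
    std::size_t index;
    const void* node;
    std::string name;
    bool operator==(const Port& other) const {
        return index == other.index && node == other.node && name == other.name;
    }
};

// Same combine scheme as in the header: hash each member, then mix the
// results with XOR and bit shifts so equal ports hash equally and distinct
// ports rarely collide.
template <>
struct std::hash<Port> {
    std::size_t operator()(const Port& p) const {
        return ((std::hash<std::size_t>()(p.index) ^ (std::hash<const void*>()(p.node) << 1)) >> 1) ^
               (std::hash<std::string>()(p.name) << 1);
    }
};

int main() {
    std::unordered_map<Port, int> out_tensors;  // analogous to m_out_tensors
    out_tensors[{0, nullptr, "logits"}] = 42;
    return out_tensors.at({0, nullptr, "logits"}) == 42 ? 0 : 1;
}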
