@@ -180,8 +180,6 @@ void pad_position_ids(const ov::SoPtr<ov::ITensor>& padded_position_ids, const o
 
 constexpr uint32_t INPUT_IDS_SEQ_LEN_DIM = 1;
 
-constexpr std::size_t kStartOutputKVCacheLayers = 1;
-
 }  // anonymous namespace
 
 ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model)
@@ -192,6 +190,7 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
     }
     for (const auto& output_port : m_npuw_llm_compiled_model->outputs()) {
         init_tensor(output_port);
+        m_out_tensors[output_port] = get_tensor(output_port);
     }
 
     auto input_ids_port = find_port_by_name(compiled_model->m_prefill_compiled->inputs(), layer_names::input_ids);
@@ -203,8 +202,8 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
         m_input_ids_name = layer_names::inputs_embeds;
     }
 
-    m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request();
-    m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request();
+    m_kvcache_request = m_npuw_llm_compiled_model->m_kvcache_compiled->create_infer_request();
+    m_prefill_request = m_npuw_llm_compiled_model->m_prefill_compiled->create_infer_request();
 
     for (const auto& input_port : m_prefill_request->get_compiled_model()->inputs()) {
         m_prefill_in_ports.emplace(input_port.get_any_name(), input_port);
@@ -230,7 +229,6 @@ ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCo
         m_lm_head_request = compiled_model->m_lm_head_compiled->create_infer_request();
         OPENVINO_ASSERT(m_lm_head_request);
         const ov::Output<const ov::Node> lm_head_embed_port = m_lm_head_request->get_inputs()[0];
-        m_lm_head_logits_port = m_lm_head_request->get_outputs()[0];
         m_prefill_request->set_tensor(m_prefill_out_ports.at(layer_names::output_embeds),
                                       m_lm_head_request->get_tensor(lm_head_embed_port));
         m_kvcache_request->set_tensor(m_kvcache_out_ports.at(layer_names::output_embeds),
@@ -274,16 +272,13 @@ void ov::npuw::LLMInferRequest::copy_kvcache() {
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     const auto& kvcache_compiled = m_kvcache_request->get_compiled_model();
     // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist
-    for (std::size_t i = kStartOutputKVCacheLayers; i < kvcache_compiled->outputs().size(); ++i) {
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < kvcache_compiled->outputs().size(); ++i) {
         const auto& output_name = kvcache_compiled->outputs()[i].get_any_name();
         auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name));
 
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values);
-        if (m_kvcache_in_ports.find(input_name) == m_kvcache_in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        NPUW_ASSERT(m_kvcache_in_ports.find(input_name) != m_kvcache_in_ports.end());
+
         auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name));
 
         const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
@@ -353,14 +348,10 @@ void ov::npuw::LLMInferRequest::update_kvcache_for(
     auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
     auto& compiled = request->get_compiled_model();
     // FIXME: Find only matching by names outputs and copy them, having previously checked that such inputs exist
-    for (std::size_t i = kStartOutputKVCacheLayers; i < compiled->outputs().size(); ++i) {
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < compiled->outputs().size(); ++i) {
         const auto& output_name = compiled->outputs()[i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), layer_names::past_key_values);
-        if (in_ports.find(input_name) == in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        OPENVINO_ASSERT(in_ports.find(input_name) != in_ports.end());
         auto dst_tensor = request->get_tensor(in_ports.at(input_name));
         const auto& kv_dim = (output_name.find("value") != std::string::npos && kvcache_desc.v_tensors_transposed)
                                  ? 3u
@@ -378,14 +369,11 @@ void ov::npuw::LLMInferRequest::update_kvcache_for(
 void ov::npuw::LLMInferRequest::clear_chunk_prefill_kv_cache() {
     const auto& prefill_compiled = m_prefill_request->get_compiled_model();
 
-    for (std::size_t i = kStartOutputKVCacheLayers; i < prefill_compiled->outputs().size(); ++i) {
+    auto& kvcache_desc = m_npuw_llm_compiled_model->m_kvcache_desc;
+    for (std::size_t i = kvcache_desc.start_idx_in_outputs; i < prefill_compiled->outputs().size(); ++i) {
         const auto& output_name = prefill_compiled->outputs()[i].get_any_name();
         const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values");
-        if (m_prefill_in_ports.find(input_name) == m_prefill_in_ports.end()) {
-            // FIXME: Totally wrong debug message. input_name is an invalid name of input layer.
-            LOG_DEBUG("Input name " << input_name << " doesn't contain kv cache. Skipping.");
-            continue;
-        }
+        OPENVINO_ASSERT(m_prefill_in_ports.find(input_name) != m_prefill_in_ports.end());
 
         auto chunk_prefill_kvcache_in_tensor = m_prefill_request->get_tensor(m_prefill_in_ports.at(input_name));
@@ -504,6 +492,17 @@ void ov::npuw::LLMInferRequest::infer_whole_prefill(ov::SoPtr<ov::ITensor> input
     LOG_DEBUG("Done");
 }
 
+void ov::npuw::LLMInferRequest::update_out_tensors_from(std::shared_ptr<ov::IAsyncInferRequest> request) {
+    auto orig_outputs = m_npuw_llm_compiled_model->outputs();
+    auto request_outputs = request->get_outputs();
+    // FIXME: We rely here on a strong assumption that all outputs are
+    //        ordered the same way between m_npuw_llm_compiled_model
+    //        and the model of the passed request.
+    for (std::size_t idx = 0; idx < orig_outputs.size(); ++idx) {
+        m_out_tensors[orig_outputs[idx]] = request->get_tensor(request_outputs[idx]);
+    }
+}
+
 void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
                                               ov::SoPtr<ov::ITensor> attention_mask,
                                               ov::SoPtr<ov::ITensor> position_ids) {
@@ -531,9 +530,9 @@ void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids,
     if (m_lm_head_request) {
         LOG_DEBUG("Calling inference for LM head model.");
         m_lm_head_request->infer();
-        m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
+        update_out_tensors_from(m_lm_head_request);
     } else {
-        m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at(layer_names::logits));
+        update_out_tensors_from(m_prefill_request);
     }
 
     m_generate_initialized = false;
@@ -594,13 +593,13 @@ void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids,
         m_lm_head_request->wait();
         LOG_DEBUG("Calling inference for LM head model -- done.");
 
-        m_logits = m_lm_head_request->get_tensor(m_lm_head_logits_port);
+        update_out_tensors_from(m_lm_head_request);
     } else {
         if (kvcache_desc.num_stored_tokens < kvcache_desc.total_size) {
             update_kvcache_for(m_kvcache_request, m_kvcache_in_ports, m_kvcache_out_ports, 1);
         }
 
-        m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(layer_names::logits));
+        update_out_tensors_from(m_kvcache_request);
     }
 
     LOG_DEBUG("Done");
@@ -630,9 +629,5 @@ void ov::npuw::LLMInferRequest::infer() {
 }
 
 ov::SoPtr<ov::ITensor> ov::npuw::LLMInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const {
-    // NB: If asked for logits...
-    if (port == get_outputs()[0]) {
-        return m_logits;
-    }
-    return ov::ISyncInferRequest::get_tensor(port);
+    return m_out_tensors.at(port);
 }
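
Note: the net effect of this diff is that the single m_logits tensor is replaced by an m_out_tensors map keyed by output port, refreshed via update_out_tensors_from() from whichever sub-request (prefill, kvcache, or LM head) ran last, so get_tensor() can serve every declared output rather than only logits. The standalone sketch below illustrates that routing pattern only; Tensor, Port, SubRequest, and RoutingRequest are simplified stand-in types invented for illustration, not the real ov:: interfaces.

#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

// Simplified stand-ins for ports, tensors, and sub-requests.
struct Tensor { std::string data; };
using Port = std::string;

struct SubRequest {
    std::vector<Port> outputs;                        // outputs of this sub-model
    std::map<Port, std::shared_ptr<Tensor>> tensors;  // tensors it produced
};

class RoutingRequest {
public:
    explicit RoutingRequest(std::vector<Port> outputs) : m_outputs(std::move(outputs)) {}

    // Analogue of update_out_tensors_from(): assumes the sub-request's outputs are
    // ordered the same way as the composite model's outputs.
    void update_out_tensors_from(const SubRequest& request) {
        assert(request.outputs.size() == m_outputs.size());
        for (std::size_t idx = 0; idx < m_outputs.size(); ++idx) {
            m_out_tensors[m_outputs[idx]] = request.tensors.at(request.outputs[idx]);
        }
    }

    // Analogue of get_tensor(): any output port resolves through the map,
    // not just the logits output.
    std::shared_ptr<Tensor> get_tensor(const Port& port) const {
        return m_out_tensors.at(port);
    }

private:
    std::vector<Port> m_outputs;
    std::map<Port, std::shared_ptr<Tensor>> m_out_tensors;
};

int main() {
    RoutingRequest composite({"logits", "present.0.key"});

    // Pretend the decode-stage sub-request just finished inference.
    SubRequest kvcache{{"logits", "present.0.key"}, {}};
    kvcache.tensors["logits"] = std::make_shared<Tensor>(Tensor{"decode-step logits"});
    kvcache.tensors["present.0.key"] = std::make_shared<Tensor>(Tensor{"kv slice"});

    composite.update_out_tensors_from(kvcache);
    return composite.get_tensor("logits")->data.empty() ? 1 : 0;
}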