Debug qwen #1537

Draft
wants to merge 6 commits into base: master
9 changes: 5 additions & 4 deletions samples/python/text_generation/chat_sample.py
@@ -18,21 +18,22 @@ def main():
parser.add_argument('model_dir')
args = parser.parse_args()

device = 'CPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device)
device = 'NPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device,
{ "STATIC_PIPELINE" : "STATEFUL", "NPUW_DEVICES" : "CPU" })

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
# pipe.start_chat()
while True:
try:
prompt = input('question:\n')
except EOFError:
break
pipe.generate(prompt, config, streamer)
print('\n----------')
pipe.finish_chat()
# pipe.finish_chat()


if '__main__' == __name__:
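The sample change above pins the device to NPU and, through NPUW, delegates the static stateful pipeline to CPU, which is the path the rest of this PR debugs. A minimal C++ sketch of the same configuration, assuming only the property names that appear in this diff (the model path is a placeholder):

```cpp
#include <iostream>
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Same debug properties as in the Python sample above.
    ov::AnyMap props{{"STATIC_PIPELINE", "STATEFUL"}, {"NPUW_DEVICES", "CPU"}};
    ov::genai::LLMPipeline pipe("/path/to/model_dir", "NPU", props);  // placeholder path

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;

    std::string answer = pipe.generate("question: what is OpenVINO?", config);
    std::cout << answer << '\n';
    return 0;
}
```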
172 changes: 142 additions & 30 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -29,7 +29,41 @@
#include "json_utils.hpp"
#include "utils.hpp"

#define PRINT_VAR(var) std::cout << #var << " : " << var << std::endl;

namespace {
void dump_tensor(const ov::Tensor& input, const std::string& base_path) {
ov::Tensor tensor;

if (input.is_continuous()) {
tensor = input;
} else {
// Create temporary tensor and copy data in. Dumping is never fast, anyway
tensor = ov::Tensor(input.get_element_type(), input.get_shape());
input.copy_to(tensor);
}

const auto bin_path = base_path + ".bin";
{
std::ofstream bin_file(bin_path, std::ios_base::out | std::ios_base::binary);
bin_file.write(static_cast<const char*>(tensor.data()), static_cast<std::streamsize>(tensor.get_byte_size()));
}
const auto meta_path = base_path + ".txt";
{
std::ofstream meta_file(meta_path);
meta_file << tensor.get_element_type() << ' ' << tensor.get_shape() << std::endl;
}
}

void dump_config(const std::string& name, const ov::AnyMap& config) {
std::cout << name << std::endl;
for (auto&& [key, value] : config) {
std::cout << key << " : ";
value.print(std::cout);
std::cout << ", ";
}
std::cout << std::endl;
}

namespace opp = ov::pass::pattern;
class TransposeValueTensors : public ov::pass::MatcherPass {
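The PRINT_VAR macro and the dump_tensor / dump_config helpers added above are this PR's main debugging hooks. A usage sketch, assuming an existing ov::InferRequest named request and an ov::AnyMap named prefill_config:

```cpp
// Dump an output tensor: writes logits.bin (raw bytes) and logits.txt (element type and shape).
ov::Tensor logits = request.get_tensor("logits");
dump_tensor(logits, "logits");

// Print every key/value pair of a config map on one line.
dump_config("Prefill config", prefill_config);

// Print "logits_size : <value>" to stdout.
const auto logits_size = logits.get_size();
PRINT_VAR(logits_size);
```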
@@ -686,14 +720,16 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& config
) : LLMPipelineImplBase(tokenizer,
utils::from_config_json_if_exists(models_path)) {
utils::from_config_json_if_exists(models_path)),
m_sampler(m_tokenizer) {

auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;

auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}


@@ -704,10 +740,12 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config) {
) : LLMPipelineImplBase(tokenizer, generation_config),
m_sampler(m_tokenizer) {
ov::AnyMap properties_copy = properties;
auto compiled = setupAndCompileModel(model, model_desc, properties_copy);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
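Both StatefulLLMPipeline constructors now build an ov::genai::Sampler member and seed it from the generation config, which is what enables multinomial decoding later in generate(). A config that would take the sampling path might look like this (a sketch; the values are illustrative):

```cpp
ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.do_sample   = true;   // multinomial decoding instead of greedy
config.temperature = 0.7f;
config.top_p       = 0.9f;
config.rng_seed    = 42;     // forwarded to the sampler via m_sampler.set_seed(...)
```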
@@ -717,6 +755,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_max_prompt_len = kMaxPromptLen;
m_kvcache_total = kMaxPromptLen + kMinResponseLen;

update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
@@ -725,6 +764,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
KVAxesPosition axes = get_kv_axes(model_desc.type);
update_config(pipeline_config, {"NPUW_LLM_BATCH_DIM", axes.batch});
update_config(pipeline_config, {"NPUW_LLM_SEQ_LEN_DIM", axes.seq_len});
pipeline_config["NPUW_LLM_PAD_TOKEN_ID"] = m_tokenizer.get_pad_token_id();

update_config(pipeline_config, {"NPUW_LLM_MAX_PROMPT_LEN", kMaxPromptLen});
update_config(pipeline_config, {"NPUW_LLM_MIN_RESPONSE_LEN", kMinResponseLen});
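For reference, the user-facing knobs consumed in this function and the NPUW options it derives from them, listing only keys that appear in this diff (values are illustrative):

```cpp
ov::AnyMap pipeline_config{
    {"MAX_PROMPT_LEN",   1024u},  // popped into kMaxPromptLen (defaults to 1024)
    {"MIN_RESPONSE_LEN", 128u},   // popped into kMinResponseLen (defaults to 128)
};
// setupAndCompileModel() then adds, among others:
//   NPU_USE_NPUW            = "YES"
//   NPUW_LLM_BATCH_DIM      / NPUW_LLM_SEQ_LEN_DIM   (from the model's KV axes)
//   NPUW_LLM_MAX_PROMPT_LEN / NPUW_LLM_MIN_RESPONSE_LEN
//   NPUW_LLM_PAD_TOKEN_ID   = m_tokenizer.get_pad_token_id()   (new in this PR)
```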
@@ -744,7 +784,9 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);

return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
auto comp_model = genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config);
ov::genai::utils::print_compiled_model_properties(comp_model, "Stateful LLM NPU Compiled Model");
return std::make_shared<ov::CompiledModel>(comp_model);
}

DecodedResults StatefulLLMPipeline::generate(
@@ -817,7 +859,9 @@ EncodedResults StatefulLLMPipeline::generate(
attention_mask = data->attention_mask;
}

OPENVINO_ASSERT(input_ids.get_shape().at(0) == 1u, "Currently only batch size=1 is supported");
ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
OPENVINO_ASSERT(batch_size == 1u, "Currently only batch size=1 is supported");

GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
// If eos_token_id was not provided, take value from default m_generation_config
@@ -834,49 +878,91 @@ EncodedResults StatefulLLMPipeline::generate(
streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
}

OPENVINO_ASSERT(config.is_greedy_decoding(), "Currently only greedy decoding is supported");
OPENVINO_ASSERT(config.is_greedy_decoding() || config.is_multinomial(),
"Currently only greedy and multinomial decoding are supported");

OPENVINO_ASSERT(config.num_return_sequences == 1u,
"Currently only \"num_return_sequences\" equal to 1 is supported!");

ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
ov::genai::EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
// NB: Only batch=1 is supported now
results.scores.resize(1u);
results.scores[0] = 0u;
results.tokens.resize(1u);

// TODO: Check if there is enough space in KV-cache to process input prompt
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_max_prompt_len) {
OPENVINO_THROW("Static Stateful LLM pipeline may only process prompts up to "
+ std::to_string(m_max_prompt_len) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}

ov::Tensor position_ids{ov::element::i64, input_ids.get_shape()};
utils::initialize_position_ids(position_ids, attention_mask);

ov::genai::utils::print_tensor(input_ids);
ov::genai::utils::print_tensor(attention_mask);
ov::genai::utils::print_tensor(position_ids);
m_request.set_tensor("input_ids", input_ids);
m_request.set_tensor("attention_mask", attention_mask);
m_request.set_tensor("position_ids", position_ids);

m_request.infer();

int64_t last_token = utils::argmax(m_request.get_tensor("logits"), 0);

results.tokens[0].push_back(last_token);
if (streamer_ptr && streamer_ptr->put(last_token)) {
return results;
auto padded_logits = m_request.get_tensor("logits");
//dump_tensor(padded_logits, "stateful_padded_logits");
auto last_token = utils::argmax(padded_logits, 0);
PRINT_VAR(last_token);
// FIXME: This is a workaround to keep only the useful part of the returned logits.
// If SliceOut is applied, only one useful logit is returned and nothing is
// required here. Otherwise the model returns logits for the full context
// length, as the prefill model is internally reshaped to produce them.
// The proper fix belongs on the OpenVINO side: the model should return only
// the useful logits for the input prompt length, dropping the
// implementation-related padding ones.
auto logits = padded_logits;
auto padded_sequence_len = padded_logits.get_shape()[1];
PRINT_VAR(padded_sequence_len);
if (padded_sequence_len > 1) {
// If SliceOut is not applied:
logits = make_tensor_slice(padded_logits, 1, padded_sequence_len - prompt_len, padded_sequence_len);
}
int64_t output_sequence_len = logits.get_shape().at(1);
PRINT_VAR(output_sequence_len);
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
sequence_group->schedule_tokens(output_sequence_len);

// NB: Controls what tokens are ready to be pushed into the streamer
GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());

SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);

int64_t input_ids_data = -1;
int64_t position_ids_data = prompt_len - 1;
std::vector<int64_t> attention_mask_data(prompt_len - 1, 1);
m_request.set_tensor("input_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&input_ids_data)));
m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&position_ids_data)));

const size_t max_tokens = config.get_max_new_tokens(prompt_len);
for (int i = 0; i < max_tokens - 1; ++i) {
while (sequence_group->is_running()) {
// KV Cache is full, no further generation is possible
if (position_ids_data + 1 == m_kvcache_total) {
sequence_group->set_out_of_memory();
break;
}

sequence_group->schedule_tokens(1);
const auto running_sequences = sequence_group->get_running_sequences();
OPENVINO_ASSERT(running_sequences.size() == 1u);
auto last_token = running_sequences.front()->get_generated_ids().back();
PRINT_VAR(last_token);

// Just change the variables here, as pointers to them are already set to corresponding tensors
input_ids_data = last_token;
++position_ids_data;
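The FIXME block above is the core of the stateful prefill change: without SliceOut the model returns logits for the whole padded context, and only the last prompt_len positions are meaningful. Reduced to its essentials, using the same helpers and variables as the diff (a sketch):

```cpp
// Padded logits shape: [1, padded_seq_len, vocab_size].
ov::Tensor padded_logits = m_request.get_tensor("logits");
const size_t padded_seq_len = padded_logits.get_shape()[1];

// Keep only the last prompt_len positions; if SliceOut already reduced the
// output to a single position, use the tensor as is.
ov::Tensor logits = (padded_seq_len > 1)
    ? make_tensor_slice(padded_logits, 1 /*dim*/, padded_seq_len - prompt_len, padded_seq_len)
    : padded_logits;

// The SequenceGroup is then told that prompt_len - output_sequence_len tokens
// are already processed and that output_sequence_len tokens are scheduled,
// so the Sampler sees exactly the useful tail of the prompt.
```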
@@ -886,24 +972,29 @@ EncodedResults StatefulLLMPipeline::generate(

m_request.infer();

last_token = utils::argmax(m_request.get_tensor("logits"), 0);
results.tokens[0].push_back(last_token);

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
if (streamer_ptr && streamer_ptr->put(last_token)) {
break;
}

if (last_token == config.eos_token_id && !config.ignore_eos) {
break;
}
auto len = m_request.get_tensor("logits").get_shape()[1];
PRINT_VAR(len);
auto new_token = utils::argmax(m_request.get_tensor("logits"), 0);
PRINT_VAR(new_token);

SamplerOutput sampler_output = m_sampler.sample(
{sequence_group}, m_request.get_tensor("logits"));
stream_generated_tokens(streamer_ptr, handle);
}

if (streamer_ptr) {
streamer_ptr->end();
}

OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u);
auto sequence = sequence_group->get_finished_sequences().front();
results.tokens[0] = sequence->get_generated_ids();
results.scores[0] = sequence->get_cumulative_log_prob();
m_sampler.clear_request_info(sequence_group->get_request_id());

auto stop_time = std::chrono::steady_clock::now();
// If this is called without tokenization, that stat will not be reported.
auto& metrics = results.perf_metrics;
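After prefill, every further token follows the same loop; stripped of the debug prints it is essentially (a sketch mirroring the code above, attention-mask growth omitted):

```cpp
while (sequence_group->is_running()) {
    if (position_ids_data + 1 == m_kvcache_total) {   // KV-cache exhausted
        sequence_group->set_out_of_memory();
        break;
    }
    sequence_group->schedule_tokens(1);               // exactly one new token per step

    // Feed back the last sampled token, then run the kvcache model.
    input_ids_data = sequence_group->get_running_sequences().front()->get_generated_ids().back();
    ++position_ids_data;
    m_request.infer();

    m_sampler.sample({sequence_group}, m_request.get_tensor("logits"));
    stream_generated_tokens(streamer_ptr, handle);    // pushes ready tokens to the streamer
}
```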
@@ -1028,8 +1119,8 @@ void StatelessLLMPipeline::setupAndCompileModels(
auto kvcache_model = model;
// (2) Expose KV-cache input and output layers from kvcache model
ov::pass::StatefulToStateless().run_on_model(kvcache_model);
// (3) Align u4 ZP constants
align_u4_zp_constants(kvcache_model);
// // (3) Align u4 ZP constants
// align_u4_zp_constants(kvcache_model);
// (4) Clone the model - this will be prefill
auto prefill_model = kvcache_model->clone();
prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");
@@ -1071,15 +1162,17 @@ void StatelessLLMPipeline::setupAndCompileModels(
set_npuw_cache_dir(prefill_config);
set_npuw_cache_dir(generate_config);

dump_config("Stateless: Generate config", generate_config);
auto kv_compiled_model = core.compile_model(
kvcache_model, device, generate_config
);
ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Static LLM kv compiled model");
ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Stateless LLM kv compiled model");
m_kvcache_request = kv_compiled_model.create_infer_request();

dump_config("Stateless: Prefill config", prefill_config);
auto prefill_compiled_model = core.compile_model(prefill_model, device, prefill_config);
m_prefill_request = prefill_compiled_model.create_infer_request();
ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Static LLM prefill compiled model");
ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Stateless LLM prefill compiled model");
}

void StatelessLLMPipeline::setupAndImportModels(
@@ -1288,7 +1381,7 @@ EncodedResults StatelessLLMPipeline::generate(
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_kvcache_desc.max_prompt_size) {
OPENVINO_THROW("Static LLM pipeline may only process prompts up to "
OPENVINO_THROW("Static Stateless LLM pipeline may only process prompts up to "
+ std::to_string(m_kvcache_desc.max_prompt_size) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}
@@ -1297,16 +1390,22 @@ EncodedResults StatelessLLMPipeline::generate(
// but if continuation is needed, prompt contains information about the entire conversation.
prepare_for_new_conversation();

ov::genai::utils::print_tensor(input_ids);

auto padded_input_ids = m_prefill_request.get_tensor("input_ids");
const size_t offset = padded_input_ids.get_size() - input_ids.get_size();
copy_with_offset(input_ids, offset, padded_input_ids);

auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask");
fill_tensor<int64_t>(padded_attention_mask, 1u, offset);
ov::genai::utils::print_tensor(make_tensor_slice(padded_attention_mask, 1, offset, padded_attention_mask.get_size()));

auto padded_position_ids = m_prefill_request.get_tensor("position_ids");
auto padded_inputs_len = padded_input_ids.get_size();
PRINT_VAR(padded_inputs_len);
auto* padded_pos_data = padded_position_ids.data<int64_t>();
std::iota(padded_pos_data + offset, padded_pos_data + padded_position_ids.get_size(), 0u);
ov::genai::utils::print_tensor(make_tensor_slice(padded_position_ids, 1, offset, padded_position_ids.get_size()));

m_prefill_request.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
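The stateless prefill keeps the prompt right-aligned inside the fixed-size inputs: tokens are copied at an offset from the end, the attention mask is 1 only over that tail, and position ids count from 0 there. Schematically, with the helper names used above (a sketch):

```cpp
// Layout for a prompt of n tokens inside a MAX_PROMPT_LEN-sized input:
//   padded_input_ids : [ pad ... pad | t0  t1 ... t(n-1) ]
//   attention_mask   : [  0  ...  0  |  1   1 ...   1    ]
//   position_ids     : [  .  ...  .  |  0   1 ...  n-1   ]
const size_t offset = padded_input_ids.get_size() - input_ids.get_size();
copy_with_offset(input_ids, offset, padded_input_ids);
fill_tensor<int64_t>(padded_attention_mask, 1u, offset);
auto* pos = padded_position_ids.data<int64_t>();
std::iota(pos + offset, pos + padded_position_ids.get_size(), 0u);
```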
@@ -1316,8 +1415,13 @@ EncodedResults StatelessLLMPipeline::generate(
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len);

auto logits = m_prefill_request.get_tensor("logits");
//dump_tensor(logits, "stateless_padded_logits");
auto last_token = utils::argmax(logits, 0);
PRINT_VAR(last_token);
int64_t output_sequence_len = logits.get_shape().at(1);

PRINT_VAR(output_sequence_len);
// TODO: Pass input_ids to say that there is room for generation.
// Retrieve only the useful logits and work only with them here.
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
@@ -1377,13 +1481,21 @@ EncodedResults StatelessLLMPipeline::generate(
input_ids_data[0] = running_sequences.front()->get_generated_ids().back();
position_ids_data[0] = m_kvcache_desc.num_stored_tokens;
attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u;
auto last_token = running_sequences.front()->get_generated_ids().back();
PRINT_VAR(last_token);

m_kvcache_request.infer();
m_kvcache_desc.num_stored_tokens += 1;

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

auto len = m_kvcache_request.get_tensor("logits").get_shape()[1];
PRINT_VAR(len);
auto new_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
PRINT_VAR(new_token);

std::cout << std::endl;
SamplerOutput sampler_output = m_sampler.sample(
{sequence_group}, m_kvcache_request.get_tensor("logits"));
stream_generated_tokens(streamer_ptr, handle);
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -75,8 +75,12 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
void finish_chat() override;

private:
uint32_t m_max_prompt_len = 0u;
uint32_t m_kvcache_total = 0u;
ov::InferRequest m_request;

Sampler m_sampler;

bool m_is_chat_conversation = false;
ChatHistory m_history;
};