Debug qwen #1537

Draft
wants to merge 6 commits into base: master
9 changes: 5 additions & 4 deletions samples/python/text_generation/chat_sample.py
@@ -18,21 +18,22 @@ def main():
parser.add_argument('model_dir')
args = parser.parse_args()

device = 'CPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device)
device = 'NPU' # GPU can be used as well
pipe = openvino_genai.LLMPipeline(args.model_dir, device,
{ "STATIC_PIPELINE" : "STATEFUL", "NPUW_DEVICES" : "CPU" })

config = openvino_genai.GenerationConfig()
config.max_new_tokens = 100

pipe.start_chat()
# pipe.start_chat()
while True:
try:
prompt = input('question:\n')
except EOFError:
break
pipe.generate(prompt, config, streamer)
print('\n----------')
pipe.finish_chat()
# pipe.finish_chat()


if '__main__' == __name__:
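The sample change above pins the device to NPU and, through NPUW, delegates the static stateful pipeline to CPU, which is the path the rest of this PR debugs. A minimal C++ sketch of the same configuration, assuming only the property names that appear in this diff (the model path is a placeholder):

```cpp
#include <iostream>
#include <string>
#include "openvino/genai/llm_pipeline.hpp"

int main() {
    // Same debug properties as in the Python sample above.
    ov::AnyMap props{{"STATIC_PIPELINE", "STATEFUL"}, {"NPUW_DEVICES", "CPU"}};
    ov::genai::LLMPipeline pipe("/path/to/model_dir", "NPU", props);  // placeholder path

    ov::genai::GenerationConfig config;
    config.max_new_tokens = 100;

    std::string answer = pipe.generate("question: what is OpenVINO?", config);
    std::cout << answer << '\n';
    return 0;
}
```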
172 changes: 142 additions & 30 deletions src/cpp/src/llm_pipeline_static.cpp
@@ -29,7 +29,41 @@
#include "json_utils.hpp"
#include "utils.hpp"

#define PRINT_VAR(var) std::cout << #var << " : " << var << std::endl;

namespace {
void dump_tensor(const ov::Tensor& input, const std::string& base_path) {
ov::Tensor tensor;

if (input.is_continuous()) {
tensor = input;
} else {
// Create temporary tensor and copy data in. Dumping is never fast, anyway
tensor = ov::Tensor(input.get_element_type(), input.get_shape());
input.copy_to(tensor);
}

const auto bin_path = base_path + ".bin";
{
std::ofstream bin_file(bin_path, std::ios_base::out | std::ios_base::binary);
bin_file.write(static_cast<const char*>(tensor.data()), static_cast<std::streamsize>(tensor.get_byte_size()));
}
const auto meta_path = base_path + ".txt";
{
std::ofstream meta_file(meta_path);
meta_file << tensor.get_element_type() << ' ' << tensor.get_shape() << std::endl;
}
}

void dump_config(const std::string& name, const ov::AnyMap& config) {
std::cout << name << std::endl;
for (auto&& [key, value] : config) {
std::cout << key << " : ";
value.print(std::cout);
std::cout << ", ";
}
std::cout << std::endl;
}

namespace opp = ov::pass::pattern;
class TransposeValueTensors : public ov::pass::MatcherPass {
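The PRINT_VAR macro and the dump_tensor / dump_config helpers added above are this PR's main debugging hooks. A usage sketch, assuming an existing ov::InferRequest named request and an ov::AnyMap named prefill_config:

```cpp
// Dump an output tensor: writes logits.bin (raw bytes) and logits.txt (element type and shape).
ov::Tensor logits = request.get_tensor("logits");
dump_tensor(logits, "logits");

// Print every key/value pair of a config map on one line.
dump_config("Prefill config", prefill_config);

// Print "logits_size : <value>" to stdout.
const auto logits_size = logits.get_size();
PRINT_VAR(logits_size);
```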
@@ -686,14 +720,16 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& config
) : LLMPipelineImplBase(tokenizer,
utils::from_config_json_if_exists(models_path)) {
utils::from_config_json_if_exists(models_path)),
m_sampler(m_tokenizer) {

auto model = genai::utils::singleton_core().read_model(models_path / "openvino_model.xml", {}, config);
ModelConfigDesc model_desc = get_modeldesc_from_json(models_path / "config.json");
ov::AnyMap properties = config;

auto compiled = setupAndCompileModel(model, model_desc, properties);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}


@@ -704,10 +740,12 @@ StatefulLLMPipeline::StatefulLLMPipeline(
const std::string&,
const ov::AnyMap& properties,
const ov::genai::GenerationConfig& generation_config
) : LLMPipelineImplBase(tokenizer, generation_config) {
) : LLMPipelineImplBase(tokenizer, generation_config),
m_sampler(m_tokenizer) {
ov::AnyMap properties_copy = properties;
auto compiled = setupAndCompileModel(model, model_desc, properties_copy);
m_request = compiled->create_infer_request();
m_sampler.set_seed(m_generation_config.rng_seed);
}

std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
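Both StatefulLLMPipeline constructors now build an ov::genai::Sampler member and seed it from the generation config, which is what enables multinomial decoding later in generate(). A config that would take the sampling path might look like this (a sketch; the values are illustrative):

```cpp
ov::genai::GenerationConfig config;
config.max_new_tokens = 100;
config.do_sample   = true;   // multinomial decoding instead of greedy
config.temperature = 0.7f;
config.top_p       = 0.9f;
config.rng_seed    = 42;     // forwarded to the sampler via m_sampler.set_seed(...)
```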
@@ -717,6 +755,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(

const uint32_t kMaxPromptLen = pop_int_and_cast(pipeline_config, "MAX_PROMPT_LEN").value_or(1024u);
const uint32_t kMinResponseLen = pop_int_and_cast(pipeline_config, "MIN_RESPONSE_LEN").value_or(128u);
m_max_prompt_len = kMaxPromptLen;
m_kvcache_total = kMaxPromptLen + kMinResponseLen;

update_config(pipeline_config, {"NPU_USE_NPUW", "YES"});
@@ -725,6 +764,7 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
KVAxesPosition axes = get_kv_axes(model_desc.type);
update_config(pipeline_config, {"NPUW_LLM_BATCH_DIM", axes.batch});
update_config(pipeline_config, {"NPUW_LLM_SEQ_LEN_DIM", axes.seq_len});
pipeline_config["NPUW_LLM_PAD_TOKEN_ID"] = m_tokenizer.get_pad_token_id();

update_config(pipeline_config, {"NPUW_LLM_MAX_PROMPT_LEN", kMaxPromptLen});
update_config(pipeline_config, {"NPUW_LLM_MIN_RESPONSE_LEN", kMinResponseLen});
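For reference, the user-facing knobs consumed in this function and the NPUW options it derives from them, listing only keys that appear in this diff (values are illustrative):

```cpp
ov::AnyMap pipeline_config{
    {"MAX_PROMPT_LEN",   1024u},  // popped into kMaxPromptLen (defaults to 1024)
    {"MIN_RESPONSE_LEN", 128u},   // popped into kMinResponseLen (defaults to 128)
};
// setupAndCompileModel() then adds, among others:
//   NPU_USE_NPUW            = "YES"
//   NPUW_LLM_BATCH_DIM      / NPUW_LLM_SEQ_LEN_DIM   (from the model's KV axes)
//   NPUW_LLM_MAX_PROMPT_LEN / NPUW_LLM_MIN_RESPONSE_LEN
//   NPUW_LLM_PAD_TOKEN_ID   = m_tokenizer.get_pad_token_id()   (new in this PR)
```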
@@ -744,7 +784,9 @@ std::shared_ptr<ov::CompiledModel> StatefulLLMPipeline::setupAndCompileModel(
// Replace CACHE_DIR option if NPUW is enabled
set_npuw_cache_dir(pipeline_config);

return std::make_shared<ov::CompiledModel>(genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config));
auto comp_model = genai::utils::singleton_core().compile_model(model, "NPU", pipeline_config);
ov::genai::utils::print_compiled_model_properties(comp_model, "Stateful LLM NPU Compiled Model");
return std::make_shared<ov::CompiledModel>(comp_model);
}

DecodedResults StatefulLLMPipeline::generate(
@@ -817,7 +859,9 @@ EncodedResults StatefulLLMPipeline::generate(
attention_mask = data->attention_mask;
}

OPENVINO_ASSERT(input_ids.get_shape().at(0) == 1u, "Currently only batch size=1 is supported");
ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
OPENVINO_ASSERT(batch_size == 1u, "Currently only batch size=1 is supported");

GenerationConfig config = (generation_config.has_value()) ? *generation_config : m_generation_config;
// If eos_token_id was not provided, take value from default m_generation_config
@@ -834,49 +878,91 @@ EncodedResults StatefulLLMPipeline::generate(
streamer_ptr = std::make_shared<TextCallbackStreamer>(m_tokenizer, *callback);
}

OPENVINO_ASSERT(config.is_greedy_decoding(), "Currently only greedy decoding is supported");
OPENVINO_ASSERT(config.is_greedy_decoding() || config.is_multinomial(),
"Currently only greedy and multinomial decoding are supported");

OPENVINO_ASSERT(config.num_return_sequences == 1u,
"Currently only \"num_return_sequences\" equal to 1 is supported!");

ov::Shape prompts_shape = input_ids.get_shape();
const size_t batch_size = prompts_shape[0];
ov::genai::EncodedResults results;
auto& raw_perf_counters = results.perf_metrics.raw_metrics;
// NB: Only batch=1 is supported now
results.scores.resize(1u);
results.scores[0] = 0u;
results.tokens.resize(1u);

// TODO: Check if there is enough space in KV-cache to process input prompt
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_max_prompt_len) {
OPENVINO_THROW("Static Stateful LLM pipeline may only process prompts up to "
+ std::to_string(m_max_prompt_len) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}

ov::Tensor position_ids{ov::element::i64, input_ids.get_shape()};
utils::initialize_position_ids(position_ids, attention_mask);

ov::genai::utils::print_tensor(input_ids);
ov::genai::utils::print_tensor(attention_mask);
ov::genai::utils::print_tensor(position_ids);
m_request.set_tensor("input_ids", input_ids);
m_request.set_tensor("attention_mask", attention_mask);
m_request.set_tensor("position_ids", position_ids);

m_request.infer();

int64_t last_token = utils::argmax(m_request.get_tensor("logits"), 0);

results.tokens[0].push_back(last_token);
if (streamer_ptr && streamer_ptr->put(last_token)) {
return results;
auto padded_logits = m_request.get_tensor("logits");
//dump_tensor(padded_logits, "stateful_padded_logits");
auto last_token = utils::argmax(padded_logits, 0);
PRINT_VAR(last_token);
// FIXME: This is a workaround to keep only the useful part of the returned logits.
// If SliceOut is applied, only one useful logit is returned and nothing is
// required here. Otherwise the model returns logits for the full context
// length, as the prefill model is internally reshaped to produce them.
// The proper fix belongs on the OpenVINO side: the model should return only
// the useful logits for the input prompt length, dropping the
// implementation-related padding ones.
auto logits = padded_logits;
auto padded_sequence_len = padded_logits.get_shape()[1];
PRINT_VAR(padded_sequence_len);
if (padded_sequence_len > 1) {
// If SliceOut is not applied:
logits = make_tensor_slice(padded_logits, 1, padded_sequence_len - prompt_len, padded_sequence_len);
}
int64_t output_sequence_len = logits.get_shape().at(1);
PRINT_VAR(output_sequence_len);
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(sequence_group->get_prompt_len() - output_sequence_len);
sequence_group->schedule_tokens(output_sequence_len);

// NB: Controls what tokens are ready to be pushed into the streamer
GenerationHandle handle = std::make_shared<GenerationHandleImpl>(
sequence_group->get_generation_stream(), sequence_group->get_sampling_parameters());

SamplerOutput sampler_output = m_sampler.sample({sequence_group}, logits);
stream_generated_tokens(streamer_ptr, handle);

int64_t input_ids_data = -1;
int64_t position_ids_data = prompt_len - 1;
std::vector<int64_t> attention_mask_data(prompt_len - 1, 1);
m_request.set_tensor("input_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&input_ids_data)));
m_request.set_tensor("position_ids", ov::Tensor(ov::element::i64, ov::Shape{1,1}, reinterpret_cast<void*>(&position_ids_data)));

const size_t max_tokens = config.get_max_new_tokens(prompt_len);
for (int i = 0; i < max_tokens - 1; ++i) {
while (sequence_group->is_running()) {
// KV Cache is full, no further generation is possible
if (position_ids_data + 1 == m_kvcache_total) {
sequence_group->set_out_of_memory();
break;
}

sequence_group->schedule_tokens(1);
const auto running_sequences = sequence_group->get_running_sequences();
OPENVINO_ASSERT(running_sequences.size() == 1u);
auto last_token = running_sequences.front()->get_generated_ids().back();
PRINT_VAR(last_token);

// Just change the variables here, as pointers to them are already set to corresponding tensors
input_ids_data = last_token;
++position_ids_data;
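The FIXME block above is the core of the stateful prefill change: without SliceOut the model returns logits for the whole padded context, and only the last prompt_len positions are meaningful. Reduced to its essentials, using the same helpers and variables as the diff (a sketch):

```cpp
// Padded logits shape: [1, padded_seq_len, vocab_size].
ov::Tensor padded_logits = m_request.get_tensor("logits");
const size_t padded_seq_len = padded_logits.get_shape()[1];

// Keep only the last prompt_len positions; if SliceOut already reduced the
// output to a single position, use the tensor as is.
ov::Tensor logits = (padded_seq_len > 1)
    ? make_tensor_slice(padded_logits, 1 /*dim*/, padded_seq_len - prompt_len, padded_seq_len)
    : padded_logits;

// The SequenceGroup is then told that prompt_len - output_sequence_len tokens
// are already processed and that output_sequence_len tokens are scheduled,
// so the Sampler sees exactly the useful tail of the prompt.
```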
@@ -886,24 +972,29 @@ EncodedResults StatefulLLMPipeline::generate(

m_request.infer();

last_token = utils::argmax(m_request.get_tensor("logits"), 0);
results.tokens[0].push_back(last_token);

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);
if (streamer_ptr && streamer_ptr->put(last_token)) {
break;
}

if (last_token == config.eos_token_id && !config.ignore_eos) {
break;
}
auto len = m_request.get_tensor("logits").get_shape()[1];
PRINT_VAR(len);
auto new_token = utils::argmax(m_request.get_tensor("logits"), 0);
PRINT_VAR(new_token);

SamplerOutput sampler_output = m_sampler.sample(
{sequence_group}, m_request.get_tensor("logits"));
stream_generated_tokens(streamer_ptr, handle);
}

if (streamer_ptr) {
streamer_ptr->end();
}

OPENVINO_ASSERT(sequence_group->get_finished_sequences().size() == 1u);
auto sequence = sequence_group->get_finished_sequences().front();
results.tokens[0] = sequence->get_generated_ids();
results.scores[0] = sequence->get_cumulative_log_prob();
m_sampler.clear_request_info(sequence_group->get_request_id());

auto stop_time = std::chrono::steady_clock::now();
// If this is called without tokenization, that stat will not be reported.
auto& metrics = results.perf_metrics;
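After prefill, every further token follows the same loop; stripped of the debug prints it is essentially (a sketch mirroring the code above, attention-mask growth omitted):

```cpp
while (sequence_group->is_running()) {
    if (position_ids_data + 1 == m_kvcache_total) {   // KV-cache exhausted
        sequence_group->set_out_of_memory();
        break;
    }
    sequence_group->schedule_tokens(1);               // exactly one new token per step

    // Feed back the last sampled token, then run the kvcache model.
    input_ids_data = sequence_group->get_running_sequences().front()->get_generated_ids().back();
    ++position_ids_data;
    m_request.infer();

    m_sampler.sample({sequence_group}, m_request.get_tensor("logits"));
    stream_generated_tokens(streamer_ptr, handle);    // pushes ready tokens to the streamer
}
```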
@@ -1028,8 +1119,8 @@ void StatelessLLMPipeline::setupAndCompileModels(
auto kvcache_model = model;
// (2) Expose KV-cache input and output layers from kvcache model
ov::pass::StatefulToStateless().run_on_model(kvcache_model);
// (3) Align u4 ZP constants
align_u4_zp_constants(kvcache_model);
// // (3) Align u4 ZP constants
// align_u4_zp_constants(kvcache_model);
// (4) Clone the model - this will be prefill
auto prefill_model = kvcache_model->clone();
prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill");
@@ -1071,15 +1162,17 @@ void StatelessLLMPipeline::setupAndCompileModels(
set_npuw_cache_dir(prefill_config);
set_npuw_cache_dir(generate_config);

dump_config("Stateless: Generate config", generate_config);
auto kv_compiled_model = core.compile_model(
kvcache_model, device, generate_config
);
ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Static LLM kv compiled model");
ov::genai::utils::print_compiled_model_properties(kv_compiled_model, "Stateless LLM kv compiled model");
m_kvcache_request = kv_compiled_model.create_infer_request();

dump_config("Stateless: Prefill config", prefill_config);
auto prefill_compiled_model = core.compile_model(prefill_model, device, prefill_config);
m_prefill_request = prefill_compiled_model.create_infer_request();
ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Static LLM prefill compiled model");
ov::genai::utils::print_compiled_model_properties(prefill_compiled_model, "Stateless LLM prefill compiled model");
}

void StatelessLLMPipeline::setupAndImportModels(
@@ -1288,7 +1381,7 @@ EncodedResults StatelessLLMPipeline::generate(
// NB: Check if there is enough space in KV-cache to process input prompt
auto prompt_len = input_ids.get_size();
if (prompt_len > m_kvcache_desc.max_prompt_size) {
OPENVINO_THROW("Static LLM pipeline may only process prompts up to "
OPENVINO_THROW("Static Stateless LLM pipeline may only process prompts up to "
+ std::to_string(m_kvcache_desc.max_prompt_size) + " tokens. "
+ "Set the \"MAX_PROMPT_LEN\" config option to increase the limit.");
}
@@ -1297,16 +1390,22 @@ EncodedResults StatelessLLMPipeline::generate(
// but if continuation is needed, prompt contains information about the entire conversation.
prepare_for_new_conversation();

ov::genai::utils::print_tensor(input_ids);

auto padded_input_ids = m_prefill_request.get_tensor("input_ids");
const size_t offset = padded_input_ids.get_size() - input_ids.get_size();
copy_with_offset(input_ids, offset, padded_input_ids);

auto padded_attention_mask = m_prefill_request.get_tensor("attention_mask");
fill_tensor<int64_t>(padded_attention_mask, 1u, offset);
ov::genai::utils::print_tensor(make_tensor_slice(padded_attention_mask, 1, offset, padded_attention_mask.get_size()));

auto padded_position_ids = m_prefill_request.get_tensor("position_ids");
auto padded_inputs_len = padded_input_ids.get_size();
PRINT_VAR(padded_inputs_len);
auto* padded_pos_data = padded_position_ids.data<int64_t>();
std::iota(padded_pos_data + offset, padded_pos_data + padded_position_ids.get_size(), 0u);
ov::genai::utils::print_tensor(make_tensor_slice(padded_position_ids, 1, offset, padded_position_ids.get_size()));

m_prefill_request.infer();
raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
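The stateless prefill keeps the prompt right-aligned inside the fixed-size inputs: tokens are copied at an offset from the end, the attention mask is 1 only over that tail, and position ids count from 0 there. Schematically, with the helper names used above (a sketch):

```cpp
// Layout for a prompt of n tokens inside a MAX_PROMPT_LEN-sized input:
//   padded_input_ids : [ pad ... pad | t0  t1 ... t(n-1) ]
//   attention_mask   : [  0  ...  0  |  1   1 ...   1    ]
//   position_ids     : [  .  ...  .  |  0   1 ...  n-1   ]
const size_t offset = padded_input_ids.get_size() - input_ids.get_size();
copy_with_offset(input_ids, offset, padded_input_ids);
fill_tensor<int64_t>(padded_attention_mask, 1u, offset);
auto* pos = padded_position_ids.data<int64_t>();
std::iota(pos + offset, pos + padded_position_ids.get_size(), 0u);
```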
@@ -1316,8 +1415,13 @@ EncodedResults StatelessLLMPipeline::generate(
m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(prompt_len);

auto logits = m_prefill_request.get_tensor("logits");
//dump_tensor(logits, "stateless_padded_logits");
auto last_token = utils::argmax(logits, 0);
PRINT_VAR(last_token);
int64_t output_sequence_len = logits.get_shape().at(1);

PRINT_VAR(output_sequence_len);
// TODO: Pass input_ids to say that there is room for generation.
// Retrieve only the useful logits and work only with them here.
auto sequence_group = std::make_shared<SequenceGroup>(
0 /* request_id */, padded_input_ids, config, 1 /* block_size */);
sequence_group->update_processed_tokens_num(m_kvcache_desc.max_prompt_size - output_sequence_len);
@@ -1377,13 +1481,21 @@ EncodedResults StatelessLLMPipeline::generate(
input_ids_data[0] = running_sequences.front()->get_generated_ids().back();
position_ids_data[0] = m_kvcache_desc.num_stored_tokens;
attention_mask_data[m_kvcache_desc.num_stored_tokens - 1] = 1u;
auto last_token = running_sequences.front()->get_generated_ids().back();
PRINT_VAR(last_token);

m_kvcache_request.infer();
m_kvcache_desc.num_stored_tokens += 1;

raw_perf_counters.m_new_token_times.emplace_back(std::chrono::steady_clock::now());
raw_perf_counters.m_batch_sizes.emplace_back(batch_size);

auto len = m_kvcache_request.get_tensor("logits").get_shape()[1];
PRINT_VAR(len);
auto new_token = utils::argmax(m_kvcache_request.get_tensor("logits"), 0);
PRINT_VAR(new_token);

std::cout << std::endl;
SamplerOutput sampler_output = m_sampler.sample(
{sequence_group}, m_kvcache_request.get_tensor("logits"));
stream_generated_tokens(streamer_ptr, handle);
4 changes: 4 additions & 0 deletions src/cpp/src/llm_pipeline_static.hpp
@@ -75,8 +75,12 @@ class StatefulLLMPipeline : public LLMPipelineImplBase {
void finish_chat() override;

private:
uint32_t m_max_prompt_len = 0u;
uint32_t m_kvcache_total = 0u;
ov::InferRequest m_request;

Sampler m_sampler;

bool m_is_chat_conversation = false;
ChatHistory m_history;
};