tokenizer: read simplified_chat_template (#1712)
Depends on huggingface/optimum-intel#1151

Close #1663

Ticket 161313
Wovchena authored Feb 12, 2025
1 parent cfd220e commit 982a0dd
Showing 8 changed files with 208 additions and 97 deletions.
17 changes: 16 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
@@ -23,7 +23,22 @@ struct TokenizedInputs {
};

/**
* @brief class is used to encode prompts and decode resulting tokens
* @brief The class is used to encode prompts and decode resulting tokens
*
* Chat template is initialized from the following sources, in order,
* each overriding the previous value:
* 1. chat_template entry from tokenizer_config.json
* 2. chat_template entry from processor_config.json
* 3. chat_template entry from chat_template.json
* 4. chat_template entry from the rt_info section of ov::Model
* 5. If the template is known to be unsupported by GenAI, it's
*    replaced with a simplified supported version.
* 6. The chat_template is patched, replacing unsupported instructions
*    with equivalents.
* 7. If the template was not in the list of GenAI-unsupported
*    templates from (5), it's blindly replaced with the
*    simplified_chat_template entry from the rt_info section of
*    ov::Model if that entry exists.
*/
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
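For context, a minimal usage sketch of the chat-template API that this resolution order feeds (illustrative only, not part of this commit; the model path is a placeholder, ChatHistory is assumed to be a vector of role/content string maps, and apply_chat_template's template argument is assumed to default to an empty string):

#include "openvino/genai/tokenizer.hpp"
#include <iostream>

int main() {
    // Constructing the tokenizer runs the template resolution described above.
    ov::genai::Tokenizer tokenizer("/path/to/exported_model");  // placeholder path

    ov::genai::ChatHistory history{
        {{"role", "user"}, {"content", "What is OpenVINO GenAI?"}}
    };

    // Renders the history with whichever template won the precedence chain,
    // including a simplified_chat_template taken from rt_info when applicable.
    std::string prompt = tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
    std::cout << prompt << "\n";

    // Templates set explicitly are remapped and patched the same way (steps 5-6).
    tokenizer.set_chat_template(tokenizer.get_chat_template());
    return 0;
}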
153 changes: 82 additions & 71 deletions src/cpp/src/tokenizer.cpp
@@ -70,6 +70,61 @@ const std::pair<std::string, std::string> chat_template_fallback_map[] = {
}
};

std::optional<std::string> remap_template(const std::string& chat_template) {
for (const auto& [known, fallback] : chat_template_fallback_map) {
if (chat_template == known) {
return fallback;
}
}
return std::nullopt;
}

void parse_if_exists(const std::filesystem::path& path, std::string& value) {
if (std::filesystem::exists(path)) {
ov::genai::utils::read_json_param(nlohmann::json::parse(std::ifstream{path}), "chat_template", value);
}
}

template <typename T>
const T& find_or_fallback(const ov::AnyMap& rt_info, const char name[], const T& fallback) {
auto iter = rt_info.find(name);
if (rt_info.end() == iter) {
return fallback;
}
return iter->second.as<T>();
}

std::string patch_template(std::string&& chat_template) {
// Replace what jinja2cpp doesn't support
std::pair<std::string, std::string> replace_str_map[] = {
{"'}", "' }"},
{"{'", "{ '"},
{".strip()", ""},
{"is not none", "is defined"},
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};

for (const auto& [from, to] : replace_str_map) {
size_t pos = 0;
while ((pos = chat_template.find(from, pos)) != std::string::npos) {
chat_template.replace(pos, from.size(), to);
pos += to.size();
}
}
return chat_template;
}

std::string remap_and_patch(const std::string& chat_template) {
return patch_template(
remap_template(chat_template).value_or(chat_template)
);
}

} // namespace

namespace ov {
Expand Down Expand Up @@ -195,11 +250,10 @@ class Tokenizer::TokenizerImpl {
read_special_tokens_map(models_path);
// Try to read tokenizer_config if some token ids or token str are not defined.
read_tokenizer_config_if_necessary(models_path);
parse_if_exists(models_path / "tokenizer_config.json", m_chat_template);
parse_if_exists(models_path / "processor_config.json", m_chat_template);
parse_if_exists(models_path / "chat_template.json", m_chat_template);
setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
m_chat_template = chat_template_from_file_if_exists(models_path, "tokenizer_config.json");
if (m_chat_template.empty()) {
m_chat_template = chat_template_from_file_if_exists(models_path, "chat_template.json");
}
}

void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
@@ -209,10 +263,8 @@ class Tokenizer::TokenizerImpl {
auto core = get_core_singleton();
std::string device = "CPU"; // only CPU is supported for now

std::string version_str;
utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
// Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
m_older_than_24_5 = version_str.empty();
// Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
m_older_than_24_5 = (ov_tokenizer ? ov_tokenizer: ov_detokenizer)->get_rt_info().count("openvino_tokenizers_version") == 0;

if (ov_tokenizer) {
ov::pass::Manager manager;
@@ -227,6 +279,18 @@ class Tokenizer::TokenizerImpl {
[this]() -> ov::InferRequest {
return std::move(this->m_tokenizer.create_infer_request());
});

const ov::AnyMap& rt_info = ov_tokenizer->get_rt_info();
m_pad_token_id = find_or_fallback(rt_info, "pad_token_id", m_pad_token_id);
m_bos_token_id = find_or_fallback(rt_info, "bos_token_id", m_bos_token_id);
m_eos_token_id = find_or_fallback(rt_info, "eos_token_id", m_eos_token_id);

m_chat_template = find_or_fallback(rt_info, "chat_template", m_chat_template);
std::optional<std::string> fallback = remap_template(m_chat_template);
m_chat_template = patch_template(fallback.value_or(m_chat_template));
if (!fallback.has_value()) {
m_chat_template = find_or_fallback(rt_info, "simplified_chat_template", m_chat_template);
}
}

if (ov_detokenizer) {
@@ -241,6 +305,14 @@ class Tokenizer::TokenizerImpl {
[this]() -> ov::InferRequest {
return std::move(this->m_detokenizer.create_infer_request());
});

// Unset/-1 token causes exception in SentencePiece detokenization.
if (m_pad_token_id != -1 && m_pad_token.empty())
m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
if (m_bos_token_id != -1 && m_bos_token.empty())
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
}

// Initialize tokenizer's cache to save time later.
@@ -251,24 +323,6 @@ class Tokenizer::TokenizerImpl {
if (m_detokenizer) {
decode({1, 33, 199, 42, 42});
}

if (m_tokenizer) {
utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
}

m_chat_template = patch_chat_template(m_chat_template);
if (m_detokenizer) {
// Unset/-1 token causes exception in SentencePiece detokenization.
if (m_pad_token_id != -1 && m_pad_token.empty())
m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
if (m_bos_token_id != -1 && m_bos_token.empty())
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
}
}

// load special tokens ids from config.json
@@ -495,53 +549,10 @@ class Tokenizer::TokenizerImpl {
return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
}

std::string patch_chat_template(std::string template_str) const {
for (const auto& [chat_template, fallback] : chat_template_fallback_map) {
if (template_str == chat_template) {
return fallback;
}
}

// Replace what jinja2cpp doesn't support
std::pair<std::string, std::string> replace_str_map[] = {
{"'}", "' }"},
{"{'", "{ '"},
{".strip()", ""},
{"is not none", "is defined"},
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};

for (const auto& [from, to] : replace_str_map) {
size_t pos = 0;
while ((pos = template_str.find(from, pos)) != std::string::npos) {
template_str.replace(pos, from.size(), to);
pos += to.size();
}
}
return template_str;
}

std::string chat_template_from_file_if_exists(const std::filesystem::path& path, const std::string& file_name) {
auto tokenizer_config_file_path = path / file_name;
if (!std::filesystem::exists(tokenizer_config_file_path))
return "";
std::ifstream file(tokenizer_config_file_path);

std::string res;
ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res);

return patch_chat_template(res);
}

std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
const std::string& chat_template) const {
std::string chat_tpl = chat_template.empty() ? m_chat_template : patch_chat_template(chat_template);
std::string chat_tpl = chat_template.empty() ? m_chat_template : remap_and_patch(chat_template);
OPENVINO_ASSERT(!chat_tpl.empty(),
"Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario."
" Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario."
@@ -599,7 +610,7 @@ class Tokenizer::TokenizerImpl {
}

void set_chat_template(const std::string& chat_template) {
m_chat_template = patch_chat_template(chat_template);
m_chat_template = remap_and_patch(chat_template);
}

std::string get_chat_template() {
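For illustration (not part of the diff), a self-contained sketch of the find-and-replace pass performed by patch_template above, applied to a made-up template fragment that uses constructs Jinja2Cpp does not support:

#include <iostream>
#include <string>
#include <utility>

int main() {
    // Same replacement pairs used by patch_template above.
    std::pair<std::string, std::string> replace_str_map[] = {
        {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""},
        {"is not none", "is defined"}, {"is none", "is undefined"},
        {"= none", "= undefined"}, {"messages[1:]", "slice(messages, 1)"},
    };

    // A made-up template fragment with Python-style slicing and .strip().
    std::string tpl = "{% for m in messages[1:] %}{{ m['content'].strip() }}{% endfor %}";

    for (const auto& [from, to] : replace_str_map) {
        size_t pos = 0;
        while ((pos = tpl.find(from, pos)) != std::string::npos) {
            tpl.replace(pos, from.size(), to);
            pos += to.size();
        }
    }
    // Prints: {% for m in slice(messages, 1) %}{{ m['content'] }}{% endfor %}
    std::cout << tpl << "\n";
    return 0;
}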
17 changes: 0 additions & 17 deletions src/cpp/src/utils.cpp
@@ -283,23 +283,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)
}
}

template <typename T>
void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
if (!model)
return;
if (model->get_rt_info().count(name) == 0)
return;
auto str_value = model->get_rt_info().at(name).as<std::string>();
if constexpr (std::is_same<T, int64_t>::value) {
value = std::stoll(str_value);
} else if constexpr (std::is_same<T, std::string>::value) {
value = str_value;
}
}

template void read_rt_info<int64_t>(std::shared_ptr<ov::Model>&, const char*, int64_t&);
template void read_rt_info<std::string>(std::shared_ptr<ov::Model>&, const char*, std::string&);

ov::Core singleton_core() {
static ov::Core core;
return core;
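The templated read_rt_info helper deleted here is superseded by direct lookups in the model's rt_info map, as done by find_or_fallback in tokenizer.cpp. A minimal sketch of that lookup pattern (illustrative; the key is one of the ids read in tokenizer.cpp and the model pointer is a placeholder):

#include <memory>
#include <openvino/openvino.hpp>

// Mirrors find_or_fallback: return the rt_info value if present, otherwise the fallback.
int64_t read_eos_token_id(const std::shared_ptr<ov::Model>& model, int64_t fallback) {
    const ov::AnyMap& rt_info = model->get_rt_info();
    auto iter = rt_info.find("eos_token_id");
    return iter == rt_info.end() ? fallback : iter->second.as<int64_t>();
}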
3 changes: 0 additions & 3 deletions src/cpp/src/utils.hpp
@@ -111,9 +111,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)

ov::Core singleton_core();

template <typename T>
void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);

size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);

size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
19 changes: 17 additions & 2 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -1713,8 +1713,23 @@ class TokenizedInputs:
...
class Tokenizer:
"""
openvino_genai.Tokenizer object is used to initialize Tokenizer
if it's located in a different path than the main model.
The class is used to encode prompts and decode resulting tokens
Chat template is initialized from the following sources, in order,
each overriding the previous value:
1. chat_template entry from tokenizer_config.json
2. chat_template entry from processor_config.json
3. chat_template entry from chat_template.json
4. chat_template entry from the rt_info section of openvino.Model
5. If the template is known to be unsupported by GenAI, it's
replaced with a simplified supported version.
6. The chat_template is patched, replacing unsupported instructions
with equivalents.
7. If the template was not in the list of GenAI-unsupported
templates from (5), it's blindly replaced with the
simplified_chat_template entry from the rt_info section of
openvino.Model if that entry exists.
"""
chat_template: str
def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:
27 changes: 24 additions & 3 deletions src/python/py_tokenizer.cpp
@@ -13,6 +13,29 @@

#include "py_utils.hpp"

namespace {

constexpr char class_docstring[] = R"(
The class is used to encode prompts and decode resulting tokens
Chat template is initialized from the following sources, in order,
each overriding the previous value:
1. chat_template entry from tokenizer_config.json
2. chat_template entry from processor_config.json
3. chat_template entry from chat_template.json
4. chat_template entry from the rt_info section of openvino.Model
5. If the template is known to be unsupported by GenAI, it's
replaced with a simplified supported version.
6. The chat_template is patched, replacing unsupported instructions
with equivalents.
7. If the template was not in the list of GenAI-unsupported
templates from (5), it's blindly replaced with the
simplified_chat_template entry from the rt_info section of
openvino.Model if that entry exists.
)";

} // namespace

namespace py = pybind11;
namespace pyutils = ov::genai::pybind::utils;

@@ -26,9 +49,7 @@ void init_tokenizer(py::module_& m) {
.def_readwrite("input_ids", &TokenizedInputs::input_ids)
.def_readwrite("attention_mask", &TokenizedInputs::attention_mask);

py::class_<ov::genai::Tokenizer>(m, "Tokenizer",
R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
if it's located in a different path than the main model.)")
py::class_<ov::genai::Tokenizer>(m, "Tokenizer", class_docstring)

.def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties, const py::kwargs& kwargs) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
1 change: 1 addition & 0 deletions tests/python_tests/test_kv_cache_eviction.py
@@ -117,6 +117,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
generation_config = GenerationConfig() # expecting default greedy sampling
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = test_struct.max_new_tokens
generation_config.apply_chat_template = False

scheduler_config_opt = get_scheduler_config(test_struct.num_kv_blocks)
scheduler_config_opt.use_cache_eviction = test_struct.use_cache_eviction