tokenizer: read simplified_chat_template (#1712)
Depends on huggingface/optimum-intel#1151

Close #1663

Ticket 161313
Wovchena authored Feb 12, 2025
1 parent cfd220e commit 982a0dd
Showing 8 changed files with 208 additions and 97 deletions.
17 changes: 16 additions & 1 deletion src/cpp/include/openvino/genai/tokenizer.hpp
@@ -23,7 +23,22 @@ struct TokenizedInputs {
};

/**
* @brief class is used to encode prompts and decode resulting tokens
* @brief The class is used to encode prompts and decode resulting tokens
*
* Chat template is initialized from the following sources, in order,
* each overriding the previous value:
* 1. chat_template entry from tokenizer_config.json
* 2. chat_template entry from processor_config.json
* 3. chat_template entry from chat_template.json
* 4. chat_template entry from the rt_info section of ov::Model
* 5. If the template is known to be unsupported by GenAI, it's
*    replaced with a simplified supported version.
* 6. The chat_template is patched, replacing unsupported instructions
*    with equivalents.
* 7. If the template was not in the list of GenAI-unsupported
*    templates from (5), it's blindly replaced with the
*    simplified_chat_template entry from the rt_info section of
*    ov::Model if that entry exists.
*/
class OPENVINO_GENAI_EXPORTS Tokenizer {
public:
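For context, a minimal usage sketch of the chat-template API that this resolution order feeds (illustrative only, not part of this commit; the model path is a placeholder, ChatHistory is assumed to be a vector of role/content string maps, and apply_chat_template's template argument is assumed to default to an empty string):

#include "openvino/genai/tokenizer.hpp"
#include <iostream>

int main() {
    // Constructing the tokenizer runs the template resolution described above.
    ov::genai::Tokenizer tokenizer("/path/to/exported_model");  // placeholder path

    ov::genai::ChatHistory history{
        {{"role", "user"}, {"content", "What is OpenVINO GenAI?"}}
    };

    // Renders the history with whichever template won the precedence chain,
    // including a simplified_chat_template taken from rt_info when applicable.
    std::string prompt = tokenizer.apply_chat_template(history, /*add_generation_prompt=*/true);
    std::cout << prompt << "\n";

    // Templates set explicitly are remapped and patched the same way (steps 5-6).
    tokenizer.set_chat_template(tokenizer.get_chat_template());
    return 0;
}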
153 changes: 82 additions & 71 deletions src/cpp/src/tokenizer.cpp
@@ -70,6 +70,61 @@ const std::pair<std::string, std::string> chat_template_fallback_map[] = {
}
};

std::optional<std::string> remap_template(const std::string& chat_template) {
for (const auto& [known, fallback] : chat_template_fallback_map) {
if (chat_template == known) {
return fallback;
}
}
return std::nullopt;
}

void parse_if_exists(const std::filesystem::path& path, std::string& value) {
if (std::filesystem::exists(path)) {
ov::genai::utils::read_json_param(nlohmann::json::parse(std::ifstream{path}), "chat_template", value);
}
}

template <typename T>
const T& find_or_fallback(const ov::AnyMap& rt_info, const char name[], const T& fallback) {
auto iter = rt_info.find(name);
if (rt_info.end() == iter) {
return fallback;
}
return iter->second.as<T>();
}

std::string patch_template(std::string&& chat_template) {
// Replace what jinja2cpp doesn't support
std::pair<std::string, std::string> replace_str_map[] = {
{"'}", "' }"},
{"{'", "{ '"},
{".strip()", ""},
{"is not none", "is defined"},
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};

for (const auto& [from, to] : replace_str_map) {
size_t pos = 0;
while ((pos = chat_template.find(from, pos)) != std::string::npos) {
chat_template.replace(pos, from.size(), to);
pos += to.size();
}
}
return chat_template;
}

std::string remap_and_patch(const std::string& chat_template) {
return patch_template(
remap_template(chat_template).value_or(chat_template)
);
}

} // namespace

namespace ov {
Expand Down Expand Up @@ -195,11 +250,10 @@ class Tokenizer::TokenizerImpl {
read_special_tokens_map(models_path);
// Try to read tokenizer_config if some token ids or token str are not defined.
read_tokenizer_config_if_necessary(models_path);
parse_if_exists(models_path / "tokenizer_config.json", m_chat_template);
parse_if_exists(models_path / "processor_config.json", m_chat_template);
parse_if_exists(models_path / "chat_template.json", m_chat_template);
setup_tokenizer(std::make_pair(ov_tokenizer, ov_detokenizer), properties);
m_chat_template = chat_template_from_file_if_exists(models_path, "tokenizer_config.json");
if (m_chat_template.empty()) {
m_chat_template = chat_template_from_file_if_exists(models_path, "chat_template.json");
}
}

void setup_tokenizer(const std::pair<std::shared_ptr<ov::Model>, std::shared_ptr<ov::Model>>& models, const ov::AnyMap& properties) {
@@ -209,10 +263,8 @@ class Tokenizer::TokenizerImpl {
auto core = get_core_singleton();
std::string device = "CPU"; // only CPU is supported for now

std::string version_str;
utils::read_rt_info(ov_tokenizer != nullptr ? ov_tokenizer: ov_detokenizer , "openvino_tokenizers_version", version_str);
// Saving IR version was added only in 24.5, so if it's empty, then it's older than 24.5
m_older_than_24_5 = version_str.empty();
// Saving IR version was added only in 24.5, so if it's missing, then it's older than 24.5
m_older_than_24_5 = (ov_tokenizer ? ov_tokenizer: ov_detokenizer)->get_rt_info().count("openvino_tokenizers_version") == 0;

if (ov_tokenizer) {
ov::pass::Manager manager;
@@ -227,6 +279,18 @@ class Tokenizer::TokenizerImpl {
[this]() -> ov::InferRequest {
return std::move(this->m_tokenizer.create_infer_request());
});

const ov::AnyMap& rt_info = ov_tokenizer->get_rt_info();
m_pad_token_id = find_or_fallback(rt_info, "pad_token_id", m_pad_token_id);
m_bos_token_id = find_or_fallback(rt_info, "bos_token_id", m_bos_token_id);
m_eos_token_id = find_or_fallback(rt_info, "eos_token_id", m_eos_token_id);

m_chat_template = find_or_fallback(rt_info, "chat_template", m_chat_template);
std::optional<std::string> fallback = remap_template(m_chat_template);
m_chat_template = patch_template(fallback.value_or(m_chat_template));
if (!fallback.has_value()) {
m_chat_template = find_or_fallback(rt_info, "simplified_chat_template", m_chat_template);
}
}

if (ov_detokenizer) {
@@ -241,6 +305,14 @@ class Tokenizer::TokenizerImpl {
[this]() -> ov::InferRequest {
return std::move(this->m_detokenizer.create_infer_request());
});

// Unset/-1 token causes exception in SentencePiece detokenization.
if (m_pad_token_id != -1 && m_pad_token.empty())
m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
if (m_bos_token_id != -1 && m_bos_token.empty())
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
}

// Initialize tokenizer's cache to save time later.
@@ -251,24 +323,6 @@ class Tokenizer::TokenizerImpl {
if (m_detokenizer) {
decode({1, 33, 199, 42, 42});
}

if (m_tokenizer) {
utils::read_rt_info(ov_tokenizer, "chat_template", m_chat_template);
utils::read_rt_info(ov_tokenizer, "pad_token_id", m_pad_token_id);
utils::read_rt_info(ov_tokenizer, "bos_token_id", m_bos_token_id);
utils::read_rt_info(ov_tokenizer, "eos_token_id", m_eos_token_id);
}

m_chat_template = patch_chat_template(m_chat_template);
if (m_detokenizer) {
// Unset/-1 token causes exception in SentencePiece detokenization.
if (m_pad_token_id != -1 && m_pad_token.empty())
m_pad_token = decode(std::vector{m_pad_token_id}, {ov::genai::add_special_tokens(true)});
if (m_bos_token_id != -1 && m_bos_token.empty())
m_bos_token = decode(std::vector{m_bos_token_id}, {ov::genai::add_special_tokens(true)});
if (m_eos_token_id != -1 && m_eos_token.empty())
m_eos_token = decode(std::vector{m_eos_token_id}, {ov::genai::add_special_tokens(true)});
}
}

// load special tokens ids from config.json
@@ -495,53 +549,10 @@ class Tokenizer::TokenizerImpl {
return std::vector<std::string>(res_data, res_data + res.get_shape()[0]);
}

std::string patch_chat_template(std::string template_str) const {
for (const auto& [chat_template, fallback] : chat_template_fallback_map) {
if (template_str == chat_template) {
return fallback;
}
}

// Replace what jinja2cpp doesn't support
std::pair<std::string, std::string> replace_str_map[] = {
{"'}", "' }"},
{"{'", "{ '"},
{".strip()", ""},
{"is not none", "is defined"},
{"is none", "is undefined"},
{"= none", "= undefined"},
// Jinja2Cpp does not support Python-style slicing, e.g. [1:].
// If chat template contains such slicing, we replace it with
// a placeholder at the moment.
{"messages[1:]", "slice(messages, 1)"},
};

for (const auto& [from, to] : replace_str_map) {
size_t pos = 0;
while ((pos = template_str.find(from, pos)) != std::string::npos) {
template_str.replace(pos, from.size(), to);
pos += to.size();
}
}
return template_str;
}

std::string chat_template_from_file_if_exists(const std::filesystem::path& path, const std::string& file_name) {
auto tokenizer_config_file_path = path / file_name;
if (!std::filesystem::exists(tokenizer_config_file_path))
return "";
std::ifstream file(tokenizer_config_file_path);

std::string res;
ov::genai::utils::read_json_param(nlohmann::json::parse(file), "chat_template", res);

return patch_chat_template(res);
}

std::string apply_chat_template(ChatHistory history,
bool add_generation_prompt,
const std::string& chat_template) const {
std::string chat_tpl = chat_template.empty() ? m_chat_template : patch_chat_template(chat_template);
std::string chat_tpl = chat_template.empty() ? m_chat_template : remap_and_patch(chat_template);
OPENVINO_ASSERT(!chat_tpl.empty(),
"Chat template wasn't found. This may indicate that the model wasn't trained for chat scenario."
" Please add 'chat_template' to tokenizer_config.json to use the model in chat scenario."
@@ -599,7 +610,7 @@ class Tokenizer::TokenizerImpl {
}

void set_chat_template(const std::string& chat_template) {
m_chat_template = patch_chat_template(chat_template);
m_chat_template = remap_and_patch(chat_template);
}

std::string get_chat_template() {
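For illustration (not part of the diff), a self-contained sketch of the find-and-replace pass performed by patch_template above, applied to a made-up template fragment that uses constructs Jinja2Cpp does not support:

#include <iostream>
#include <string>
#include <utility>

int main() {
    // Same replacement pairs used by patch_template above.
    std::pair<std::string, std::string> replace_str_map[] = {
        {"'}", "' }"}, {"{'", "{ '"}, {".strip()", ""},
        {"is not none", "is defined"}, {"is none", "is undefined"},
        {"= none", "= undefined"}, {"messages[1:]", "slice(messages, 1)"},
    };

    // A made-up template fragment with Python-style slicing and .strip().
    std::string tpl = "{% for m in messages[1:] %}{{ m['content'].strip() }}{% endfor %}";

    for (const auto& [from, to] : replace_str_map) {
        size_t pos = 0;
        while ((pos = tpl.find(from, pos)) != std::string::npos) {
            tpl.replace(pos, from.size(), to);
            pos += to.size();
        }
    }
    // Prints: {% for m in slice(messages, 1) %}{{ m['content'] }}{% endfor %}
    std::cout << tpl << "\n";
    return 0;
}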
17 changes: 0 additions & 17 deletions src/cpp/src/utils.cpp
@@ -283,23 +283,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)
}
}

template <typename T>
void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value) {
if (!model)
return;
if (model->get_rt_info().count(name) == 0)
return;
auto str_value = model->get_rt_info().at(name).as<std::string>();
if constexpr (std::is_same<T, int64_t>::value) {
value = std::stoll(str_value);
} else if constexpr (std::is_same<T, std::string>::value) {
value = str_value;
}
}

template void read_rt_info<int64_t>(std::shared_ptr<ov::Model>&, const char*, int64_t&);
template void read_rt_info<std::string>(std::shared_ptr<ov::Model>&, const char*, std::string&);

ov::Core singleton_core() {
static ov::Core core;
return core;
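The templated read_rt_info helper deleted here is superseded by direct lookups in the model's rt_info map, as done by find_or_fallback in tokenizer.cpp. A minimal sketch of that lookup pattern (illustrative; the key is one of the ids read in tokenizer.cpp and the model pointer is a placeholder):

#include <memory>
#include <openvino/openvino.hpp>

// Mirrors find_or_fallback: return the rt_info value if present, otherwise the fallback.
int64_t read_eos_token_id(const std::shared_ptr<ov::Model>& model, int64_t fallback) {
    const ov::AnyMap& rt_info = model->get_rt_info();
    auto iter = rt_info.find("eos_token_id");
    return iter == rt_info.end() ? fallback : iter->second.as<int64_t>();
}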
3 changes: 0 additions & 3 deletions src/cpp/src/utils.hpp
@@ -111,9 +111,6 @@ void apply_gather_before_matmul_transformation(std::shared_ptr<ov::Model> model)

ov::Core singleton_core();

template <typename T>
void read_rt_info(std::shared_ptr<ov::Model>& model, const char* name, T& value);

size_t get_first_history_difference(const ov::Tensor& encoded_history, const std::vector<int64_t> tokenized_history, std::set<int64_t> stop_tokens);

size_t get_seq_len_axis(std::shared_ptr<const ov::Model> model);
19 changes: 17 additions & 2 deletions src/python/openvino_genai/py_openvino_genai.pyi
@@ -1713,8 +1713,23 @@ class TokenizedInputs:
...
class Tokenizer:
"""
openvino_genai.Tokenizer object is used to initialize Tokenizer
if it's located in a different path than the main model.
The class is used to encode prompts and decode resulting tokens
Chat template is initialized from the following sources, in order,
each overriding the previous value:
1. chat_template entry from tokenizer_config.json
2. chat_template entry from processor_config.json
3. chat_template entry from chat_template.json
4. chat_template entry from the rt_info section of openvino.Model
5. If the template is known to be unsupported by GenAI, it's
replaced with a simplified supported version.
6. The chat_template is patched, replacing unsupported instructions
with equivalents.
7. If the template was not in the list of GenAI-unsupported
templates from (5), it's blindly replaced with the
simplified_chat_template entry from the rt_info section of
openvino.Model if that entry exists.
"""
chat_template: str
def __init__(self, tokenizer_path: os.PathLike, properties: dict[str, typing.Any] = {}, **kwargs) -> None:
27 changes: 24 additions & 3 deletions src/python/py_tokenizer.cpp
@@ -13,6 +13,29 @@

#include "py_utils.hpp"

namespace {

constexpr char class_docstring[] = R"(
The class is used to encode prompts and decode resulting tokens
Chat template is initialized from the following sources, in order,
each overriding the previous value:
1. chat_template entry from tokenizer_config.json
2. chat_template entry from processor_config.json
3. chat_template entry from chat_template.json
4. chat_template entry from the rt_info section of openvino.Model
5. If the template is known to be unsupported by GenAI, it's
replaced with a simplified supported version.
6. The chat_template is patched, replacing unsupported instructions
with equivalents.
7. If the template was not in the list of GenAI-unsupported
templates from (5), it's blindly replaced with the
simplified_chat_template entry from the rt_info section of
openvino.Model if that entry exists.
)";

} // namespace

namespace py = pybind11;
namespace pyutils = ov::genai::pybind::utils;

@@ -26,9 +49,7 @@ void init_tokenizer(py::module_& m) {
.def_readwrite("input_ids", &TokenizedInputs::input_ids)
.def_readwrite("attention_mask", &TokenizedInputs::attention_mask);

py::class_<ov::genai::Tokenizer>(m, "Tokenizer",
R"(openvino_genai.Tokenizer object is used to initialize Tokenizer
if it's located in a different path than the main model.)")
py::class_<ov::genai::Tokenizer>(m, "Tokenizer", class_docstring)

.def(py::init([](const std::filesystem::path& tokenizer_path, const std::map<std::string, py::object>& properties, const py::kwargs& kwargs) {
ScopedVar env_manager(pyutils::ov_tokenizers_module_path());
1 change: 1 addition & 0 deletions tests/python_tests/test_kv_cache_eviction.py
@@ -117,6 +117,7 @@ def test_cache_optimized_generation_is_similar_to_unoptimized(converted_model, t
generation_config = GenerationConfig() # expecting default greedy sampling
generation_config.num_return_sequences = 1
generation_config.max_new_tokens = test_struct.max_new_tokens
generation_config.apply_chat_template = False

scheduler_config_opt = get_scheduler_config(test_struct.num_kv_blocks)
scheduler_config_opt.use_cache_eviction = test_struct.use_cache_eviction