Update llama.cpp submodule to latest release b4516 (#376)
* Update submodule to latest release b4516

* fix: patch

* fix: API

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Jan 21, 2025
1 parent fed4677 commit 56f9500
Showing 5 changed files with 68 additions and 34 deletions.
2 changes: 1 addition & 1 deletion llama.cpp
73 changes: 51 additions & 22 deletions patches/0001-Add-API-query-buffer-size.patch
@@ -4,55 +4,84 @@ Date: Mon, 30 Sep 2024 15:51:16 +0700
Subject: [PATCH] Add API query buffer size

---
include/llama.h | 3 +++
src/llama.cpp | 20 ++++++++++++++++++++
2 files changed, 23 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe..fdcbf949 100644
index 298b8d1b..0011dd8e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -447,6 +447,9 @@ extern "C" {
@@ -468,6 +468,8 @@ extern "C" {
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
+ LLAMA_API size_t llama_get_cpu_buffer(const struct llama_model * model);
+ LLAMA_API size_t llama_get_other_buffer(const struct llama_model * model);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);

+ LLAMA_API size_t const llama_get_cpu_buffer(const struct llama_model * model);
+ LLAMA_API size_t const llama_get_other_buffer(const struct llama_model * model);
+
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..80b3532e 100644
index 671d2a81..2d802349 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
@@ -606,6 +606,14 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}

+const size_t llama_get_cpu_buffer(const struct llama_model * model) {
+size_t llama_get_cpu_buffer(const struct llama_model * model) {
+ return model->llama_get_cpu_buffer();
+}
+
+size_t llama_get_other_buffer(const struct llama_model * model) {
+ return model->llama_get_other_buffer();
+}
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
return ctx->cparams.pooling_type;
}
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 590386e6..e7ead0fb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3750,6 +3750,26 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
return it->second;
}

+size_t llama_model::llama_get_cpu_buffer() const {
+ size_t buffer{0};
+ for (const auto& buf : model->bufs) {
+ for (const auto& buf : pimpl->bufs) {
+ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") == 0) {
+ buffer += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+ return buffer;
+}
+
+const size_t llama_get_other_buffer(const struct llama_model * model) {
+size_t llama_model::llama_get_other_buffer() const {
+ size_t buffer{0};
+ for (const auto& buf : model->bufs) {
+ for (const auto& buf : pimpl->bufs) {
+ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") != 0) {
+ buffer += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+ return buffer;
+}
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
return ctx->cparams.pooling_type;
}
//
// interface implementation
//
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..e04233ad 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -362,6 +362,10 @@ struct llama_model {

const struct ggml_tensor * get_tensor(const char * name) const;

+ size_t llama_get_cpu_buffer() const;
+
+ size_t llama_get_other_buffer() const;
+
private:
struct impl;
std::unique_ptr<impl> pimpl;
--
2.39.5 (Apple Git-154)
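
For context, a hedged usage sketch (not part of the commit) of the two accessors this patch exposes: they report how many bytes of the loaded model sit in "CPU"-named backend buffers versus all other backend buffers (e.g. GPU buffers when layers are offloaded). The loader and teardown calls and the model path below are assumptions about the llama.h shipped with this release, not something the diff itself shows.

```cpp
// Hedged sketch: query the buffer-size API added by
// patches/0001-Add-API-query-buffer-size.patch after loading a model.
// llama_model_load_from_file / llama_model_free are assumed to be the loader
// entry points in this llama.cpp release; the GGUF path is a placeholder.
#include <cstdio>

#include "llama.h"

int main(int argc, char** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model* model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Accessors added by the patch: bytes held in "CPU" backend buffers vs.
    // everything else, matching the strcmp-based split inside the patch.
    std::printf("CPU buffers:   %zu bytes\n", llama_get_cpu_buffer(model));
    std::printf("other buffers: %zu bytes\n", llama_get_other_buffer(model));

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```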

4 changes: 3 additions & 1 deletion src/llama_engine.cc
@@ -652,7 +652,9 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
}
params.n_predict = json_body->get("n_predict", -1).asInt();
params.prompt = json_body->get("prompt", "").asString();
params.conversation = json_body->get("conversation", false).asBool();
params.conversation_mode =
(common_conversation_mode)json_body->get("conversation", false)
.asBool();
params.special = json_body->get("special", false).asBool();

server_map_[model_id].caching_enabled =
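The new code folds the boolean `conversation` field into `common_conversation_mode` with a cast. A hedged sketch of a more explicit mapping, assuming llama.cpp's common.h defines the DISABLED/ENABLED enumerators with values 0 and 1 (which is what the bool-to-enum cast relies on):

```cpp
// Hedged sketch (not from the commit): spell out the bool -> enum mapping that
// the cast in LoadModelImpl depends on. Assumes common.h provides
// COMMON_CONVERSATION_MODE_DISABLED == 0 and COMMON_CONVERSATION_MODE_ENABLED == 1.
#include "common.h"

static common_conversation_mode to_conversation_mode(bool enabled) {
    return enabled ? COMMON_CONVERSATION_MODE_ENABLED
                   : COMMON_CONVERSATION_MODE_DISABLED;
}
```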
21 changes: 11 additions & 10 deletions src/llama_server_context.cc
@@ -232,10 +232,11 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
LOG_ERROR_LLAMA("Unable to get llama.cpp context", {});
return false;
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);

add_bos_token = llama_add_bos_token(model);
has_eos_token = !llama_add_eos_token(model);
add_bos_token = llama_add_bos_token(vocab);
has_eos_token = !llama_add_eos_token(vocab);

return true;
}
@@ -508,12 +509,12 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
slot->sparams.logit_bias.clear();

if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
}

const auto& logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_n_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto& el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) {
@@ -532,7 +533,7 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
slot->sparams.logit_bias.push_back({tok, bias});
}
} else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks) {
slot->sparams.logit_bias.push_back({tok, bias});
}
@@ -788,7 +789,7 @@ bool LlamaServerContext::ProcessToken(CompletionTokenOutput& result,
slot.has_next_token = false;
}

if (llama_token_is_eog(model, result.tok)) {
if (llama_vocab_is_eog(vocab, result.tok)) {
slot.stopped_eos = true;
slot.has_next_token = false;
LOG_VERBOSE("eos token found", {});
@@ -1397,14 +1398,14 @@ bool LlamaServerContext::UpdateSlots() {
}

prefix_tokens.insert(prefix_tokens.begin(),
llama_token_prefix(model));
llama_vocab_fim_pre(vocab));
prefix_tokens.insert(prefix_tokens.begin(),
llama_token_bos(model)); // always add BOS
llama_vocab_bos(vocab)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(),
llama_token_suffix(model));
llama_vocab_fim_suf(vocab));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(),
suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
prompt_tokens = prefix_tokens;
} else {
prompt_tokens =
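Taken together, these hunks migrate the server context from model-based token helpers to the vocab-based API: the vocab handle is fetched once in LoadModel and reused everywhere else. A minimal sketch of that pattern, using only the calls that appear in the diff above:

```cpp
// Minimal sketch (not from the commit): cache the vocab handle once, then
// route token queries through it, mirroring the replacements made above.
#include "llama.h"

struct TokenHelpers {
    const llama_vocab* vocab = nullptr;

    explicit TokenHelpers(const llama_model* model)
        : vocab(llama_model_get_vocab(model)) {}

    // was: llama_token_is_eog(model, tok)
    bool is_end_of_generation(llama_token tok) const {
        return llama_vocab_is_eog(vocab, tok);
    }

    // was: llama_token_bos(model) / llama_token_eos(model)
    llama_token bos() const { return llama_vocab_bos(vocab); }
    llama_token eos() const { return llama_vocab_eos(vocab); }

    // was: llama_n_vocab(model)
    int32_t n_tokens() const { return llama_vocab_n_tokens(vocab); }
};
```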
2 changes: 2 additions & 0 deletions src/llama_server_context.h
@@ -111,6 +111,8 @@ struct LlamaServerContext {
llama_model* model = nullptr;
llama_context* ctx = nullptr;

const llama_vocab * vocab = nullptr;

clip_ctx* clp_ctx = nullptr;

common_params params;
