Update llama.cpp submodule to latest release b4516 (#376)
* Update submodule to latest release b4516

* fix: patch

* fix: API

---------

Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Co-authored-by: vansangpfiev <[email protected]>
3 people authored Jan 21, 2025
1 parent fed4677 commit 56f9500
Showing 5 changed files with 68 additions and 34 deletions.
2 changes: 1 addition & 1 deletion llama.cpp
73 changes: 51 additions & 22 deletions patches/0001-Add-API-query-buffer-size.patch
@@ -4,55 +4,84 @@ Date: Mon, 30 Sep 2024 15:51:16 +0700
Subject: [PATCH] Add API query buffer size

---
include/llama.h | 3 +++
src/llama.cpp | 20 ++++++++++++++++++++
2 files changed, 23 insertions(+)

diff --git a/include/llama.h b/include/llama.h
index 7cae1bbe..fdcbf949 100644
index 298b8d1b..0011dd8e 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -447,6 +447,9 @@ extern "C" {
@@ -468,6 +468,8 @@ extern "C" {
DEPRECATED(LLAMA_API int32_t llama_n_vocab (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");

LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API const struct llama_model * llama_get_model (const struct llama_context * ctx);
+ LLAMA_API size_t llama_get_cpu_buffer(const struct llama_model * model);
+ LLAMA_API size_t llama_get_other_buffer(const struct llama_model * model);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);

+ LLAMA_API size_t const llama_get_cpu_buffer(const struct llama_model * model);
+ LLAMA_API size_t const llama_get_other_buffer(const struct llama_model * model);
+
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 38a55fb2..80b3532e 100644
index 671d2a81..2d802349 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -602,6 +602,26 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
@@ -606,6 +606,14 @@ const struct llama_model * llama_get_model(const struct llama_context * ctx) {
return &ctx->model;
}

+const size_t llama_get_cpu_buffer(const struct llama_model * model) {
+size_t llama_get_cpu_buffer(const struct llama_model * model) {
+ return model->llama_get_cpu_buffer();
+}
+
+size_t llama_get_other_buffer(const struct llama_model * model) {
+ return model->llama_get_other_buffer();
+}
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
return ctx->cparams.pooling_type;
}
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 590386e6..e7ead0fb 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3750,6 +3750,26 @@ const struct ggml_tensor * llama_model::get_tensor(const char * name) const {
return it->second;
}

+size_t llama_model::llama_get_cpu_buffer() const {
+ size_t buffer{0};
+ for (const auto& buf : model->bufs) {
+ for (const auto& buf : pimpl->bufs) {
+ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") == 0) {
+ buffer += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+ return buffer;
+}
+
+const size_t llama_get_other_buffer(const struct llama_model * model) {
+size_t llama_model::llama_get_other_buffer() const {
+ size_t buffer{0};
+ for (const auto& buf : model->bufs) {
+ for (const auto& buf : pimpl->bufs) {
+ if (strcmp(ggml_backend_buffer_name(buf.get()), "CPU") != 0) {
+ buffer += ggml_backend_buffer_get_size(buf.get());
+ }
+ }
+ return buffer;
+}
+
enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx) {
return ctx->cparams.pooling_type;
}
//
// interface implementation
//
diff --git a/src/llama-model.h b/src/llama-model.h
index a7c30444..e04233ad 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -362,6 +362,10 @@ struct llama_model {

const struct ggml_tensor * get_tensor(const char * name) const;

+ size_t llama_get_cpu_buffer() const;
+
+ size_t llama_get_other_buffer() const;
+
private:
struct impl;
std::unique_ptr<impl> pimpl;
--
2.39.5 (Apple Git-154)
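
For context, a hedged usage sketch (not part of the commit) of the two accessors this patch exposes: they report how many bytes of the loaded model sit in "CPU"-named backend buffers versus all other backend buffers (e.g. GPU buffers when layers are offloaded). The loader and teardown calls and the model path below are assumptions about the llama.h shipped with this release, not something the diff itself shows.

```cpp
// Hedged sketch: query the buffer-size API added by
// patches/0001-Add-API-query-buffer-size.patch after loading a model.
// llama_model_load_from_file / llama_model_free are assumed to be the loader
// entry points in this llama.cpp release; the GGUF path is a placeholder.
#include <cstdio>

#include "llama.h"

int main(int argc, char** argv) {
    if (argc < 2) {
        std::fprintf(stderr, "usage: %s <model.gguf>\n", argv[0]);
        return 1;
    }

    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model* model = llama_model_load_from_file(argv[1], mparams);
    if (model == nullptr) {
        std::fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Accessors added by the patch: bytes held in "CPU" backend buffers vs.
    // everything else, matching the strcmp-based split inside the patch.
    std::printf("CPU buffers:   %zu bytes\n", llama_get_cpu_buffer(model));
    std::printf("other buffers: %zu bytes\n", llama_get_other_buffer(model));

    llama_model_free(model);
    llama_backend_free();
    return 0;
}
```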

4 changes: 3 additions & 1 deletion src/llama_engine.cc
@@ -652,7 +652,9 @@ bool LlamaEngine::LoadModelImpl(std::shared_ptr<Json::Value> json_body) {
}
params.n_predict = json_body->get("n_predict", -1).asInt();
params.prompt = json_body->get("prompt", "").asString();
params.conversation = json_body->get("conversation", false).asBool();
params.conversation_mode =
(common_conversation_mode)json_body->get("conversation", false)
.asBool();
params.special = json_body->get("special", false).asBool();

server_map_[model_id].caching_enabled =
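The new code folds the boolean `conversation` field into `common_conversation_mode` with a cast. A hedged sketch of a more explicit mapping, assuming llama.cpp's common.h defines the DISABLED/ENABLED enumerators with values 0 and 1 (which is what the bool-to-enum cast relies on):

```cpp
// Hedged sketch (not from the commit): spell out the bool -> enum mapping that
// the cast in LoadModelImpl depends on. Assumes common.h provides
// COMMON_CONVERSATION_MODE_DISABLED == 0 and COMMON_CONVERSATION_MODE_ENABLED == 1.
#include "common.h"

static common_conversation_mode to_conversation_mode(bool enabled) {
    return enabled ? COMMON_CONVERSATION_MODE_ENABLED
                   : COMMON_CONVERSATION_MODE_DISABLED;
}
```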
21 changes: 11 additions & 10 deletions src/llama_server_context.cc
@@ -232,10 +232,11 @@ bool LlamaServerContext::LoadModel(const common_params& params_) {
LOG_ERROR_LLAMA("Unable to get llama.cpp context", {});
return false;
}
vocab = llama_model_get_vocab(model);
n_ctx = llama_n_ctx(ctx);

add_bos_token = llama_add_bos_token(model);
has_eos_token = !llama_add_eos_token(model);
add_bos_token = llama_add_bos_token(vocab);
has_eos_token = !llama_add_eos_token(vocab);

return true;
}
@@ -508,12 +509,12 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
slot->sparams.logit_bias.clear();

if (json_value(data, "ignore_eos", false) && has_eos_token) {
slot->sparams.logit_bias.push_back({llama_token_eos(model), -INFINITY});
slot->sparams.logit_bias.push_back({llama_vocab_eos(vocab), -INFINITY});
}

const auto& logit_bias = data.find("logit_bias");
if (logit_bias != data.end() && logit_bias->is_array()) {
const int n_vocab = llama_n_vocab(model);
const int n_vocab = llama_vocab_n_tokens(vocab);
for (const auto& el : *logit_bias) {
// TODO: we may want to throw errors here, in case "el" is incorrect
if (el.is_array() && el.size() == 2) {
@@ -532,7 +533,7 @@ bool LlamaServerContext::LaunchSlotWithData(LlamaClientSlot*& slot, json data) {
slot->sparams.logit_bias.push_back({tok, bias});
}
} else if (el[0].is_string()) {
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(vocab, el[0].get<std::string>(), false);
for (auto tok : toks) {
slot->sparams.logit_bias.push_back({tok, bias});
}
@@ -788,7 +789,7 @@ bool LlamaServerContext::ProcessToken(CompletionTokenOutput& result,
slot.has_next_token = false;
}

if (llama_token_is_eog(model, result.tok)) {
if (llama_vocab_is_eog(vocab, result.tok)) {
slot.stopped_eos = true;
slot.has_next_token = false;
LOG_VERBOSE("eos token found", {});
@@ -1397,14 +1398,14 @@ bool LlamaServerContext::UpdateSlots() {
}

prefix_tokens.insert(prefix_tokens.begin(),
llama_token_prefix(model));
llama_vocab_fim_pre(vocab));
prefix_tokens.insert(prefix_tokens.begin(),
llama_token_bos(model)); // always add BOS
llama_vocab_bos(vocab)); // always add BOS
prefix_tokens.insert(prefix_tokens.end(),
llama_token_suffix(model));
llama_vocab_fim_suf(vocab));
prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(),
suffix_tokens.end());
prefix_tokens.push_back(llama_token_middle(model));
prefix_tokens.push_back(llama_vocab_fim_mid(vocab));
prompt_tokens = prefix_tokens;
} else {
prompt_tokens =
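Taken together, these hunks migrate the server context from model-based token helpers to the vocab-based API: the vocab handle is fetched once in LoadModel and reused everywhere else. A minimal sketch of that pattern, using only the calls that appear in the diff above:

```cpp
// Minimal sketch (not from the commit): cache the vocab handle once, then
// route token queries through it, mirroring the replacements made above.
#include "llama.h"

struct TokenHelpers {
    const llama_vocab* vocab = nullptr;

    explicit TokenHelpers(const llama_model* model)
        : vocab(llama_model_get_vocab(model)) {}

    // was: llama_token_is_eog(model, tok)
    bool is_end_of_generation(llama_token tok) const {
        return llama_vocab_is_eog(vocab, tok);
    }

    // was: llama_token_bos(model) / llama_token_eos(model)
    llama_token bos() const { return llama_vocab_bos(vocab); }
    llama_token eos() const { return llama_vocab_eos(vocab); }

    // was: llama_n_vocab(model)
    int32_t n_tokens() const { return llama_vocab_n_tokens(vocab); }
};
```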
2 changes: 2 additions & 0 deletions src/llama_server_context.h
@@ -111,6 +111,8 @@ struct LlamaServerContext {
llama_model* model = nullptr;
llama_context* ctx = nullptr;

const llama_vocab * vocab = nullptr;

clip_ctx* clp_ctx = nullptr;

common_params params;
