feat(http-api-bindings): adapt llama.cpp embedding kind to latest format and add kind for the legacy type (#3828)

* feat(http-api-bindings): adapt llama.cpp embedding kind to latest format and add kind for the legacy type

Signed-off-by: Wei Zhang <[email protected]>

* chore: use before_b4356 instead of legacy

Signed-off-by: Wei Zhang <[email protected]>

* doc(models-http-api): introduce llama.cpp/before_b4356_embedding kind

Signed-off-by: Wei Zhang <[email protected]>

---------

Signed-off-by: Wei Zhang <[email protected]>
zwpaper authored Feb 13, 2025
1 parent 22ef752 commit f8f0e00
Showing 4 changed files with 71 additions and 13 deletions.
50 changes: 39 additions & 11 deletions crates/http-api-bindings/src/embedding/llama.rs
@@ -11,18 +11,36 @@ use tracing::Instrument;
use crate::{create_reqwest_client, embedding_info_span};

pub struct LlamaCppEngine {
// Determine whether to use the legacy endpoint and response format.
// Llama.cpp has updated the endpoint from `/embedding` to `/embeddings`,
// and wrapped both the response and embedding in an array from b4357.
//
// Ref: https://github.com/ggerganov/llama.cpp/pull/10861
before_b4356: bool,

client: reqwest::Client,
api_endpoint: String,
api_key: Option<String>,
}

impl LlamaCppEngine {
pub fn create(api_endpoint: &str, api_key: Option<String>) -> Box<dyn Embedding> {
pub fn create(
api_endpoint: &str,
api_key: Option<String>,
before_b4356: bool,
) -> Box<dyn Embedding> {
let client = create_reqwest_client(api_endpoint);
let api_endpoint = if before_b4356 {
format!("{}/embedding", api_endpoint)
} else {
format!("{}/embeddings", api_endpoint)
};

Box::new(Self {
before_b4356,

client,
api_endpoint: format!("{}/embedding", api_endpoint),
api_endpoint,
api_key,
})
}
@@ -38,6 +56,11 @@ struct EmbeddingResponse {
embedding: Vec<Vec<f32>>,
}

#[derive(Deserialize)]
struct EmbeddingLegacyResponse {
embedding: Vec<f32>,
}

#[async_trait]
impl Embedding for LlamaCppEngine {
async fn embed(&self, prompt: &str) -> anyhow::Result<Vec<f32>> {
@@ -84,14 +107,19 @@ impl Embedding for LlamaCppEngine {
));
}

let response = response.json::<Vec<EmbeddingResponse>>().await?;
Ok(response
.first()
.ok_or_else(|| anyhow!("Error from server: no embedding found"))?
.embedding
.first()
.ok_or_else(|| anyhow!("Error from server: no embedding found"))?
.clone())
if self.before_b4356 {
let response = response.json::<EmbeddingLegacyResponse>().await?;
Ok(response.embedding)
} else {
let response = response.json::<Vec<EmbeddingResponse>>().await?;
Ok(response
.first()
.ok_or_else(|| anyhow!("Error from server: no embedding found"))?
.embedding
.first()
.ok_or_else(|| anyhow!("Error from server: no embedding found"))?
.clone())
}
}
}
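
For reference, the snippet below is a minimal, standalone sketch (not part of this commit) illustrating the two response shapes the structs above are written to deserialize. The JSON bodies are assumptions inferred from the struct definitions and the code comment, not captured llama.cpp output.

```rust
// Hypothetical, self-contained illustration of the two payload shapes.
// The JSON literals are assumed from the struct definitions, not real server output.
use serde::Deserialize;

#[derive(Deserialize)]
struct EmbeddingResponse {
    embedding: Vec<Vec<f32>>,
}

#[derive(Deserialize)]
struct EmbeddingLegacyResponse {
    embedding: Vec<f32>,
}

fn main() -> anyhow::Result<()> {
    // b4357 and later: `/embeddings` returns an array of objects,
    // each wrapping its embedding in an outer array.
    let new_body = r#"[{"embedding": [[1.0, 2.0, 3.0]]}]"#;
    let parsed: Vec<EmbeddingResponse> = serde_json::from_str(new_body)?;
    assert_eq!(parsed[0].embedding[0], vec![1.0, 2.0, 3.0]);

    // b4356 and earlier: `/embedding` returns a single object with a flat vector.
    let legacy_body = r#"{"embedding": [1.0, 2.0, 3.0]}"#;
    let legacy: EmbeddingLegacyResponse = serde_json::from_str(legacy_body)?;
    assert_eq!(legacy.embedding, vec![1.0, 2.0, 3.0]);

    Ok(())
}
```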

@@ -105,7 +133,7 @@ mod tests {
#[tokio::test]
#[ignore]
async fn test_embedding() {
let engine = LlamaCppEngine::create("http://localhost:8000", None);
let engine = LlamaCppEngine::create("http://localhost:8000", None, false);
let embedding = engine.embed("hello").await.unwrap();
assert_eq!(embedding.len(), 768);
}
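
A companion test for the pre-b4356 path could look like the sketch below; it is hypothetical (not part of this commit) and assumes a legacy llama.cpp server reachable on the same port serving a 768-dimensional embedding model.

```rust
#[tokio::test]
#[ignore]
async fn test_embedding_before_b4356() {
    // Hypothetical test: targets a llama.cpp server older than b4357,
    // which only exposes the legacy `/embedding` endpoint.
    let engine = LlamaCppEngine::create("http://localhost:8000", None, true);
    let embedding = engine.embed("hello").await.unwrap();
    assert_eq!(embedding.len(), 768);
}
```
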
9 changes: 9 additions & 0 deletions crates/http-api-bindings/src/embedding/mod.rs
@@ -21,6 +21,15 @@ pub async fn create(config: &HttpModelConfig) -> Arc<dyn Embedding> {
.as_deref()
.expect("api_endpoint is required"),
config.api_key.clone(),
false,
),
"llama.cpp/before_b4356_embedding" => LlamaCppEngine::create(
config
.api_endpoint
.as_deref()
.expect("api_endpoint is required"),
config.api_key.clone(),
true,
),
"openai/embedding" => OpenAIEmbeddingEngine::create(
config
17 changes: 17 additions & 0 deletions website/docs/references/models-http-api/llama.cpp.md
@@ -1,3 +1,5 @@
import Collapse from '@site/src/components/Collapse';

# llama.cpp

[llama.cpp](https://github.com/ggerganov/llama.cpp/blob/master/examples/server/README.md#api-endpoints) is a popular C++ library for serving gguf-based models. It provides a server implementation that supports completion, chat, and embedding functionalities through HTTP APIs.
@@ -27,8 +29,23 @@ prompt_template = "<PRE> {prefix} <SUF>{suffix} <MID>" # Example prompt template

llama.cpp provides embedding functionality through its HTTP API.

The llama.cpp embedding API endpoint and response format changed after version `b4356` (starting with `b4357`).
Therefore, we provide two different kinds to accommodate the two variants of the llama.cpp embedding interface.

You can refer to the configuration as follows:

```toml title="~/.tabby/config.toml"
[model.embedding.http]
kind = "llama.cpp/embedding"
api_endpoint = "http://localhost:8888"
```

<Collapse title="For versions b4356 and earlier">

```toml title="~/.tabby/config.toml"
[model.embedding.http]
kind = "llama.cpp/before_b4356_embedding"
api_endpoint = "http://localhost:8888"
```

</Collapse>
8 changes: 6 additions & 2 deletions website/docs/references/models-http-api/llamafile.md
@@ -31,11 +31,15 @@ prompt_template = "<|fim_prefix|>{prefix}<|fim_suffix|>{suffix}<|fim_middle|>" #

## Embeddings model

llamafile provides embedding functionality through llama.cpp's API interface. Note that the endpoint URL should NOT include the `v1` suffix.
llamafile provides embedding functionality through llama.cpp's API interface,
but it uses that interface as it was defined before version b4356.
Therefore, the kind `llama.cpp/before_b4356_embedding` should be used.

Note that the endpoint URL should NOT include the `v1` suffix.

```toml title="~/.tabby/config.toml"
[model.embedding.http]
kind = "llama.cpp/embedding"
kind = "llama.cpp/before_b4356_embedding"
model_name = "your_model"
api_endpoint = "http://localhost:8082" # DO NOT append the `v1` suffix
api_key = ""
