[Example] Mllama: basic example with llama-3.2-vision. #169

Draft: wants to merge 1 commit into `master`
8 changes: 8 additions & 0 deletions wasmedge-ggml/mllama/Cargo.toml
@@ -0,0 +1,8 @@
[package]
name = "wasmedge-ggml-mllama"
version = "0.1.0"
edition = "2021"

[dependencies]
serde_json = "1.0"
wasmedge-wasi-nn = "0.8.0"
82 changes: 82 additions & 0 deletions wasmedge-ggml/mllama/README.md
@@ -0,0 +1,82 @@
# Mllama Example For WASI-NN with GGML Backend

> [!NOTE]
> Please refer to the [wasmedge-ggml/README.md](../README.md) for the general introduction and the setup of the WASI-NN plugin with GGML backend. This document will focus on the specific example of the Mllama model.

## Get Mllama Model

In this example, we are going to use the pre-converted [llama3.2-vision-11b](https://ollama.com/library/llama3.2-vision) model.

To download the mllama model, please install [Ollama](https://ollama.com/) first.

After installing Ollama, fetch the model with the following command:

```bash
ollama pull llama3.2-vision
```

The model files will be stored in the `~/.ollama/models/blobs` directory. (Taking `llama3.2-vision-11b` as an example.)

* Model: `sha256-11f274007f093fefeec994a5dbbb33d0733a4feb87f7ab66dcd7c1069fef0068`
* Projector: `sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f`
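
The execute step below refers to these blobs by bare file name, so copy (or symlink) them into the directory you will run `wasmedge` from. A minimal sketch, assuming the default Ollama blob location and the hashes listed above:

```bash
# Copy the model and the projector next to where you will run wasmedge.
cp ~/.ollama/models/blobs/sha256-11f274007f093fefeec994a5dbbb33d0733a4feb87f7ab66dcd7c1069fef0068 .
cp ~/.ollama/models/blobs/sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f .
```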

## Prepare the Image

Download the image for the Mllama model:

```bash
curl -LO https://llava-vl.github.io/static/images/monalisa.jpg
```

## Parameters

> [!NOTE]
> Please check the parameters section of [wasmedge-ggml/README.md](https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters) first.

For GPU offloading, please adjust the `n-gpu-layers` option to the number of layers you want to offload to the GPU.

```rust
options.insert("n-gpu-layers", Value::from(...));
```
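
As an alternative to editing the source, this example's `src/main.rs` also reads the optional `n_gpu_layers` environment variable at startup, so you can change the offloading without rebuilding the wasm. A minimal sketch (the value `33` is only an illustration; pick a number that fits your GPU memory):

```bash
# Picked up by get_options_from_env() in src/main.rs.
export n_gpu_layers=33
```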

## Build (WIP)

> Note: Currently, users need to build WasmEdge from source.

1. Fetch WasmEdge and check out the `wasi_nn_ggml_mllama` branch.

```bash
git clone https://github.com/WasmEdge/WasmEdge.git
cd WasmEdge
git checkout wasi_nn_ggml_mllama
```

2. Build with the WASI-NN GGML backend.

```bash
cd <path/to/your/wasmedge/source/folder>
cmake -GNinja -Bbuild -DCMAKE_BUILD_TYPE=Release -DWASMEDGE_PLUGIN_WASI_NN_BACKEND="GGML"
cmake --build build
```
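
3. (Optional) Rebuild the example wasm module. A prebuilt `wasmedge-ggml-mllama.wasm` is included in this directory; the sketch below assumes the Rust `wasm32-wasip1` target is installed (older toolchains name it `wasm32-wasi`).

```bash
cd <path/to/WasmEdge-WASINN-examples>/wasmedge-ggml/mllama
rustup target add wasm32-wasip1
cargo build --target wasm32-wasip1 --release
cp target/wasm32-wasip1/release/wasmedge-ggml-mllama.wasm .
```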

## Execute (WIP)

Execute the WASM with `wasmedge`, using the named model feature to preload the large model:

> Note: Since WasmEdge was built from source, this example runs from the build folder.

```console
$ cd <path/to/your/wasmedge/source/folder>
$ cd build/tools/wasmedge
$ WASMEDGE_PLUGIN_PATH=../../plugins/wasi_nn ./wasmedge --dir .:. \
--env mllamaproj=sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f \
--nn-preload default:GGML:AUTO:sha256-11f274007f093fefeec994a5dbbb33d0733a4feb87f7ab66dcd7c1069fef0068 \
wasmedge-ggml-mllama.wasm default

USER:
please describe this image
IMAGE_PATH: (press enter if you don't want to add image)
monalisa.jpg
ASSISTANT:
The image is a painting of a woman with long dark hair and a slight smile, wearing a Renaissance-style dress. The painting is likely a self-portrait of Leonardo da Vinci, as it is similar in style to his famous works such as the Mona Lisa. The woman's face is serene and calm, with a subtle hint of a smile playing on her lips. Her eyes are cast downward, giving the impression that she is lost in thought. Her hair is dark and flowing, cascading down her back in loose waves. She wears a Renaissance-style dress that is loose-fitting and elegant, with a high neckline and long sleeves. The dress is a deep shade of blue or purple, which complements the woman's skin tone nicely. The background of the painting is a soft, muted color that blends seamlessly into the woman's dress. There are no distinct features or objects in the background, which helps to focus the viewer's attention on the woman's face and figure. Overall, the painting is a beautiful example of Renaissance art, with its use of muted colors, elegant lines, and serene expression. It is likely that the woman depicted in the painting is a member of the artist's family or a wealthy patron, given the level of detail and craftsmanship that went into creating the piece.<|eot_id|>
```
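
`main.rs` also accepts a prompt as an extra argument after the model name (mainly for the CI workflow). In that mode the program runs once, prints the response, and exits, and no image is attached. A sketch reusing the setup above:

```bash
WASMEDGE_PLUGIN_PATH=../../plugins/wasi_nn ./wasmedge --dir .:. \
  --env mllamaproj=sha256-ece5e659647a20a5c28ab9eea1c12a1ad430bc0f2a27021d00ad103b3bf5206f \
  --nn-preload default:GGML:AUTO:sha256-11f274007f093fefeec994a5dbbb33d0733a4feb87f7ab66dcd7c1069fef0068 \
  wasmedge-ggml-mllama.wasm default 'Describe the Mona Lisa in one sentence.'
```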
182 changes: 182 additions & 0 deletions wasmedge-ggml/mllama/src/main.rs
@@ -0,0 +1,182 @@
use serde_json::Value;
use std::collections::HashMap;
use std::env;
use std::io;
use wasmedge_wasi_nn::{
self, BackendError, Error, ExecutionTarget, GraphBuilder, GraphEncoding, GraphExecutionContext,
TensorType,
};

fn read_input() -> String {
loop {
let mut answer = String::new();
io::stdin()
.read_line(&mut answer)
.expect("Failed to read line");
if !answer.is_empty() && answer != "\n" && answer != "\r\n" {
return answer.trim().to_string();
}
}
}

fn read_image_path() -> String {
let mut answer = String::new();
io::stdin()
.read_line(&mut answer)
.expect("");
return answer.trim().to_string();
}

fn get_options_from_env() -> HashMap<&'static str, Value> {
let mut options = HashMap::new();

// Required parameters for mllama
if let Ok(val) = env::var("mllamaproj") {
options.insert("mllamaproj", Value::from(val.as_str()));
} else {
eprintln!("Failed to get mllamaproj model.");
std::process::exit(1);
}

// Optional parameters
if let Ok(val) = env::var("enable_log") {
options.insert("enable-log", serde_json::from_str(val.as_str()).unwrap());
} else {
options.insert("enable-log", Value::from(false));
}
if let Ok(val) = env::var("ctx_size") {
options.insert("ctx-size", serde_json::from_str(val.as_str()).unwrap());
} else {
options.insert("ctx-size", Value::from(2048));
}
if let Ok(val) = env::var("n_gpu_layers") {
options.insert("n-gpu-layers", serde_json::from_str(val.as_str()).unwrap());
} else {
options.insert("n-gpu-layers", Value::from(0));
}
options
}

fn set_data_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
context.set_input(0, TensorType::U8, &[1], &data)
}

fn set_metadata_to_context(context: &mut GraphExecutionContext, data: Vec<u8>) -> Result<(), Error> {
context.set_input(1, TensorType::U8, &[1], &data)
}

fn get_data_from_context(context: &GraphExecutionContext, index: usize) -> String {
// Reserve space for 4096 tokens with an average token length of 6 bytes.
const MAX_OUTPUT_BUFFER_SIZE: usize = 4096 * 6;
let mut output_buffer = vec![0u8; MAX_OUTPUT_BUFFER_SIZE];
let mut output_size = context
.get_output(index, &mut output_buffer)
.expect("Failed to get output");
output_size = std::cmp::min(MAX_OUTPUT_BUFFER_SIZE, output_size);

String::from_utf8_lossy(&output_buffer[..output_size]).to_string()
}

fn get_output_from_context(context: &GraphExecutionContext) -> String {
get_data_from_context(context, 0)
}

fn main() {
let args: Vec<String> = env::args().collect();
let model_name: &str = &args[1];

// Set options for the graph. Check our README for more details:
// https://github.com/second-state/WasmEdge-WASINN-examples/tree/master/wasmedge-ggml#parameters
let mut options = get_options_from_env();
// Set the stream-stdout option to true to make the response more interactive.
options.insert("stream-stdout", serde_json::from_str("true").unwrap());
// You could also set the options manually like this:
// options.insert("enable-log", Value::from(false));

// Create graph and initialize context.
let graph = GraphBuilder::new(GraphEncoding::Ggml, ExecutionTarget::AUTO)
.config(serde_json::to_string(&options).expect("Failed to serialize options"))
.build_from_cache(model_name)
.expect("Failed to build graph");
let mut context = graph
.init_execution_context()
.expect("Failed to init context");

// If there is a third argument, use it as the prompt and enter non-interactive mode.
// This is mainly for the CI workflow.
if args.len() >= 3 {
let prompt = &args[2];
println!("Prompt:\n{}", prompt);
let tensor_data = prompt.as_bytes().to_vec();
context
.set_input(0, TensorType::U8, &[1], &tensor_data)
.expect("Failed to set input");
println!("Response:");
context.compute().expect("Failed to compute");
let output = get_output_from_context(&context);
if let Some(true) = options["stream-stdout"].as_bool() {
println!();
} else {
println!("{}", output.trim());
}
std::process::exit(0);
}

let image_placeholder = "<|image|>";
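// "<|image|>" marks where the image is inserted in the prompt built below.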

loop {
println!("USER:");
let input = read_input();

println!("IMAGE_PATH: (press enter if you don't want to add image)");
let image_path = read_image_path();

if !image_path.is_empty() {
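// Pass the image path to the backend as JSON metadata ({"image": "<path>"}) on input index 1.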
set_metadata_to_context(
&mut context,
format!("{{\"image\": \"{}\"}}", image_path).as_bytes().to_vec(),
).expect("Failed to set metadata");
}

let prompt: String;
// mllama chat format is "<|start_header_id|>user<|end_header_id|>{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
if !image_path.is_empty() {
prompt = format!(
"<|start_header_id|>user<|end_header_id|>{} {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
image_placeholder, input
);
} else {
prompt = format!(
"<|start_header_id|>user<|end_header_id|>{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>",
input);
}

// Set prompt to the input tensor.
set_data_to_context(&mut context, prompt.as_bytes().to_vec())
.expect("Failed to set input");

// Execute the inference.
println!("ASSISTANT:");
match context.compute() {
Ok(_) => (),
Err(Error::BackendError(BackendError::ContextFull)) => {
println!("\n[INFO] Context full, we'll reset the context and continue.");
}
Err(Error::BackendError(BackendError::PromptTooLong)) => {
println!("\n[INFO] Prompt too long, we'll reset the context and continue.");
}
Err(err) => {
println!("\n[ERROR] {}", err);
}
}

// Retrieve the output.
let output = get_output_from_context(&context);
if let Some(true) = options["stream-stdout"].as_bool() {
println!();
} else {
println!("{}", output.trim());
}
}
}
Binary file added wasmedge-ggml/mllama/wasmedge-ggml-mllama.wasm
Binary file not shown.