Commit b45412b: Resize to 768x1024 to match upstream VLM capabilities

awwaiid committed Nov 24, 2024
1 parent f7ee1d8

Showing 4 changed files with 39 additions and 24 deletions.
19 changes: 8 additions & 11 deletions README.md
@@ -135,17 +135,14 @@ scp target/armv7-unknown-linux-gnueabihf/release/ghostwriter remarkable:

## Scratch

-I got evtest by getting the ipkg from trotek and untaring it a few levels and then scping it over. Surprisingly it works!
-
-Now I can see that /dev/input/event1 is pen input and /dev/input/event2 is touch input
-
-You can detect distance. The value gets smaller as you get close to the screen with the tip of the pen or eraser
-
-Event: time 1728139017.789746, type 3 (EV_ABS), code 25 (ABS_DISTANCE), value 105
-
-EV_KEY 320 (BTN_TOOL_PEN) is for pen presence/range
-EV_KEY 330 (BTN_TOUCH) is for actual drawing
-
-* https://github.com/rmkit-dev/rmkit/tree/master is great to learn from
-* https://github.com/rmkit-dev/rmkit/blob/master/src/lamp/main.cpy -- they've already worked out some other pen-input-drawing! See if we can translate or learn about a reliable way to draw
-Resize from 1872x1404 to 1268x951 px (I think claude does it for us already)
-OR maybe 768x1024 is better. Same ratio, but "standard" XGA
+```
+# Run an evaluation
+./target/release/ghostwriter --input-png evaluations/x_in_box/input.png --output-file tmp/result.out --model-output-file tmp/result.json --save-bitmap tmp/result.png --no-draw --no-draw-progress --no-loop claude-assist
+
+# Layer the input and output
+magick \( evaluations/x_in_box/input.png -colorspace RGB \) \( tmp/result.png -type truecolormatte -transparent white -fill red -colorize 100 \) -compose Over -composite tmp/merged-output.png
+```

16 changes: 8 additions & 8 deletions src/main.rs
@@ -26,8 +26,8 @@ use crate::touch::Touch;
mod util;
use crate::util::{svg_to_bitmap, write_bitmap_to_file};

-const REMARKABLE_WIDTH: u32 = 1404;
-const REMARKABLE_HEIGHT: u32 = 1872;
+const REMARKABLE_WIDTH: u32 = 768;
+const REMARKABLE_HEIGHT: u32 = 1024;

#[derive(Parser)]
#[command(author, version, about, long_about = None)]
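Worth noting: 1404x1872 and 768x1024 are both 3:4 portrait, so one uniform scale factor maps between the old and new coordinate spaces. A quick arithmetic check (my sketch, not code from this commit):

```rust
fn main() {
    // Cross-multiplying avoids floating point:
    // 1404/1872 == 768/1024 exactly when 1404*1024 == 1872*768.
    assert_eq!(1404u32 * 1024, 1872u32 * 768); // both sides are 1_437_696
    // Uniform scale factor from VLM space back to the framebuffer:
    println!("scale: {}", 1872.0 / 1024.0); // 1.828125, same as 1404/768
}
```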
@@ -218,7 +218,7 @@ fn ghostwriter(args: &Args) -> Result<()> {
"content": [
{
"type": "text",
"text": "You are a helpful assistant. You live inside of a remarkable2 notepad, which has a 1404x1872 sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text."
"text": "You are a helpful assistant. You live inside of a remarkable2 notepad, which has a 768x1024 px sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text."
},

{
@@ -367,11 +367,11 @@ fn claude_assist(args: &Args) -> Result<()> {
"properties": {
"input_description": {
"type": "string",
"description": "Description of what was detected in the input image"
"description": "Description of what was detected in the input image. Include the x,y,w,h bounding box coordinates of interesting regions."
},
"output_description": {
"type": "string",
"description": "Description of what will be output"
"description": "Description of what will be output. Include x,y,w,h bounding box coordinates of specific regions."
},
"text": {
"type": "string",
@@ -389,11 +389,11 @@ fn claude_assist(args: &Args) -> Result<()> {
"properties": {
"input_description": {
"type": "string",
"description": "Description of what was detected in the input image"
"description": "Description of what was detected in the input image. Include the exact pixel x, y, width, height bounding box coordinates of everything."
},
"output_description": {
"type": "string",
"description": "Description of what will be drawn"
"description": "Description of what will be drawn. Include the exact pixel x, y, width, height bounding box coordinates of what you want to draw."
},
"svg": {
"type": "string",
@@ -414,7 +414,7 @@ fn claude_assist(args: &Args) -> Result<()> {
"content": [
{
"type": "text",
"text": "You are a helpful assistant. You live inside of a remarkable2 notepad, which has a 1404x1872 sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text."
"text": "You are a helpful assistant. You live inside of a remarkable2 notepad, which has a 768x1024 px sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text."
},
{
"type": "image",
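To make the schema concrete, here is a hypothetical draw_svg tool result shaped by the descriptions above (all values invented for illustration, matching the JSON style of the surrounding code):

```rust
use serde_json::json;

fn main() {
    // Hypothetical draw_svg tool-call result; coordinates and content
    // are invented for illustration, not taken from the commit.
    let example = json!({
        "input_description": "Handwritten note 'draw a cat' at roughly x=80, y=120, w=420, h=90",
        "output_description": "Simple cat sketch drawn at x=100, y=300, w=300, h=250",
        "svg": "<svg xmlns=\"http://www.w3.org/2000/svg\" width=\"768\" height=\"1024\"><circle cx=\"250\" cy=\"425\" r=\"100\" fill=\"none\" stroke=\"black\"/></svg>"
    });
    println!("{}", serde_json::to_string_pretty(&example).unwrap());
}
```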
4 changes: 2 additions & 2 deletions src/pen.rs
@@ -6,8 +6,8 @@ use std::time::Duration;
const INPUT_WIDTH: usize = 15725;
const INPUT_HEIGHT: usize = 20966;

-const REMARKABLE_WIDTH: u32 = 1404;
-const REMARKABLE_HEIGHT: u32 = 1872;
+const REMARKABLE_WIDTH: u32 = 768;
+const REMARKABLE_HEIGHT: u32 = 1024;

pub struct Pen {
device: Option<Device>,
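In pen.rs the digitizer-space constants (INPUT_WIDTH/INPUT_HEIGHT, 15725x20966) are unchanged, so shrinking REMARKABLE_* to 768x1024 simply increases the multiplier when strokes are mapped from model coordinates to input-event coordinates. A sketch of that mapping, assuming plain linear scaling (the actual conversion code isn't part of this hunk):

```rust
const INPUT_WIDTH: usize = 15725;
const INPUT_HEIGHT: usize = 20966;
const REMARKABLE_WIDTH: u32 = 768;
const REMARKABLE_HEIGHT: u32 = 1024;

// Hypothetical linear map from 768x1024 model space to digitizer space.
fn screen_to_input(x: u32, y: u32) -> (usize, usize) {
    (
        x as usize * INPUT_WIDTH / REMARKABLE_WIDTH as usize,
        y as usize * INPUT_HEIGHT / REMARKABLE_HEIGHT as usize,
    )
}

fn main() {
    // The screen midpoint lands at (within a pixel of) the digitizer midpoint.
    assert_eq!(screen_to_input(384, 512), (7862, 10483));
}
```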
24 changes: 21 additions & 3 deletions src/screenshot.rs
@@ -15,6 +15,9 @@ const WINDOW_BYTES: usize = WIDTH * HEIGHT * BYTES_PER_PIXEL;
const REMARKABLE_WIDTH: u32 = 1404;
const REMARKABLE_HEIGHT: u32 = 1872;

+const OUTPUT_WIDTH: u32 = 768;
+const OUTPUT_HEIGHT: u32 = 1024;
+
pub struct Screenshot {
data: Vec<u8>,
}
@@ -79,9 +82,24 @@ impl Screenshot {
     }
 
     fn process_image(data: Vec<u8>) -> Result<Vec<u8>> {
-        // Implement image processing here (transpose, color correction, etc.)
-        // For now, we'll just encode the raw data to PNG
-        Ok(Self::encode_png(&data)?)
+        // Encode the raw data to PNG
+        let png_data = Self::encode_png(&data)?;
+
+        // Resize the PNG to OUTPUT_WIDTH x OUTPUT_HEIGHT
+        let img = image::load_from_memory(&png_data)?;
+        let resized_img = img.resize(OUTPUT_WIDTH, OUTPUT_HEIGHT, image::imageops::FilterType::Lanczos3);
+
+        // Encode the resized image back to PNG
+        let mut resized_png_data = Vec::new();
+        let encoder = image::codecs::png::PngEncoder::new(&mut resized_png_data);
+        encoder.encode(
+            resized_img.as_luma8().unwrap().as_raw(),
+            OUTPUT_WIDTH,
+            OUTPUT_HEIGHT,
+            image::ColorType::L8,
+        )?;
+
+        Ok(resized_png_data)
     }
 
     fn encode_png(raw_data: &[u8]) -> Result<Vec<u8>> {

GitHub Actions / Remarkable Build warning on line 95 in src/screenshot.rs: use of deprecated method `image::codecs::png::PngEncoder::<W>::encode`: Use `PngEncoder::write_image` instead. Beware that `write_image` has a different endianness convention.
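The CI warning flags the deprecated `encode`; a minimal sketch of the same step using `write_image` from the `ImageEncoder` trait (assuming an image 0.24.x-style API; the endianness caveat in the deprecation message only affects 16-bit sample formats, so single-byte L8 data is unaffected):

```rust
use image::{codecs::png::PngEncoder, ColorType, ImageEncoder};

// Sketch: encode an 8-bit grayscale buffer to PNG without the
// deprecated PngEncoder::encode. Argument order matches the old call.
fn encode_l8_png(luma: &[u8], width: u32, height: u32) -> image::ImageResult<Vec<u8>> {
    let mut png = Vec::new();
    let encoder = PngEncoder::new(&mut png);
    encoder.write_image(luma, width, height, ColorType::L8)?;
    Ok(png)
}
```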
