Extract Anthropic as an engine and make tools into callbacks
awwaiid committed Dec 14, 2024
1 parent 250c9f7 commit 9952626
Showing 12 changed files with 354 additions and 363 deletions.
7 changes: 7 additions & 0 deletions Cargo.toml
@@ -19,3 +19,10 @@ dotenv = "0.15"
# opencv = "0.93.4"
imageproc = "0.25.0"

[lib]
name = "ghostwriter"
path = "src/lib.rs"

[[bin]]
name = "ghostwriter"
path = "src/main.rs"
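The `[lib]`/`[[bin]]` split above lets `main.rs` stay a thin wrapper around the library crate. A minimal sketch of that layout, with a local module standing in for `src/lib.rs` and a hypothetical `run` entry point (the crate's real API may differ):

```rust
// Sketch only: `ghostwriter_lib` stands in for the library crate that
// `path = "src/lib.rs"` points at, and `run` is a hypothetical entry point.
mod ghostwriter_lib {
    pub fn run() -> Result<(), String> {
        // Real logic (screenshot, engine call, tool dispatch) would live here.
        Ok(())
    }
}

fn main() {
    // The binary only delegates to the library, so the logic stays testable.
    if let Err(e) = ghostwriter_lib::run() {
        eprintln!("error: {e}");
        std::process::exit(1);
    }
}
```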
42 changes: 42 additions & 0 deletions README.md
@@ -179,3 +179,45 @@ mv tmp/* evaluations/$evaluation_name
magick \( evaluations/$evaluation_name/input.png -colorspace RGB \) \( tmp/result.png -type truecolormatte -transparent white -fill red -colorize 100 \) -compose Over -composite tmp/merged-output.png
```

Prompt / Tool ideas:
* There are a few models for tools -- each tool can be reusable and generalized, or each tool could include things like extra inputs for chain-of-thought and hints for what goes into each parameter
* The prompts should be plain JSON or YAML and should be normalized across VLM/LLM models
* A general direction I'm thinking is to have top-level "modes" that each have a main prompt and a set of tools they can use
* But maybe there can also be a whole state-machine flow that they follow?
* So like ... a math-helper might have a different state-machine than a todo-helper
* The states would be start, intermediate, and terminal
* The terminal states should all have some output or effect; those are the ones that actually do something
* The start state is the initial prompt
* One intermediate state could be `thinking` where it can use the input of the tool as a place to write out thoughts, and the output of the tool is ignored
* But overall what we're leading to here is a system where the prompts are easy to write, easy to copy/paste, easy to maintain
* And then maybe we can have a set of evals or examples that are easy to use on top of a prompt mode
* Increasingly, the reMarkable2 case might HAPPEN to be a specific prompt we set up in this system...
* So the state machine could be:

```plantuml
[*] --> Screenshot
Screenshot --> OutputScreen
Screenshot --> OutputKeyboardText
```

```plantuml
[*] --> WaitForTouch
WaitForTouch --> Screenshot
Screenshot --> OutputScreen
Screenshot --> OutputKeyboardText
OutputScreen --> [*]
OutputKeyboardText --> [*]
```

```plantuml
[*] -> WaitForTouch
WaitForTouch --> Screenshot
Screenshot --> Thinking
Thinking -> Thinking
Thinking --> OutputScreen
Thinking --> OutputKeyboardText
OutputScreen --> [*]
OutputKeyboardText --> [*]
```
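The third diagram above can be sketched as a Rust enum with a transition function. The state names come from the diagrams; the tool-name dispatch is an assumption about how `Thinking` would decide where to go:

```rust
// Sketch of the third state machine above. States are from the diagram;
// the transition rules (especially the tool-name dispatch) are illustrative.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Mode {
    WaitForTouch,
    Screenshot,
    Thinking,
    OutputScreen,
    OutputKeyboardText,
    Done, // the [*] terminal state
}

fn next(state: Mode, tool_called: &str) -> Mode {
    match state {
        Mode::WaitForTouch => Mode::Screenshot,
        Mode::Screenshot => Mode::Thinking,
        // Thinking loops on itself until the model picks an output tool.
        Mode::Thinking => match tool_called {
            "draw_svg" => Mode::OutputScreen,
            "draw_text" => Mode::OutputKeyboardText,
            _ => Mode::Thinking,
        },
        Mode::OutputScreen | Mode::OutputKeyboardText | Mode::Done => Mode::Done,
    }
}

fn main() {
    // Walk one full pass through the machine, always ending in Done.
    let mut state = Mode::WaitForTouch;
    while state != Mode::Done {
        println!("{:?}", state);
        state = next(state, "draw_text");
    }
}
```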


77 changes: 0 additions & 77 deletions prompts/base.json

This file was deleted.

4 changes: 4 additions & 0 deletions prompts/general.json
@@ -0,0 +1,4 @@
{
"prompt": "You are a helpful assistant. You live inside of a remarkable2 notepad, which has a 768x1024 px sized screen which can only display grayscale. Your input is the current content of the screen, which may contain content written by the user or previously written by you (the assistant). Look at this content, interpret it, and respond to the content. The content will contain handwritten notes, diagrams, and maybe typewritten text. Respond by calling a tool. Call draw_text to output text which will be sent using simulated keyboard input. Call draw_svg to respond with an SVG drawing which will be drawn on top of the existing content. Try to place the output on the screen at coordinates that make sense. If you need to place text at a very specific location, you should output an SVG instead of keyboard text.",
"tools": ["draw_text", "draw_svg"]
}
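A mode like `prompts/general.json` pairs one main prompt with the set of tools it may call. A minimal sketch of that shape in Rust -- the struct and method names are illustrative, and in the real crate this would presumably be deserialized from the JSON file (e.g. with serde_json):

```rust
// Sketch of a prompt "mode": a main prompt plus its allowed tools.
// Names are illustrative, not the crate's actual API.
struct PromptMode {
    prompt: String,
    tools: Vec<String>,
}

impl PromptMode {
    // A tool call is only valid if the mode lists that tool.
    fn allows(&self, tool: &str) -> bool {
        self.tools.iter().any(|t| t == tool)
    }
}

fn main() {
    let general = PromptMode {
        prompt: "You are a helpful assistant...".to_string(),
        tools: vec!["draw_text".to_string(), "draw_svg".to_string()],
    };
    assert!(general.allows("draw_svg"));
    assert!(!general.allows("fetch_todo"));
    println!("mode allows {} tools", general.tools.len());
}
```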
77 changes: 0 additions & 77 deletions prompts/text.json

This file was deleted.

64 changes: 64 additions & 0 deletions prompts/tool_draw_svg.json
@@ -0,0 +1,64 @@
{
"name": "draw_svg",
"description": "Draw an SVG to the screen using simulated pen input. The input_description and output_description are used to build a plan for the actual output.",
"input_schema": {
"type": "object",
"properties": {
"input_description": {
"type": "string",
"description": "Description of what was detected in the input image. Include the exact pixel x, y, width, height bounding box coordinates of everything."
},
"input_features": {
"type": "array",
"description": "A list of exact bounding boxes for important features of the input",
"items": {
"type": "object",
"description": "A specific feature and bounding box",
"properties": {
"feature_description": {
"type": "string",
"description": "Description of the feature"
},
"top_left_x_px": {
"type": "integer",
"description": "The top-left x coordinate in px"
},
"top_left_y_px": {
"type": "integer",
"description": "The top-left y coordinate in px"
},
"bottom_right_x_px": {
"type": "integer",
"description": "The bottom-right x coordinate in px"
},
"bottom_right_y_px": {
"type": "integer",
"description": "The bottom-right y coordinate in px"
}
},
"required": [
"feature_description",
"top_left_x_px",
"top_left_y_px",
"bottom_right_x_px",
"bottom_right_y_px"
]
}
},
"output_description": {
"type": "string",
"description": "Description of what will be drawn. Include the exact pixel x, y, width, height bounding box coordinates of what you want to draw."
},
"svg": {
"type": "string",
"description": "SVG data to be rendered. This is drawn on top of the input image, and should be the same size as the input image (768x1024 px). The display can only show black and white. Try to place the output in an integrated position. Use the `Noto Sans` font-family when you are showing text. Do not use a style tag. Do not use any fill colors or gradients or transparency or shadows. Do include the xmlns in the main svg tag."
}
},
"required": [
"input_description",
"input_features",
"output_description",
"svg"
]
}
}
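The `input_features` part of this schema maps naturally onto plain Rust types. A sketch with illustrative struct names -- the field names mirror the schema above, and the real crate may deserialize these with serde instead:

```rust
// Sketch of one input_features entry as a plain Rust type.
// Field names follow the JSON schema; the struct itself is illustrative.
struct Feature {
    feature_description: String,
    top_left_x_px: i32,
    top_left_y_px: i32,
    bottom_right_x_px: i32,
    bottom_right_y_px: i32,
}

impl Feature {
    fn width_px(&self) -> i32 {
        self.bottom_right_x_px - self.top_left_x_px
    }
    fn height_px(&self) -> i32 {
        self.bottom_right_y_px - self.top_left_y_px
    }
}

fn main() {
    let f = Feature {
        feature_description: "handwritten title".to_string(),
        top_left_x_px: 100,
        top_left_y_px: 50,
        bottom_right_x_px: 500,
        bottom_right_y_px: 120,
    };
    println!("{}: {}x{} px", f.feature_description, f.width_px(), f.height_px());
}
```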
26 changes: 26 additions & 0 deletions prompts/tool_draw_text.json
@@ -0,0 +1,26 @@
{
"name": "draw_text",
"description": "Draw text to the screen using simulated keyboard input. The input_description and output_description are used to build a plan for the actual output.",
"input_schema": {
"type": "object",
"properties": {
"input_description": {
"type": "string",
"description": "Description of what was detected in the input image. Include the x,y,w,h bounding box coordinates of interesting regions."
},
"output_description": {
"type": "string",
"description": "Description of what will be output. Include x,y,w,h bounding box coordinates of specific regions."
},
"text": {
"type": "string",
"description": "Text to be written"
}
},
"required": [
"input_description",
"output_description",
"text"
]
}
}
6 changes: 6 additions & 0 deletions prompts/tool_fetch_todo.json
@@ -0,0 +1,6 @@
{
"name": "fetch_todo",
"description": "Use an API to fetch the current TODO list",
"external_command": "tools/fetch_todo.sh",
"next_action": "loop"
}
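A tool with an `external_command`, like `fetch_todo`, presumably shells out and captures stdout as the tool result, with `"next_action": "loop"` feeding that result back for another model turn. A minimal sketch -- the helper name and the `sh -c` invocation are assumptions:

```rust
// Sketch of executing an external_command tool: run the configured script
// and return its stdout as the tool result. Unix-only (`sh -c`), and the
// function name is illustrative.
use std::process::Command;

fn run_external_tool(cmd: &str) -> std::io::Result<String> {
    let output = Command::new("sh").arg("-c").arg(cmd).output()?;
    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}

fn main() -> std::io::Result<()> {
    // Stand-in command for tools/fetch_todo.sh; with next_action = "loop"
    // the result would be appended to the conversation for another turn.
    let todos = run_external_tool("echo '- [ ] example item'")?;
    println!("tool result: {todos}");
    Ok(())
}
```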
