Add GroundingDINO (#30)

jamjamjon · Aug 9, 2024 · b81b5e3 · b81b5e3
1 parent 53d14ee
commit b81b5e3
Show file tree

Hide file tree

Showing 24 changed files with 536 additions and 131 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "usls"
-version = "0.0.9"
+version = "0.0.10"
 edition = "2021"
 description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
 repository = "https://github.com/jamjamjon/usls"

diff --git a/README.md b/README.md
@@ -5,7 +5,7 @@
 
 
 
-A Rust library integrated with **ONNXRuntime**, providing a collection of **Computer Vison** and **Vision-Language** models including [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [RTDETR](https://arxiv.org/abs/2304.08069), [SAM](https://github.com/facebookresearch/segment-anything), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM), [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [DINOv2](https://github.com/facebookresearch/dinov2), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything) and others.
+A Rust library integrated with **ONNXRuntime**, providing a collection of **Computer Vision** and **Vision-Language** models including [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [RTDETR](https://arxiv.org/abs/2304.08069), [SAM](https://github.com/facebookresearch/segment-anything), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM), [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [DINOv2](https://github.com/facebookresearch/dinov2), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) and others.
 
 
 |                     Segment Anything                     |
@@ -55,6 +55,7 @@ A Rust library integrated with **ONNXRuntime**, providing a collection of **Comp
 |             [YOLOPv2](https://arxiv.org/abs/2208.11434)             |                                              Panoptic Driving Perception                                              |     [demo](examples/yolop)     |      ✅      |      ✅      |             ✅             |            ✅            |
 |    [Depth-Anything<br />(v1, v2)](https://github.com/LiheYoung/Depth-Anything)    |                                               Monocular Depth Estimation                                               | [demo](examples/depth-anything) |      ✅      |      ✅      |             ❌             |            ❌            |
 |              [MODNet](https://github.com/ZHKKKe/MODNet)              |                                                     Image Matting                                                     |     [demo](examples/modnet)     |      ✅      |      ✅      |             ✅             |            ✅            |
+|              [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO)              |            Open-Set Detection With Language            |     [demo](examples/grounding-dino)     |      ✅      |      ✅      |                          |                       |
 
 ## Installation
 

diff --git a/examples/blip/main.rs b/examples/blip/main.rs
@@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // textual
     let options_textual = Options::default()
         .with_model("blip-textual-base.onnx")?
-        .with_tokenizer("tokenizer-blip.json")?
+        // .with_tokenizer("tokenizer-blip.json")?
         .with_i00((1, 1, 4).into()) // input_id: batch
         .with_i01((1, 1, 4).into()) // input_id: seq_len
         .with_i10((1, 1, 4).into()) // attention_mask: batch
@@ -23,9 +23,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let mut model = Blip::new(options_visual, options_textual)?;
 
     // image caption (this demo use batch_size=1)
-    let x = vec![DataLoader::try_read("./assets/bus.jpg")?];
-    let _y = model.caption(&x, None, true)?; // unconditional
-    let y = model.caption(&x, Some("three man"), true)?; // conditional
+    let xs = vec![DataLoader::try_read("./assets/bus.jpg")?];
+    let image_embeddings = model.encode_images(&xs)?;
+    let _y = model.caption(&image_embeddings, None, true)?; // unconditional
+    let y = model.caption(&image_embeddings, Some("three man"), true)?; // conditional
     println!("{:?}", y[0].texts());
 
     Ok(())

diff --git a/examples/clip/main.rs b/examples/clip/main.rs
@@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // textual
     let options_textual = Options::default()
         .with_model("clip-b32-textual-dyn.onnx")?
-        .with_tokenizer("tokenizer-clip.json")?
+        // .with_tokenizer("tokenizer-clip.json")?
         .with_i00((1, 1, 4).into())
         .with_profile(false);
 

diff --git a/examples/grounding-dino/main.rs b/examples/grounding-dino/main.rs
@@ -0,0 +1,40 @@
+use usls::{models::GroundingDINO, Annotator, DataLoader, Options};
+
+/// GroundingDINO demo: builds the model from `Options`, reads `./assets/bus.jpg`,
+/// runs the model with a list of text prompts, and hands the results to an `Annotator`.
+///
+/// Any error from option building, model construction, image loading or inference
+/// is propagated to the caller via `?`.
+fn main() -> Result<(), Box<dyn std::error::Error>> {
+    // NOTE(review): each `with_iXY` triple looks like a (min, opt, max) range for
+    // dimension Y of model input X — confirm against the `Options` definition.
+    let opts = Options::default()
+        .with_i00((1, 1, 4).into())
+        .with_i02((640, 800, 1200).into())
+        .with_i03((640, 1200, 1200).into())
+        .with_i10((1, 1, 4).into())
+        .with_i11((256, 256, 512).into())
+        .with_i20((1, 1, 4).into())
+        .with_i21((256, 256, 512).into())
+        .with_i30((1, 1, 4).into())
+        .with_i31((256, 256, 512).into())
+        .with_i40((1, 1, 4).into())
+        .with_i41((256, 256, 512).into())
+        .with_i50((1, 1, 4).into())
+        .with_i51((256, 256, 512).into())
+        .with_i52((256, 256, 512).into())
+        .with_model("groundingdino-swint-ogc-dyn-u8.onnx")? // TODO: current onnx model does not support bs > 1
+        // .with_model("groundingdino-swint-ogc-dyn-f32.onnx")?
+        .with_confs(&[0.2])
+        .with_profile(false);
+    let mut model = GroundingDINO::new(opts)?;
+
+    // Load images and set class names
+    let x = [DataLoader::try_read("./assets/bus.jpg")?];
+    let texts = [
+        "person", "hand", "shoes", "bus", "dog", "cat", "sign", "tie", "monitor", "window",
+        "glasses", "tree", "head",
+    ];
+
+    // Run and annotate
+    let y = model.run(&x, &texts)?;
+    // NOTE(review): "GroundingDINO" is presumably the output name/folder used by the
+    // annotator when saving — confirm against `Annotator::with_saveout`.
+    let annotator = Annotator::default()
+        .with_bboxes_thickness(4)
+        .with_saveout("GroundingDINO");
+    annotator.annotate(&x, &y);
+
+    Ok(())
+}
diff --git a/examples/sam/main.rs b/examples/sam/main.rs
@@ -99,7 +99,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     let mut model = SAM::new(options_encoder, options_decoder)?;
 
     // Load image
-    let xs = vec![DataLoader::try_read("./assets/truck.jpg")?];
+    let xs = [
+        DataLoader::try_read("./assets/truck.jpg")?,
+        // DataLoader::try_read("./assets/dog.jpg")?,
+    ];
 
     // Build annotator
     let annotator = Annotator::default().with_saveout(saveout);

diff --git a/src/core/ops.rs b/src/core/ops.rs
@@ -7,7 +7,7 @@ use fast_image_resize::{
     FilterType, ResizeAlg, ResizeOptions, Resizer,
 };
 use image::{DynamicImage, GenericImageView};
-use ndarray::{s, Array, Axis, IxDyn};
+use ndarray::{s, Array, Axis, IntoDimension, IxDyn};
 use rayon::prelude::*;
 
 pub enum Ops<'a> {
@@ -20,6 +20,10 @@ pub enum Ops<'a> {
     Nhwc2nchw,
     Nchw2nhwc,
     Norm,
+    Sigmoid,
+    Broadcast,
+    ToShape,
+    Repeat,
 }
 
 impl Ops<'_> {
@@ -34,6 +38,41 @@ impl Ops<'_> {
         Ok((x - min) / (max - min))
     }
 
+    /// Applies the logistic sigmoid `1 / (1 + e^(-x))` to every element of `x`.
+    ///
+    /// Takes `x` by value and returns the mapped array.
+    pub fn sigmoid(x: Array<f32, IxDyn>) -> Array<f32, IxDyn> {
+        x.mapv(|x| 1. / ((-x).exp() + 1.))
+    }
+
+    /// Broadcasts `x` to the shape `dim`, returning an owned dynamic-dimension array.
+    ///
+    /// # Errors
+    ///
+    /// Fails with a message containing the shape of `x` and the requested `dim`
+    /// when `x.broadcast(dim)` returns `None`.
+    pub fn broadcast<D: IntoDimension + std::fmt::Debug + Copy>(
+        x: Array<f32, IxDyn>,
+        dim: D,
+    ) -> Result<Array<f32, IxDyn>> {
+        match x.broadcast(dim) {
+            // The broadcast result is converted into an owned array before returning.
+            Some(x) => Ok(x.to_owned().into_dyn()),
+            None => anyhow::bail!(
+                "Failed to broadcast. Shape: {:?}, dim: {:?}",
+                x.shape(),
+                dim
+            ),
+        }
+    }
+
+    /// Expands axis `d` of `x` to length `n`, leaving every other axis length unchanged.
+    ///
+    /// The work is delegated to [`Self::broadcast`], so it only succeeds for shapes
+    /// that broadcasting accepts.
+    /// NOTE(review): presumably axis `d` must have length 1 (or already `n`) for the
+    /// broadcast to succeed — confirm against ndarray's broadcasting rules.
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when `d >= x.ndim()`, or when the underlying broadcast fails.
+    pub fn repeat(x: Array<f32, IxDyn>, d: usize, n: usize) -> Result<Array<f32, IxDyn>> {
+        if d >= x.ndim() {
+            anyhow::bail!("Index {d} is out of bounds with size {}.", x.ndim());
+        } else {
+            // Target shape: identical to the current shape except axis `d`, which becomes `n`.
+            let mut dim = x.shape().to_vec();
+            dim[d] = n;
+            Self::broadcast(x, dim.as_slice())
+        }
+    }
+
+    /// Reshapes `x` to `dim` and returns the result as an owned dynamic-dimension array.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error from `x.to_shape(dim)` when the reshape is rejected.
+    pub fn to_shape<D: ndarray::ShapeArg>(
+        x: Array<f32, IxDyn>,
+        dim: D,
+    ) -> Result<Array<f32, IxDyn>> {
+        Ok(x.to_shape(dim).map(|x| x.to_owned().into_dyn())?)
+    }
+
     pub fn standardize(
         x: Array<f32, IxDyn>,
         mean: &[f32],

diff --git a/src/core/options.rs b/src/core/options.rs
@@ -73,10 +73,13 @@ pub struct Options {
     pub nk: Option<usize>,
     pub nm: Option<usize>,
     pub confs: Vec<f32>,
+    pub confs2: Vec<f32>,
+    pub confs3: Vec<f32>,
     pub kconfs: Vec<f32>,
     pub iou: Option<f32>,
     pub tokenizer: Option<String>,
     pub vocab: Option<String>,
+    pub context_length: Option<usize>,
     pub names: Option<Vec<String>>,  // names
     pub names2: Option<Vec<String>>, // names2
     pub names3: Option<Vec<String>>, // names3
@@ -152,11 +155,14 @@ impl Default for Options {
             nc: None,
             nk: None,
             nm: None,
-            confs: vec![0.4f32],
+            confs: vec![0.3f32],
+            confs2: vec![0.3f32],
+            confs3: vec![0.3f32],
             kconfs: vec![0.5f32],
             iou: None,
             tokenizer: None,
             vocab: None,
+            context_length: None,
             names: None,
             names2: None,
             names3: None,
@@ -255,12 +261,17 @@ impl Options {
     }
 
     pub fn with_vocab(mut self, vocab: &str) -> Result<Self> {
-        self.vocab = Some(auto_load(vocab, Some("models"))?);
+        self.vocab = Some(auto_load(vocab, Some("tokenizers"))?);
         Ok(self)
     }
 
+    /// Builder-style setter: stores `Some(n)` in `context_length` and returns the options.
+    pub fn with_context_length(mut self, n: usize) -> Self {
+        self.context_length = Some(n);
+        self
+    }
+
     pub fn with_tokenizer(mut self, tokenizer: &str) -> Result<Self> {
-        self.tokenizer = Some(auto_load(tokenizer, Some("models"))?);
+        self.tokenizer = Some(auto_load(tokenizer, Some("tokenizers"))?);
         Ok(self)
     }
 
@@ -299,8 +310,18 @@ impl Options {
         self
     }
 
-    pub fn with_confs(mut self, confs: &[f32]) -> Self {
-        self.confs = confs.to_vec();
+    /// Builder-style setter: replaces `confs` with a copy of `x` and returns the options.
+    pub fn with_confs(mut self, x: &[f32]) -> Self {
+        self.confs = x.to_vec();
+        self
+    }
+
+    /// Builder-style setter: replaces `confs2` with a copy of `x` and returns the options.
+    pub fn with_confs2(mut self, x: &[f32]) -> Self {
+        self.confs2 = x.to_vec();
+        self
+    }
+
+    /// Builder-style setter: replaces `confs3` with a copy of `x` and returns the options.
+    pub fn with_confs3(mut self, x: &[f32]) -> Self {
+        self.confs3 = x.to_vec();
         self
     }
 

diff --git a/src/core/ort_engine.rs b/src/core/ort_engine.rs
@@ -321,6 +321,9 @@ impl OrtEngine {
                 TensorElementType::Int8 => {
                     ort::Value::from_array(x.mapv(|x_| x_ as i8).view())?.into_dyn()
                 }
+                TensorElementType::Bool => {
+                    ort::Value::from_array(x.mapv(|x_| x_ != 0.).view())?.into_dyn()
+                }
                 _ => todo!(),
             };
             xs_.push(Into::<ort::SessionInputValue<'_>>::into(x_));

diff --git a/src/core/x.rs b/src/core/x.rs
@@ -1,6 +1,6 @@
 use anyhow::Result;
 use image::DynamicImage;
-use ndarray::{Array, Dim, IxDyn, IxDynImpl};
+use ndarray::{Array, Dim, IntoDimension, IxDyn, IxDynImpl};
 
 use crate::Ops;
 
@@ -51,12 +51,28 @@ impl X {
                 Ops::InsertAxis(d) => y.insert_axis(*d)?,
                 Ops::Nhwc2nchw => y.nhwc2nchw()?,
                 Ops::Nchw2nhwc => y.nchw2nhwc()?,
+                Ops::Sigmoid => y.sigmoid()?,
                 _ => todo!(),
             }
         }
         Ok(y)
     }
 
+    /// Replaces the wrapped array with the result of `Ops::sigmoid` and returns `self`.
+    ///
+    /// This method itself never produces an error; it always returns `Ok`.
+    pub fn sigmoid(mut self) -> Result<Self> {
+        self.0 = Ops::sigmoid(self.0);
+        Ok(self)
+    }
+
+    /// Replaces the wrapped array with the result of `Ops::broadcast(.., dim)`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `Ops::broadcast`.
+    pub fn broadcast<D: IntoDimension + std::fmt::Debug + Copy>(mut self, dim: D) -> Result<Self> {
+        self.0 = Ops::broadcast(self.0, dim)?;
+        Ok(self)
+    }
+
+    /// Replaces the wrapped array with the result of `Ops::to_shape(.., dim)`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `Ops::to_shape`.
+    pub fn to_shape<D: ndarray::ShapeArg>(mut self, dim: D) -> Result<Self> {
+        self.0 = Ops::to_shape(self.0, dim)?;
+        Ok(self)
+    }
+
     pub fn permute(mut self, shape: &[usize]) -> Result<Self> {
         self.0 = Ops::permute(self.0, shape)?;
         Ok(self)
@@ -77,6 +93,11 @@ impl X {
         Ok(self)
     }
 
+    /// Replaces the wrapped array with the result of `Ops::repeat(.., d, n)`.
+    ///
+    /// # Errors
+    ///
+    /// Propagates the error returned by `Ops::repeat`.
+    pub fn repeat(mut self, d: usize, n: usize) -> Result<Self> {
+        self.0 = Ops::repeat(self.0, d, n)?;
+        Ok(self)
+    }
+
     pub fn dims(&self) -> &[usize] {
         self.0.shape()
     }