Skip to content

Commit

Permalink
Add GroundingDINO (#30)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamjamjon authored Aug 9, 2024
1 parent 53d14ee commit b81b5e3
Show file tree
Hide file tree
Showing 24 changed files with 536 additions and 131 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "usls"
version = "0.0.9"
version = "0.0.10"
edition = "2021"
description = "A Rust library integrated with ONNXRuntime, providing a collection of ML models."
repository = "https://github.com/jamjamjon/usls"
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@



A Rust library integrated with **ONNXRuntime**, providing a collection of **Computer Vision** and **Vision-Language** models including [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [RTDETR](https://arxiv.org/abs/2304.08069), [SAM](https://github.com/facebookresearch/segment-anything), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM), [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [DINOv2](https://github.com/facebookresearch/dinov2), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything) and others.
A Rust library integrated with **ONNXRuntime**, providing a collection of **Computer Vision** and **Vision-Language** models including [YOLOv5](https://github.com/ultralytics/yolov5), [YOLOv6](https://github.com/meituan/YOLOv6), [YOLOv7](https://github.com/WongKinYiu/yolov7), [YOLOv8](https://github.com/ultralytics/ultralytics), [YOLOv9](https://github.com/WongKinYiu/yolov9), [YOLOv10](https://github.com/THU-MIG/yolov10), [RTDETR](https://arxiv.org/abs/2304.08069), [SAM](https://github.com/facebookresearch/segment-anything), [MobileSAM](https://github.com/ChaoningZhang/MobileSAM), [EdgeSAM](https://github.com/chongzhou96/EdgeSAM), [SAM-HQ](https://github.com/SysCV/sam-hq), [FastSAM](https://github.com/CASIA-IVA-Lab/FastSAM), [CLIP](https://github.com/openai/CLIP), [BLIP](https://arxiv.org/abs/2201.12086), [DINOv2](https://github.com/facebookresearch/dinov2), [YOLO-World](https://github.com/AILab-CVC/YOLO-World), [PaddleOCR](https://github.com/PaddlePaddle/PaddleOCR), [Depth-Anything](https://github.com/LiheYoung/Depth-Anything), [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) and others.


| Segment Anything |
Expand Down Expand Up @@ -55,6 +55,7 @@ A Rust library integrated with **ONNXRuntime**, providing a collection of **Comp
| [YOLOPv2](https://arxiv.org/abs/2208.11434) | Panoptic Driving Perception | [demo](examples/yolop) |||||
| [Depth-Anything<br />(v1, v2)](https://github.com/LiheYoung/Depth-Anything) | Monocular Depth Estimation | [demo](examples/depth-anything) |||||
| [MODNet](https://github.com/ZHKKKe/MODNet) | Image Matting | [demo](examples/modnet) |||||
| [GroundingDINO](https://github.com/IDEA-Research/GroundingDINO) | Open-Set Detection With Language | [demo](examples/grounding-dino) ||| | |

## Installation

Expand Down
9 changes: 5 additions & 4 deletions examples/blip/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// textual
let options_textual = Options::default()
.with_model("blip-textual-base.onnx")?
.with_tokenizer("tokenizer-blip.json")?
// .with_tokenizer("tokenizer-blip.json")?
.with_i00((1, 1, 4).into()) // input_id: batch
.with_i01((1, 1, 4).into()) // input_id: seq_len
.with_i10((1, 1, 4).into()) // attention_mask: batch
Expand All @@ -23,9 +23,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut model = Blip::new(options_visual, options_textual)?;

// image caption (this demo uses batch_size=1)
let x = vec![DataLoader::try_read("./assets/bus.jpg")?];
let _y = model.caption(&x, None, true)?; // unconditional
let y = model.caption(&x, Some("three man"), true)?; // conditional
let xs = vec![DataLoader::try_read("./assets/bus.jpg")?];
let image_embeddings = model.encode_images(&xs)?;
let _y = model.caption(&image_embeddings, None, true)?; // unconditional
let y = model.caption(&image_embeddings, Some("three man"), true)?; // conditional
println!("{:?}", y[0].texts());

Ok(())
Expand Down
2 changes: 1 addition & 1 deletion examples/clip/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
// textual
let options_textual = Options::default()
.with_model("clip-b32-textual-dyn.onnx")?
.with_tokenizer("tokenizer-clip.json")?
// .with_tokenizer("tokenizer-clip.json")?
.with_i00((1, 1, 4).into())
.with_profile(false);

Expand Down
40 changes: 40 additions & 0 deletions examples/grounding-dino/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
use usls::{models::GroundingDINO, Annotator, DataLoader, Options};

/// GroundingDINO example: loads `./assets/bus.jpg`, runs the model against a
/// fixed list of text prompts, and passes the results to an `Annotator`
/// configured with the "GroundingDINO" save-out name.
fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Per-input dimension settings.
    // NOTE(review): `with_iXY` presumably sets the (min, opt, max) range for
    // dimension Y of model input X -- confirm against `Options`.
    let shape_options = Options::default()
        .with_i00((1, 1, 4).into())
        .with_i02((640, 800, 1200).into())
        .with_i03((640, 1200, 1200).into())
        .with_i10((1, 1, 4).into())
        .with_i11((256, 256, 512).into())
        .with_i20((1, 1, 4).into())
        .with_i21((256, 256, 512).into())
        .with_i30((1, 1, 4).into())
        .with_i31((256, 256, 512).into())
        .with_i40((1, 1, 4).into())
        .with_i41((256, 256, 512).into())
        .with_i50((1, 1, 4).into())
        .with_i51((256, 256, 512).into())
        .with_i52((256, 256, 512).into());

    // TODO: current onnx model does not support bs > 1
    // Alternative model file kept from the original example (disabled there):
    // "groundingdino-swint-ogc-dyn-f32.onnx"
    let options = shape_options
        .with_model("groundingdino-swint-ogc-dyn-u8.onnx")?
        .with_confs(&[0.2])
        .with_profile(false);
    let mut model = GroundingDINO::new(options)?;

    // Load images and set class names
    let images = [DataLoader::try_read("./assets/bus.jpg")?];
    let prompts = [
        "person", "hand", "shoes", "bus", "dog", "cat", "sign", "tie", "monitor", "window",
        "glasses", "tree", "head",
    ];

    // Run and annotate
    let detections = model.run(&images, &prompts)?;
    let annotator = Annotator::default()
        .with_bboxes_thickness(4)
        .with_saveout("GroundingDINO");
    annotator.annotate(&images, &detections);

    Ok(())
}
5 changes: 4 additions & 1 deletion examples/sam/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,10 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
let mut model = SAM::new(options_encoder, options_decoder)?;

// Load image
let xs = vec![DataLoader::try_read("./assets/truck.jpg")?];
let xs = [
DataLoader::try_read("./assets/truck.jpg")?,
// DataLoader::try_read("./assets/dog.jpg")?,
];

// Build annotator
let annotator = Annotator::default().with_saveout(saveout);
Expand Down
41 changes: 40 additions & 1 deletion src/core/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use fast_image_resize::{
FilterType, ResizeAlg, ResizeOptions, Resizer,
};
use image::{DynamicImage, GenericImageView};
use ndarray::{s, Array, Axis, IxDyn};
use ndarray::{s, Array, Axis, IntoDimension, IxDyn};
use rayon::prelude::*;

pub enum Ops<'a> {
Expand All @@ -20,6 +20,10 @@ pub enum Ops<'a> {
Nhwc2nchw,
Nchw2nhwc,
Norm,
Sigmoid,
Broadcast,
ToShape,
Repeat,
}

impl Ops<'_> {
Expand All @@ -34,6 +38,41 @@ impl Ops<'_> {
Ok((x - min) / (max - min))
}

/// Applies the logistic sigmoid `1 / (exp(-v) + 1)` to every element of `x`
/// and returns the mapped array.
pub fn sigmoid(x: Array<f32, IxDyn>) -> Array<f32, IxDyn> {
    x.mapv(|value| {
        let denominator = (-value).exp() + 1.;
        1. / denominator
    })
}

/// Broadcasts `x` to the shape `dim` and returns the result as an owned,
/// dynamic-dimensional array.
///
/// # Errors
/// Returns an error reporting the source shape and the requested `dim` when
/// `x.broadcast(dim)` yields `None`, i.e. the shapes cannot be broadcast.
pub fn broadcast<D: IntoDimension + std::fmt::Debug + Copy>(
    x: Array<f32, IxDyn>,
    dim: D,
) -> Result<Array<f32, IxDyn>> {
    x.broadcast(dim)
        // The broadcast result is only a view into `x`; copy it so the output owns its data.
        .map(|view| view.to_owned().into_dyn())
        .ok_or_else(|| {
            anyhow::anyhow!(
                "Failed to broadcast. Shape: {:?}, dim: {:?}",
                x.shape(),
                dim
            )
        })
}

/// Expands axis `d` of `x` to length `n` by broadcasting.
///
/// The target shape is the shape of `x` with entry `d` replaced by `n`; the
/// actual work is delegated to [`Self::broadcast`].
/// NOTE(review): because this relies on broadcasting, it presumably only
/// succeeds when axis `d` currently has length 1 (or already equals `n`) --
/// confirm against ndarray's broadcasting rules.
///
/// # Errors
/// Fails when `d` is not a valid axis index of `x`, or when the broadcast
/// itself fails.
pub fn repeat(x: Array<f32, IxDyn>, d: usize, n: usize) -> Result<Array<f32, IxDyn>> {
    let rank = x.ndim();
    if d >= rank {
        anyhow::bail!("Index {d} is out of bounds with size {}.", rank);
    }

    let mut target_shape = x.shape().to_vec();
    target_shape[d] = n;
    Self::broadcast(x, target_shape.as_slice())
}

/// Reshapes `x` to `dim` and returns the result as an owned,
/// dynamic-dimensional array.
///
/// # Errors
/// Propagates the error returned by ndarray's `to_shape` when the requested
/// shape is rejected.
pub fn to_shape<D: ndarray::ShapeArg>(
    x: Array<f32, IxDyn>,
    dim: D,
) -> Result<Array<f32, IxDyn>> {
    // `to_shape` yields a copy-on-write array tied to `x`; detach it before returning.
    let reshaped = x.to_shape(dim)?;
    Ok(reshaped.to_owned().into_dyn())
}

pub fn standardize(
x: Array<f32, IxDyn>,
mean: &[f32],
Expand Down
31 changes: 26 additions & 5 deletions src/core/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,13 @@ pub struct Options {
pub nk: Option<usize>,
pub nm: Option<usize>,
pub confs: Vec<f32>,
pub confs2: Vec<f32>,
pub confs3: Vec<f32>,
pub kconfs: Vec<f32>,
pub iou: Option<f32>,
pub tokenizer: Option<String>,
pub vocab: Option<String>,
pub context_length: Option<usize>,
pub names: Option<Vec<String>>, // names
pub names2: Option<Vec<String>>, // names2
pub names3: Option<Vec<String>>, // names3
Expand Down Expand Up @@ -152,11 +155,14 @@ impl Default for Options {
nc: None,
nk: None,
nm: None,
confs: vec![0.4f32],
confs: vec![0.3f32],
confs2: vec![0.3f32],
confs3: vec![0.3f32],
kconfs: vec![0.5f32],
iou: None,
tokenizer: None,
vocab: None,
context_length: None,
names: None,
names2: None,
names3: None,
Expand Down Expand Up @@ -255,12 +261,17 @@ impl Options {
}

pub fn with_vocab(mut self, vocab: &str) -> Result<Self> {
self.vocab = Some(auto_load(vocab, Some("models"))?);
self.vocab = Some(auto_load(vocab, Some("tokenizers"))?);
Ok(self)
}

pub fn with_context_length(mut self, n: usize) -> Self {
self.context_length = Some(n);
self
}

pub fn with_tokenizer(mut self, tokenizer: &str) -> Result<Self> {
self.tokenizer = Some(auto_load(tokenizer, Some("models"))?);
self.tokenizer = Some(auto_load(tokenizer, Some("tokenizers"))?);
Ok(self)
}

Expand Down Expand Up @@ -299,8 +310,18 @@ impl Options {
self
}

pub fn with_confs(mut self, confs: &[f32]) -> Self {
self.confs = confs.to_vec();
pub fn with_confs(mut self, x: &[f32]) -> Self {
self.confs = x.to_vec();
self
}

pub fn with_confs2(mut self, x: &[f32]) -> Self {
self.confs2 = x.to_vec();
self
}

pub fn with_confs3(mut self, x: &[f32]) -> Self {
self.confs3 = x.to_vec();
self
}

Expand Down
3 changes: 3 additions & 0 deletions src/core/ort_engine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -321,6 +321,9 @@ impl OrtEngine {
TensorElementType::Int8 => {
ort::Value::from_array(x.mapv(|x_| x_ as i8).view())?.into_dyn()
}
TensorElementType::Bool => {
ort::Value::from_array(x.mapv(|x_| x_ != 0.).view())?.into_dyn()
}
_ => todo!(),
};
xs_.push(Into::<ort::SessionInputValue<'_>>::into(x_));
Expand Down
23 changes: 22 additions & 1 deletion src/core/x.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use anyhow::Result;
use image::DynamicImage;
use ndarray::{Array, Dim, IxDyn, IxDynImpl};
use ndarray::{Array, Dim, IntoDimension, IxDyn, IxDynImpl};

use crate::Ops;

Expand Down Expand Up @@ -51,12 +51,28 @@ impl X {
Ops::InsertAxis(d) => y.insert_axis(*d)?,
Ops::Nhwc2nchw => y.nhwc2nchw()?,
Ops::Nchw2nhwc => y.nchw2nhwc()?,
Ops::Sigmoid => y.sigmoid()?,
_ => todo!(),
}
}
Ok(y)
}

/// Replaces the wrapped array with the result of `Ops::sigmoid` and returns
/// `self`. This call itself never produces an error; the `Result` return
/// matches the other chainable methods.
pub fn sigmoid(mut self) -> Result<Self> {
    let activated = Ops::sigmoid(self.0);
    self.0 = activated;
    Ok(self)
}

/// Replaces the wrapped array with the result of `Ops::broadcast(.., dim)` and
/// returns `self`.
///
/// # Errors
/// Propagates the error from `Ops::broadcast`.
pub fn broadcast<D: IntoDimension + std::fmt::Debug + Copy>(mut self, dim: D) -> Result<Self> {
    let expanded = Ops::broadcast(self.0, dim)?;
    self.0 = expanded;
    Ok(self)
}

/// Replaces the wrapped array with the result of `Ops::to_shape(.., dim)` and
/// returns `self`.
///
/// # Errors
/// Propagates the error from `Ops::to_shape`.
pub fn to_shape<D: ndarray::ShapeArg>(mut self, dim: D) -> Result<Self> {
    let reshaped = Ops::to_shape(self.0, dim)?;
    self.0 = reshaped;
    Ok(self)
}

pub fn permute(mut self, shape: &[usize]) -> Result<Self> {
self.0 = Ops::permute(self.0, shape)?;
Ok(self)
Expand All @@ -77,6 +93,11 @@ impl X {
Ok(self)
}

/// Replaces the wrapped array with the result of `Ops::repeat(.., d, n)` and
/// returns `self`.
///
/// # Errors
/// Propagates the error from `Ops::repeat`.
pub fn repeat(mut self, d: usize, n: usize) -> Result<Self> {
    let repeated = Ops::repeat(self.0, d, n)?;
    self.0 = repeated;
    Ok(self)
}

pub fn dims(&self) -> &[usize] {
self.0.shape()
}
Expand Down
Loading

0 comments on commit b81b5e3

Please sign in to comment.