Skip to content

Commit

Permalink
adds example usage for product quantization
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva committed Nov 6, 2024
1 parent 528dc19 commit f55a1a7
Show file tree
Hide file tree
Showing 5 changed files with 139 additions and 1 deletion.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion embeddings/.gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.fastembed_cache
.custom_models
.custom_models
src/bin/datasets/*.csv
5 changes: 5 additions & 0 deletions embeddings/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ edition = "2021"
name = "pq"
path = "./src/bin/pq.rs"

[[bin]]
name = "pq_bench"
path = "./src/bin/pq_bench.rs"

[[bin]]
name = "embeddings"
path = "./src/bin/embeddings.rs"
Expand All @@ -28,3 +32,4 @@ reqwest = { version = "0.12.9", features = ["blocking"] }
strum_macros = "0.26.4"
async-once-cell = "0.5.4"
once_cell = "1.20.2"
csv = "1.3.0"
13 changes: 13 additions & 0 deletions embeddings/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Embeddings

## Running the examples

### Product quantization

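Download the dataset at [https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset](https://www.kaggle.com/datasets/notshrirang/spotify-million-song-dataset) and save the CSV as `src/bin/datasets/spotify_millsongdata.csv`. The path is relative to the `embeddings` directory, so run the command below from there.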

Then:

```bash
cargo run --release --bin pq_bench
```
118 changes: 118 additions & 0 deletions embeddings/src/bin/pq_bench.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
use anyhow::Result;
use csv::ReaderBuilder;
use embeddings::pq;
use embeddings::OramaModels;
use fastembed::Embedding;
use serde::Deserialize;
use std::time::Instant;

/// One row of the song dataset CSV read by `main`.
///
/// `artist`, `song`, and `text` are concatenated into the string that gets
/// embedded; `link` is carried along only so the row shape matches the file.
#[derive(Debug, Deserialize)]
struct Record {
    artist: String,
    song: String,
    // Present in the dataset but never read by this benchmark; kept so the
    // struct mirrors the CSV columns, hence the lint suppression.
    #[allow(dead_code)]
    link: String,
    text: String,
}

/// Number of texts accumulated before they are handed to `model.embed`;
/// also passed to `embed` as its batch-size argument.
const BATCH_SIZE: usize = 50;
/// Upper bound on how many dataset records `main` reads and embeds.
const MAX_RECORDS: usize = 10_000;

fn main() -> Result<()> {
let model = OramaModels::MultilingualE5Small.try_new()?;
let mut data: Vec<Embedding> = vec![];

let file_path = "src/bin/datasets/spotify_millsongdata.csv";
let mut reader = ReaderBuilder::new().flexible(true).from_path(file_path)?;

let total_records = reader.records().count();

reader = ReaderBuilder::new().flexible(true).from_path(file_path)?;

let mut batch_texts: Vec<String> = Vec::with_capacity(BATCH_SIZE);
let mut current_record = 0;

for result in reader.deserialize() {
if current_record >= MAX_RECORDS {
break;
}

let record: Record = result?;
let concat_text = format!("{} - {} - {}", record.artist, record.song, record.text);
batch_texts.push(concat_text);

current_record += 1;

if batch_texts.len() == BATCH_SIZE
|| current_record == total_records
|| current_record == MAX_RECORDS
{
let batch_embeddings = model.embed(batch_texts, Some(BATCH_SIZE))?;
data.extend(batch_embeddings.into_iter());

let percentage = (current_record as f64 / MAX_RECORDS as f64) * 100.0;
println!(
"generated embeddings up to record {} of {} ({:.2}%)",
current_record, MAX_RECORDS, percentage
);

batch_texts = Vec::with_capacity(BATCH_SIZE);
}
}

let start_time = Instant::now();
let quantizer = pq::ProductQuantizer::try_new(data)?;
let duration = start_time.elapsed();

println!("Time taken to train ProductQuantizer: {:.2?}", duration);

let new_embedding = model.embed(vec![CUSTOM_SONG_LYRICS.to_string()], Some(1))?;

let quantization_time = Instant::now();
let quantized_embeddings = quantizer.quantize(new_embedding);
let quantization_duration = quantization_time.elapsed();

println!(
"Time taken to quantize a new embedding: {:.2?}",
quantization_duration
);
println!("Quantized embedding:");
dbg!(quantized_embeddings);

Ok(())
}

/// Sample lyrics that `main` embeds and quantizes after the quantizer has
/// been built, to time the quantization of a single new embedding.
const CUSTOM_SONG_LYRICS: &str = r"
Well, I walk upon the river like it's easier than land
Evil's in my pocket and your will is in my hand
Oh, your will is in my hand
And I'll throw it in the current that I stand upon so still
Love is all, from what I've heard, but my heart's learned to kill
Oh, mine has learned to kill
Oh, I said I could rise
From the harness of our goals
Here come the tears
But like always, I let them go
Just let them go
And now spikes will keep on falling from the heavens to the floor
The future was our skin and now we don't dream anymore
No, we don't dream anymore
Like a house made from spider webs and the clouds rolling in
I bet this mighty river's both my savior and my sin
Oh, my savior and my sin
Oh, I said I could rise
From the harness of our goals
Here come the tears
But like always, I let them go
Just let them go
Well, I walk upon the river like it's easier than land
Evil's in my pocket and your strength is in my hand
Strength is in my hand
And I'll throw you in the current that I stand upon so still
Love is all, from what I've heard, but my heart's learned to kill
Oh, mine has learned to kill
Oh, I said I could rise
From the harness of our goals
Here come the tears
But like always, I let them go
Just let them go
";

0 comments on commit f55a1a7

Please sign in to comment.