Skip to content

Commit

Permalink
update bloom implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
al8n committed Oct 18, 2023
1 parent 21475c6 commit 7b489b6
Showing 1 changed file with 64 additions and 20 deletions.
84 changes: 64 additions & 20 deletions src/lfu/tinylfu/bloom.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
//! This mod implements a Simple Bloom Filter.
//!
//! This file is a mechanical translation of the reference Golang code, available at [here](https://github.com/dgraph-io/ristretto/blob/master/z/bbloom.go)
//! This file is a mechanical translation of the reference Golang code, available at <https://github.com/dgraph-io/ristretto/blob/master/z/bbloom.go>
//!
//! I claim no additional copyright over the original implementation.
use alloc::vec;
use alloc::vec::Vec;

const LN_2: f64 = core::f64::consts::LN_2;
use alloc::{vec, vec::Vec};

const LN_2: f64 = std::f64::consts::LN_2;

struct Size {
size: u64,
Expand Down Expand Up @@ -34,7 +34,8 @@ struct EntriesLocs {
locs: u64,
}

fn calc_size_by_wrong_positives(num_entries: f64, wrongs: f64) -> EntriesLocs {
fn calc_size_by_wrong_positives(num_entries: usize, wrongs: f64) -> EntriesLocs {
let num_entries = num_entries as f64;
let size = -1f64 * num_entries * wrongs.ln() / LN_2.powf(2f64);
let locs = (LN_2 * size / num_entries).ceil();

Expand All @@ -45,9 +46,11 @@ fn calc_size_by_wrong_positives(num_entries: f64, wrongs: f64) -> EntriesLocs {
}

/// Bloom filter
#[repr(C)]
pub(crate) struct Bloom {
bitset: Vec<u64>,
elem_num: u64,
size_exp: u64,
size: u64,
set_locs: u64,
shift: u64,
Expand All @@ -57,7 +60,7 @@ impl Bloom {
pub fn new(cap: usize, false_positive_ratio: f64) -> Self {
let entries_locs = {
if false_positive_ratio < 1f64 {
calc_size_by_wrong_positives(cap as f64, false_positive_ratio)
calc_size_by_wrong_positives(cap, false_positive_ratio)
} else {
EntriesLocs {
entries: cap as u64,
Expand All @@ -72,29 +75,44 @@ impl Bloom {
bitset: vec![0; (size.size >> 6) as usize],
elem_num: 0,
size: size.size - 1,
size_exp: size.exp,
set_locs: entries_locs.locs,
shift: 64 - size.exp,
}
}

/// `size` replaces the bitset with a fresh, zeroed one able to hold `sz` bits.
///
/// The bitset is stored as 64-bit words, so the new backing vector has
/// `sz / 64` elements (any remainder is dropped). Only the bitset is replaced;
/// no other field of the filter is touched here.
#[inline]
#[allow(dead_code)]
pub fn size(&mut self, sz: usize) {
    let words = sz / 64;
    self.bitset = vec![0u64; words];
}

/// Returns the exp of the size.
///
/// This is the value stored in the `size_exp` field when the filter is
/// constructed; the method simply hands it back unchanged.
#[inline]
#[allow(dead_code)]
pub fn size_exp(&self) -> u64 {
self.size_exp
}

/// `clear` resets every word of the bitset to zero.
///
/// The bitset keeps its current length; only its contents are wiped.
pub fn clear(&mut self) {
    self.bitset.fill(0);
}

/// `set` sets the bit[idx] of bitset.
///
/// `idx` is a bit position within the whole bitset: `idx >> 6` selects the
/// 64-bit word and `idx % 64` selects the bit inside that word.
///
/// # Panics
///
/// Panics if `idx >> 6` is not a valid index into the bitset.
pub fn set(&mut self, idx: usize) {
    let word_idx = idx >> 6; // divide by 64 to get the index in the bitset array
    let bit_idx = idx % 64; // get the bit position within the 64-bit integer

    // Use bounds-checked indexing rather than a raw byte-pointer write: an
    // address derived only from `idx % 64` omits the word offset and would
    // always modify the first word, and the byte-wise layout would also be
    // endian-dependent.
    self.bitset[word_idx] |= 1u64 << bit_idx;
}

/// `is_set` checks if bit[idx] of bitset is set, returns true/false.
pub fn is_set(&self, idx: usize) -> bool {
let array_idx = idx >> 6; // divide by 64 to get the index in the bitset array
let bit_idx = idx % 64; // get the bit position within the 64-bit integer
(self.bitset[array_idx] & (1u64 << bit_idx)) != 0
let ptr = (self.bitset.as_ptr() as usize + ((idx % 64) >> 3)) as *const u8;
let r = unsafe { *ptr >> (idx % 8) } & 1;
r == 1
}

/// `add` adds hash of a key to the bloom filter
Expand Down Expand Up @@ -131,17 +149,25 @@ impl Bloom {
true
}
}

/// `total_size` returns the total size of the bloom filter.
///
/// The figure is computed as eight bytes for every word in the bitset plus a
/// fixed allowance of five 8-byte members, following the accounting used by
/// the reference implementation.
#[allow(dead_code)]
#[inline]
pub fn total_size(&self) -> usize {
    // Every bitset word and every counted member is 8 bytes wide.
    const BYTES_PER_WORD: usize = 8;
    // Number of fixed-size members included in the total.
    const COUNTED_MEMBERS: usize = 5;

    let bitset_bytes = self.bitset.len() * BYTES_PER_WORD;
    bitset_bytes + COUNTED_MEMBERS * BYTES_PER_WORD
}
}

#[cfg(test)]
mod test {
use crate::lfu::tinylfu::bloom::Bloom;
use alloc::string::String;
use alloc::vec::Vec;
use core::hash::{Hash, Hasher};
use super::*;
use rand::distributions::Alphanumeric;
use rand::{thread_rng, Rng};
use std::collections::hash_map::DefaultHasher;
use std::println;
use std::hash::{Hash, Hasher};
use std::string::String;

const N: usize = 1 << 16;

Expand All @@ -165,7 +191,6 @@ mod test {
}

#[test]
#[cfg_attr(miri, ignore)]
fn test_number_of_wrongs() {
let mut bf = Bloom::new(N * 10, 7f64);

Expand All @@ -177,6 +202,25 @@ mod test {
}
});

println!("Bloomfilter(size = {}) Check for 'false positives': {} wrong positive 'Has' results on 2^16 entries => {}%", bf.bitset.len() << 6, cnt, cnt as f64 / N as f64);
std::println!("Bloomfilter(size = {}) Check for 'false positives': {} wrong positive 'Has' results on 2^16 entries => {}%", bf.bitset.len() << 6, cnt, cnt as f64 / N as f64);
}

#[test]
fn test_total_size() {
    // A filter built with cap = 10 and ratio 7 is expected to report 104 bytes.
    let filter = Bloom::new(10, 7f64);
    let reported = filter.total_size();
    assert_eq!(reported, 104);
}

#[test]
fn test_size_exp() {
    // A filter built with cap = 10 and ratio 7 is expected to have size exponent 9.
    let filter = Bloom::new(10, 7f64);
    let exponent = filter.size_exp();
    assert_eq!(exponent, 9);
}

#[test]
fn test_size() {
    // Resizing to 1024 bits must leave 1024 / 64 = 16 words in the bitset.
    let mut filter = Bloom::new(10, 7f64);
    filter.size(1024);
    let words = filter.bitset.len();
    assert_eq!(words, 16);
}
}

0 comments on commit 7b489b6

Please sign in to comment.