Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(zkos): Implement ZK OS Merkle tree #3625

Open
wants to merge 32 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 27 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
001e162
Sketch basic tree skeleton
slowli Feb 6, 2025
906a15a
Sketch tree hashing
slowli Feb 12, 2025
97947bc
Test hashes
slowli Feb 12, 2025
e13d1dd
Allow initializing tree from scratch
slowli Feb 12, 2025
4baad6f
Fix inserting new nodes
slowli Feb 12, 2025
06bb509
Update core lockfile
slowli Feb 12, 2025
aee010d
Test updates / inserts
slowli Feb 13, 2025
80629ca
Split `storage` module
slowli Feb 13, 2025
16ef332
High-level tests + fixes
slowli Feb 13, 2025
a18cfdc
Change tree update signature
slowli Feb 13, 2025
0fc0512
Sketch RocksDB persistence
slowli Feb 14, 2025
4eaff69
Sketch tree consistency checks
slowli Feb 14, 2025
80ec371
Prepare for load test; fix insertion
slowli Feb 14, 2025
1ee50d8
Add load test
slowli Feb 14, 2025
06b5160
Parameterize tree construction
slowli Feb 17, 2025
9f29c17
Parameterize tree some more + add tags
slowli Feb 17, 2025
171cbd1
Check tree tags on initialization
slowli Feb 17, 2025
580639e
Remove future key indices on tree truncation
slowli Feb 17, 2025
037b2ed
Sketch batch insertion proof
slowli Feb 17, 2025
7ffb23d
Sketch getting insertion proof from tree
slowli Feb 18, 2025
7ec8122
Add doc comments; load test with proofs
slowli Feb 18, 2025
e996c7e
Optimize `InternalNodeHashes`
slowli Feb 18, 2025
fa8592c
Optimize `InternalNodeHashes` some more
slowli Feb 19, 2025
7440517
Simplify `PartialPatchSet`
slowli Feb 19, 2025
913279a
Update readme
slowli Feb 19, 2025
7669583
Support insertion proofs based on empty tree
slowli Feb 19, 2025
e35897e
Update core lockfile
slowli Feb 19, 2025
50dc56c
Simplify batch proof verification
slowli Feb 20, 2025
d73218a
Improve version updates
slowli Feb 20, 2025
555fa7d
Log write batch stats
slowli Feb 20, 2025
f71f59a
Document storage layout
slowli Feb 21, 2025
12710cf
Better document code
slowli Feb 21, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions core/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ members = [
"lib/crypto_primitives",
"lib/external_price_api",
"lib/task_management",
"lib/zk_os_merkle_tree",
"lib/test_contracts",
# Test infrastructure
"tests/loadnext",
Expand Down
2 changes: 1 addition & 1 deletion core/lib/merkle_tree/examples/loadtest/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ struct Cli {
/// Number of commits to perform.
#[arg(name = "commits")]
commit_count: u64,
/// Number of inserts / updates per commit.
/// Number of inserts per commit.
#[arg(name = "ops")]
writes_per_commit: usize,
/// Generate Merkle proofs for each operation.
Expand Down
2 changes: 1 addition & 1 deletion core/lib/merkle_tree/src/consistency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,7 @@ struct AtomicBitSet {
}

impl AtomicBitSet {
const BITS_PER_ATOMIC: usize = 8;
const BITS_PER_ATOMIC: usize = 64;

fn new(len: usize) -> Self {
let atomic_count = (len + Self::BITS_PER_ATOMIC - 1) / Self::BITS_PER_ATOMIC;
Expand Down
24 changes: 20 additions & 4 deletions core/lib/storage/src/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -487,10 +487,10 @@ impl<CF: NamedColumnFamily> RocksDB<CF> {
self.inner.db.multi_get(keys)
}

pub fn multi_get_cf(
pub fn multi_get_cf<K: AsRef<[u8]>>(
&self,
cf: CF,
keys: impl Iterator<Item = Vec<u8>>,
keys: impl Iterator<Item = K>,
) -> Vec<Result<Option<DBPinnableSlice<'_>>, rocksdb::Error>> {
let cf = self.column_family(cf);
self.inner.db.batched_multi_get_cf(cf, keys, false)
Expand Down Expand Up @@ -597,12 +597,28 @@ impl<CF: NamedColumnFamily> RocksDB<CF> {
pub fn from_iterator_cf(
&self,
cf: CF,
key_from: &[u8],
keys: ops::RangeFrom<&[u8]>,
) -> impl Iterator<Item = (Box<[u8]>, Box<[u8]>)> + '_ {
let cf = self.column_family(cf);
self.inner
.db
.iterator_cf(cf, IteratorMode::From(key_from, Direction::Forward))
.iterator_cf(cf, IteratorMode::From(keys.start, Direction::Forward))
.map(Result::unwrap)
.fuse()
// ^ unwrap() is safe for the same reasons as in `prefix_iterator_cf()`.
}

/// Iterates over key-value pairs in the specified column family `cf` in the reverse lexical
/// key order starting from the given `key_from`.
pub fn to_iterator_cf(
&self,
cf: CF,
keys: ops::RangeToInclusive<&[u8]>,
) -> impl Iterator<Item = (Box<[u8]>, Box<[u8]>)> + '_ {
let cf = self.column_family(cf);
self.inner
.db
.iterator_cf(cf, IteratorMode::From(keys.end, Direction::Reverse))
.map(Result::unwrap)
.fuse()
// ^ unwrap() is safe for the same reasons as in `prefix_iterator_cf()`.
Expand Down
29 changes: 29 additions & 0 deletions core/lib/zk_os_merkle_tree/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
[package]
name = "zk_os_merkle_tree"
description = "Persistent ZK OS Merkle tree"
version.workspace = true
edition.workspace = true
authors.workspace = true
homepage.workspace = true
repository.workspace = true
license.workspace = true
keywords.workspace = true
categories.workspace = true

[dependencies]
zksync_basic_types.workspace = true
zksync_crypto_primitives.workspace = true
zksync_storage.workspace = true

anyhow.workspace = true
leb128.workspace = true
once_cell.workspace = true
rayon.workspace = true
thiserror.workspace = true
tracing.workspace = true

[dev-dependencies]
clap = { workspace = true, features = ["derive"] }
rand.workspace = true
tempfile.workspace = true
tracing-subscriber = { workspace = true, features = ["env-filter"] }
69 changes: 69 additions & 0 deletions core/lib/zk_os_merkle_tree/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Persistent ZK OS Merkle tree

Dense, doubly linked Merkle tree implementation with parameterized depth and amortization factor.

## Construction

- The tree is a dense binary Merkle tree with parametric depth (the default depth is currently set to 64; i.e., up to
`2 ** 64` leaves).
- Hash function is parametric as well; the default one is Blake2s with 256-bit output. The tree is always considered to
have fixed depth (i.e., no reduced hashing for lightly populated trees).
- The order of leaves is the insertion order; leaves are never removed from the tree.
- Leaves emulate a linked list. I.e., each leaf holds beside a 32-byte key and 32-byte value, 0-based indices in the
tree to leaves with lexicographically previous and next keys.
- There are 2 pre-inserted guard leaves with min / max keys (i.e., `[0_u8; 32]` and `[u8::MAX; 32]`). As such, all
“real” leaves always have previous / next pointers well-defined.

Hashing specification:

```text
hash(leaf) = blake2s(
leaf.key ++ leaf.value ++ leaf.prev.to_le_bytes() ++ leaf.next.to_le_bytes()
);
hash(node) = blake2s(hash(node.left) ++ hash(node.right));
```

where `++` is byte concatenation.

## Storage layout

RocksDB is used for tree persistence. The implementation uses parametric amortization strategy similar to [Jellyfish
Merkle tree] to reduce the amount of I/O at the cost of increased hashing. Here, parametric means that the radix of
internal nodes is configurable (obviously, it's fixed for a tree instance). Radix-8 or radix-16 look optimal; the
default is currently set to radix-16.

## Benchmarking

The `loadtest` example is a CLI app for measuring tree performance. It supports the in-memory and RocksDB storage
backends, and the Blake2s and no-op hashing functions. For example, the following command launches a benchmark with
1,000 batches, each containing 4,000 insertions and 16,000 updates (= 20,000 writes / batch; 4M inserts in total),
generating an insertion proof for each batch.

```shell
RUST_LOG=debug cargo run --release \
-p zk_os_merkle_tree --example loadtest -- \
--updates=16000 --chunk-size=500 --proofs 1000 4000
```

The order of timings should be as follows (measured on MacBook Pro with 12-core Apple M2 Max CPU and 32 GB DDR5 RAM
using the command line above):

```text
2025-02-19T11:06:24.736870Z INFO loadtest: Processing block #999
2025-02-19T11:06:24.813829Z DEBUG zk_os_merkle_tree::storage::patch: loaded lookup info, elapsed: 76.89375ms
2025-02-19T11:06:24.908340Z DEBUG zk_os_merkle_tree::storage::patch: loaded nodes, elapsed: 93.501125ms, distinct_indices.len: 23967
2025-02-19T11:06:24.908994Z DEBUG zk_os_merkle_tree: loaded tree data, elapsed: 172.085ms, inserts: 4000, updates: 16000, loaded_internal_nodes: 36294
2025-02-19T11:06:24.936667Z DEBUG zk_os_merkle_tree::storage::patch: collected hashes for batch proof, hash_latency: 15.131706ms, traverse_latency: 10.213624ms
2025-02-19T11:06:24.936756Z DEBUG zk_os_merkle_tree: created batch proof, elapsed: 27.751333ms, proof.leaves.len: 23967, proof.hashes.len: 156210
2025-02-19T11:06:24.944054Z DEBUG zk_os_merkle_tree: updated tree structure, elapsed: 7.285209ms
2025-02-19T11:06:24.954820Z DEBUG zk_os_merkle_tree: hashed tree, elapsed: 10.747417ms
2025-02-19T11:06:25.017817Z DEBUG zk_os_merkle_tree: persisted tree, elapsed: 62.967083ms
2025-02-19T11:06:25.018655Z INFO loadtest: Processed block #999 in 281.765541ms, root hash = 0x12fa11d7742d67509c9a980e0fb62a1b64a478c9ff4d7596555e1f0d5cb2043f
2025-02-19T11:06:25.018669Z INFO loadtest: Verifying tree consistency...
2025-02-19T11:07:06.144174Z INFO loadtest: Verified tree consistency in 41.126574667s
```

I.e., latency is dominated by I/O (~30% for key–index lookup, ~30% for loading tree nodes, and ~20% for tree
persistence).

> **Review comment (slowli):** Intuitively, there should be a way to optimize key–index lookup, but I haven't come up
> with anything functional yet. Stuff like reusing RocksDB iterators doesn't seem to provide an efficiency boost.

[jellyfish merkle tree]: https://developers.diem.com/papers/jellyfish-merkle-tree/2021-01-14.pdf
172 changes: 172 additions & 0 deletions core/lib/zk_os_merkle_tree/examples/loadtest.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
//! Load test for the Merkle tree.

use std::{hint::black_box, time::Instant};

use anyhow::Context;
use clap::Parser;
use rand::{
prelude::{IteratorRandom, StdRng},
SeedableRng,
};
use tempfile::TempDir;
use tracing_subscriber::EnvFilter;
use zk_os_merkle_tree::{
Database, DefaultTreeParams, HashTree, MerkleTree, PatchSet, RocksDBWrapper, TreeEntry,
TreeParams,
};
use zksync_basic_types::H256;
use zksync_crypto_primitives::hasher::{blake2::Blake2Hasher, Hasher};
use zksync_storage::{RocksDB, RocksDBOptions};

/// Tree parameterization using a dynamically dispatched hash function.
///
/// Mirrors `DefaultTreeParams` in its depth parameters, but lets the CLI pick the hasher
/// at runtime (Blake2s vs the no-op hasher selected via `--no-hash`).
#[derive(Debug)]
struct WithDynHasher;

impl TreeParams for WithDynHasher {
    // Dynamic dispatch costs an indirect call per hash, which is acceptable for a benchmark CLI.
    type Hasher = &'static dyn HashTree;
    const TREE_DEPTH: u8 = <DefaultTreeParams>::TREE_DEPTH;
    const INTERNAL_NODE_DEPTH: u8 = <DefaultTreeParams>::INTERNAL_NODE_DEPTH;
}

/// CLI for load-testing for the Merkle tree implementation.
// NOTE: the `///` comments on the fields below double as `clap` help text (i.e., they are
// emitted at runtime by `--help`), so they must not be reworded casually.
#[derive(Debug, Parser)]
#[command(author, version, about, long_about = None)]
struct Cli {
    /// Number of batches to insert into the tree.
    #[arg(name = "batches")]
    batch_count: u64,
    /// Number of inserts per commit.
    #[arg(name = "ops")]
    writes_per_batch: usize,
    /// Additional number of updates of previously written keys per commit.
    #[arg(name = "updates", long, default_value = "0")]
    updates_per_batch: usize,
    /// Generate Merkle proofs for each operation.
    #[arg(name = "proofs", long)]
    proofs: bool,
    /// Use a no-op hashing function.
    #[arg(name = "no-hash", long)]
    no_hashing: bool,
    /// Perform testing on in-memory DB rather than RocksDB (i.e., with focus on hashing logic).
    #[arg(long = "in-memory", short = 'M')]
    in_memory: bool,
    // The RocksDB-specific options below are rejected by clap when `--in-memory` is set.
    /// Block cache capacity for RocksDB in bytes.
    #[arg(long = "block-cache", conflicts_with = "in_memory")]
    block_cache: Option<usize>,
    /// If specified, RocksDB indices and Bloom filters will be managed by the block cache rather than
    /// being loaded entirely into RAM.
    #[arg(long = "cache-indices", conflicts_with = "in_memory")]
    cache_indices: bool,
    /// Chunk size for RocksDB multi-get operations.
    #[arg(long = "chunk-size", conflicts_with = "in_memory")]
    chunk_size: Option<usize>,
    /// Seed to use in the RNG for reproducibility.
    #[arg(long = "rng-seed", default_value = "0")]
    rng_seed: u64,
    // FIXME: restore missing options (proof, in-memory buffering)
}

impl Cli {
    /// Initializes `tracing` output; verbosity is controlled via the `RUST_LOG` env variable.
    fn init_logging() {
        tracing_subscriber::fmt()
            .pretty()
            .with_env_filter(EnvFilter::from_default_env())
            .init();
    }

    /// Runs the load test: sets up the selected storage backend, extends the tree with
    /// `batch_count` batches of writes (optionally with batch proofs), and finally verifies
    /// tree consistency.
    ///
    /// # Errors
    ///
    /// Propagates DB initialization, tree extension and consistency check errors.
    fn run(self) -> anyhow::Result<()> {
        Self::init_logging();
        tracing::info!("Launched with options: {self:?}");

        // Both backends are declared up front so that whichever one gets initialized
        // outlives the `db` reference.
        let (mut mock_db, mut rocksdb);
        // Keeps the RocksDB temp dir alive for the duration of the test; it is removed on drop.
        let mut _temp_dir = None;
        let db: &mut dyn Database = if self.in_memory {
            mock_db = PatchSet::default();
            &mut mock_db
        } else {
            let dir = TempDir::new().context("failed creating temp dir for RocksDB")?;
            tracing::info!(
                "Created temp dir for RocksDB: {}",
                dir.path().to_string_lossy()
            );
            let db_options = RocksDBOptions {
                block_cache_capacity: self.block_cache,
                include_indices_and_filters_in_block_cache: self.cache_indices,
                ..RocksDBOptions::default()
            };
            let db =
                RocksDB::with_options(dir.path(), db_options).context("failed creating RocksDB")?;
            rocksdb = RocksDBWrapper::from(db);

            if let Some(chunk_size) = self.chunk_size {
                rocksdb.set_multi_get_chunk_size(chunk_size);
            }

            _temp_dir = Some(dir);
            &mut rocksdb
        };

        // `--no-hash` selects the no-op hasher (implemented for the unit type).
        let hasher: &dyn HashTree = if self.no_hashing { &() } else { &Blake2Hasher };
        let mut rng = StdRng::seed_from_u64(self.rng_seed);

        let mut tree = MerkleTree::<_, WithDynHasher>::with_hasher(db, hasher)
            .context("cannot create tree")?;
        let mut next_key_idx = 0_u64;
        let mut next_value_idx = 0_u64;
        for version in 0..self.batch_count {
            // New keys are generated deterministically from a monotonically increasing counter...
            let new_keys: Vec<_> = Self::generate_keys(next_key_idx..)
                .take(self.writes_per_batch)
                .collect();
            // ...while updated keys are sampled (without replacement) from previously inserted ones.
            let updated_indices =
                (0..next_key_idx).choose_multiple(&mut rng, self.updates_per_batch);
            next_key_idx += new_keys.len() as u64;

            // The value counter is bumped before use; values are then drawn from `next_value_idx..`,
            // which keeps them distinct across all batches.
            next_value_idx += (new_keys.len() + updated_indices.len()) as u64;
            let updated_keys = Self::generate_keys(updated_indices.into_iter());
            let kvs = new_keys
                .into_iter()
                .chain(updated_keys)
                .zip(next_value_idx..);
            let kvs = kvs.map(|(key, idx)| TreeEntry {
                key,
                value: H256::from_low_u64_be(idx),
            });
            let kvs = kvs.collect::<Vec<_>>();

            tracing::info!("Processing block #{version}");
            let start = Instant::now();
            let output = if self.proofs {
                let (output, proof) = tree
                    .extend_with_proof(&kvs)
                    .context("failed extending tree")?;
                black_box(proof); // Ensure that proof creation isn't optimized away
                output
            } else {
                tree.extend(&kvs).context("failed extending tree")?
            };
            let root_hash = output.root_hash;

            let elapsed = start.elapsed();
            tracing::info!("Processed block #{version} in {elapsed:?}, root hash = {root_hash:?}");
        }

        // Guard against `--batches 0`: previously `self.batch_count - 1` would panic with
        // an integer underflow; with zero batches there is no version to verify.
        if let Some(latest_version) = self.batch_count.checked_sub(1) {
            tracing::info!("Verifying tree consistency...");
            let start = Instant::now();
            tree.verify_consistency(latest_version)
                .context("tree consistency check failed")?;
            let elapsed = start.elapsed();
            tracing::info!("Verified tree consistency in {elapsed:?}");
        }

        Ok(())
    }

    /// Deterministically maps key indices to tree keys by hashing the big-endian index
    /// representation with Blake2s (independently of the tree hasher in use).
    fn generate_keys(key_indexes: impl Iterator<Item = u64>) -> impl Iterator<Item = H256> {
        key_indexes.map(move |idx| {
            let key = H256::from_low_u64_be(idx);
            Blake2Hasher.hash_bytes(key.as_bytes())
        })
    }
}

/// Entry point: parses CLI arguments and executes the load test.
fn main() -> anyhow::Result<()> {
    let args = Cli::parse();
    args.run()
}
Loading
Loading