Skip to content

Commit

Permalink
Add ETag into DiskDataCache hashed block path (#594)
Browse files Browse the repository at this point in the history
* Move ETag into DiskDataCache hashed block path

Signed-off-by: Daniel Carl Jones <[email protected]>

* Test hash_cache_key separately from get_path_for_key/get_path_for_block

Signed-off-by: Daniel Carl Jones <[email protected]>

* Add CACHE_VERSION to hash computed on cache_key

Signed-off-by: Daniel Carl Jones <[email protected]>

---------

Signed-off-by: Daniel Carl Jones <[email protected]>
  • Loading branch information
dannycjones authored Nov 10, 2023
1 parent 9768f31 commit 689ebe3
Showing 1 changed file with 39 additions and 23 deletions.
62 changes: 39 additions & 23 deletions mountpoint-s3/src/data_cache/disk_data_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,13 +158,13 @@ impl DiskDataCache {
let mut path = self.cache_directory.join(CACHE_VERSION);

// An S3 key may be up to 1024 UTF-8 bytes long, which exceeds the maximum UNIX file name length.
// Instead, we hash the key.
// The risk of collisions is mitigated as we will ignore blocks read that contain the wrong S3 key, etc..
let encoded_s3_key = hex::encode(Sha256::digest(cache_key.s3_key.as_bytes()));
// Instead, the path contains a hash of the S3 key and ETag.
// The risk of collisions is mitigated as we ignore blocks read that contain the wrong S3 key, etc..
let hashed_cache_key = hash_cache_key(cache_key);

// TODO: Split directory into subdirectories.
// Take the first few chars of hash to avoid hitting any FS-specific maximum number of directory entries.
path.push(encoded_s3_key);
path.push(cache_key.etag.as_str());
path.push(hashed_cache_key);
path
}

Expand All @@ -176,6 +176,18 @@ impl DiskDataCache {
}
}

/// Hash the cache key using its fields as well as the [CACHE_VERSION],
/// returning the hash encoded as hexadecimal in a new [String].
fn hash_cache_key(cache_key: &CacheKey) -> String {
let CacheKey { s3_key, etag } = cache_key;

let mut hasher = Sha256::new();
hasher.update(CACHE_VERSION.as_bytes());
hasher.update(s3_key.as_bytes());
hasher.update(etag.as_str().as_bytes());
hex::encode(hasher.finalize())
}

impl DataCache for DiskDataCache {
fn get_block(&self, cache_key: &CacheKey, block_idx: BlockIndex) -> DataCacheResult<Option<ChecksummedBytes>> {
let path = self.get_path_for_block(cache_key, block_idx);
Expand Down Expand Up @@ -254,7 +266,6 @@ mod tests {
use super::*;

use mountpoint_s3_client::types::ETag;
use test_case::test_case;

#[test]
fn test_block_format_version_requires_update() {
Expand All @@ -276,30 +287,39 @@ mod tests {
);
}

#[test_case("hello"; "simple string")]
#[test_case("foo/bar/baz"; "with forward slashes")]
#[test_case("hello+world"; "with plus char")]
#[test_case("hello\\ world"; "backslash")]
#[test_case("hello=world"; "equals")]
#[test_case("look🎡emoji"; "emoji")]
fn test_get_path_for_key(s3_key: &str) {
#[test]
fn test_hash_cache_key() {
let s3_key = "a".repeat(266);
let etag = ETag::for_tests();
let key = CacheKey {
etag,
s3_key: s3_key.to_owned(),
};
let expected_hash = "b717d5a78ed63238b0778e7295d83e963758aa54db6e969a822f2b13ce9a3067";
assert_eq!(expected_hash, hash_cache_key(&key));
}

#[test]
fn test_get_path_for_key() {
let cache_dir = PathBuf::from("mountpoint-cache/");
let data_cache = DiskDataCache::new(cache_dir, 1024);

let encoded_s3_key = hex::encode(Sha256::digest(s3_key.as_bytes()));
let s3_key = "a".repeat(266);

let etag = ETag::for_tests();
let key = CacheKey {
etag,
s3_key: s3_key.to_owned(),
};
let expected = vec!["mountpoint-cache", CACHE_VERSION, &encoded_s3_key, key.etag.as_str()];
let hashed_cache_key = hash_cache_key(&key);
let expected = vec!["mountpoint-cache", CACHE_VERSION, hashed_cache_key.as_str()];
let path = data_cache.get_path_for_key(&key);
let results: Vec<OsString> = path.iter().map(ToOwned::to_owned).collect();
assert_eq!(expected, results);
}

#[test]
fn test_get_path_for_key_very_long_key() {
fn test_get_path_for_block() {
let cache_dir = PathBuf::from("mountpoint-cache/");
let data_cache = DiskDataCache::new(cache_dir, 1024);

Expand All @@ -310,13 +330,9 @@ mod tests {
etag,
s3_key: s3_key.to_owned(),
};
let expected = vec![
"mountpoint-cache",
CACHE_VERSION,
"a7bf334bec6f17021671033b243b8689757212496cd525ba9873addde87b0c56",
key.etag.as_str(),
];
let path = data_cache.get_path_for_key(&key);
let hashed_cache_key = hash_cache_key(&key);
let expected = vec!["mountpoint-cache", CACHE_VERSION, hashed_cache_key.as_str(), "5.block"];
let path = data_cache.get_path_for_block(&key, 5);
let results: Vec<OsString> = path.iter().map(ToOwned::to_owned).collect();
assert_eq!(expected, results);
}
Expand Down

0 comments on commit 689ebe3

Please sign in to comment.