diff --git a/mountpoint-s3/src/fs.rs b/mountpoint-s3/src/fs.rs index 1fd42839d..0eaa24a99 100644 --- a/mountpoint-s3/src/fs.rs +++ b/mountpoint-s3/src/fs.rs @@ -290,6 +290,8 @@ pub struct CacheConfig { pub file_ttl: Duration, /// How long the kernel will cache metadata for directories pub dir_ttl: Duration, + /// Maximum number of negative entries to cache. + pub negative_cache_size: usize, } impl Default for CacheConfig { @@ -310,6 +312,7 @@ impl Default for CacheConfig { serve_lookup_from_cache: false, file_ttl, dir_ttl, + negative_cache_size: 100_000, } } } diff --git a/mountpoint-s3/src/inode.rs b/mountpoint-s3/src/inode.rs index db4934d36..7a0f8a04e 100644 --- a/mountpoint-s3/src/inode.rs +++ b/mountpoint-s3/src/inode.rs @@ -50,6 +50,9 @@ use crate::sync::{Arc, RwLock}; mod expiry; use expiry::Expiry; +mod expirable_set; +use expirable_set::ExpirableSet; + mod readdir; pub use readdir::ReaddirHandle; @@ -83,6 +86,7 @@ pub struct Superblock { struct SuperblockInner { bucket: String, inodes: RwLock>, + known_missing_keys: RwLock, next_ino: AtomicU64, mount_time: OffsetDateTime, config: SuperblockConfig, @@ -119,9 +123,12 @@ impl Superblock { let mut inodes = HashMap::new(); inodes.insert(ROOT_INODE_NO, root); + let cache = ExpirableSet::new(config.cache_config.negative_cache_size, config.cache_config.file_ttl); + let inner = SuperblockInner { bucket: bucket.to_owned(), inodes: RwLock::new(inodes), + known_missing_keys: RwLock::new(cache), next_ino: AtomicU64::new(2), mount_time, config, @@ -598,7 +605,7 @@ impl SuperblockInner { }; let lookup = match lookup { - Some(lookup) => lookup, + Some(lookup) => lookup?, None => { let remote = self.remote_lookup(client, parent_ino, name).await?; self.update_from_remote(parent_ino, name, remote)? @@ -611,30 +618,44 @@ impl SuperblockInner { /// Lookup an [Inode] against known directory entries in the parent, /// verifying any returned entry has not expired. - fn cache_lookup(&self, parent_ino: InodeNo, name: &str) -> Option { - fn do_cache_lookup(parent: Inode, name: &str) -> Option { + fn cache_lookup(&self, parent_ino: InodeNo, name: &str) -> Option> { + fn do_cache_lookup( + superblock: &SuperblockInner, + parent: Inode, + name: &str, + ) -> Option> { match &parent.get_inode_state().ok()?.kind_data { InodeKindData::File { .. } => unreachable!("parent should be a directory!"), InodeKindData::Directory { children, .. } => { - let inode = children.get(name)?; - let inode_stat = &inode.get_inode_state().ok()?.stat; - if inode_stat.is_valid() { - let lookup = LookedUp { - inode: inode.clone(), - stat: inode_stat.clone(), - }; - return Some(lookup); + if let Some(inode) = children.get(name) { + let inode_stat = &inode.get_inode_state().ok()?.stat; + if inode_stat.is_valid() { + let lookup = LookedUp { + inode: inode.clone(), + stat: inode_stat.clone(), + }; + return Some(Ok(lookup)); + } } } }; + if superblock + .known_missing_keys + .read() + .unwrap() + .contains(parent.ino(), name) + { + return Some(Err(InodeError::FileDoesNotExist(name.to_owned(), parent.err()))); + } + None } let lookup = self .get(parent_ino) .ok() - .and_then(|parent| do_cache_lookup(parent, name)); + .and_then(|parent| do_cache_lookup(self, parent, name)); match &lookup { Some(lookup) => trace!("lookup returned from cache: {:?}", lookup), @@ -789,7 +810,7 @@ impl SuperblockInner { } // Fast path: try with only a read lock on the directory first. - if let Some(looked_up) = Self::try_update_fast_path(&parent, name, &remote)? { + if let Some(looked_up) = self.try_update_fast_path(&parent, name, &remote)? { return Ok(looked_up); } @@ -799,6 +820,7 @@ impl SuperblockInner { /// Try to update the inode for the given name in the parent directory with only a read lock on /// the parent. fn try_update_fast_path( + &self, parent: &Inode, name: &str, remote: &Option, @@ -809,7 +831,13 @@ impl SuperblockInner { InodeKindData::Directory { children, .. } => children.get(name), }; match (remote, inode) { - (None, None) => Err(InodeError::FileDoesNotExist(name.to_owned(), parent.err())), + (None, None) => { + if self.config.cache_config.serve_lookup_from_cache { + // Insert or update validity of negative cache entry. + self.known_missing_keys.write().unwrap().insert(parent.ino(), name); + } + Err(InodeError::FileDoesNotExist(name.to_owned(), parent.err())) + } (Some(remote), Some(existing_inode)) => { let mut existing_state = existing_inode.get_mut_inode_state()?; let existing_is_remote = existing_state.write_status == WriteStatus::Remote; @@ -846,7 +874,13 @@ impl SuperblockInner { InodeKindData::Directory { children, .. } => children.get(name).cloned(), }; match (remote, inode) { - (None, None) => Err(InodeError::FileDoesNotExist(name.to_owned(), parent.err())), + (None, None) => { + if self.config.cache_config.serve_lookup_from_cache { + // Insert or update validity of negative cache entry. + self.known_missing_keys.write().unwrap().insert(parent.ino(), name); + } + Err(InodeError::FileDoesNotExist(name.to_owned(), parent.err())) + } (None, Some(existing_inode)) => { let InodeKindData::Directory { children, @@ -1016,6 +1050,8 @@ impl SuperblockInner { } if let Some(existing_inode) = existing_inode { writing_children.remove(&existing_inode.ino()); + } else { + self.known_missing_keys.write().unwrap().remove(parent.ino(), name); } } } @@ -1672,9 +1708,9 @@ mod tests { let client = Arc::new(MockClient::new(client_config)); let keys = &[ - format!("{prefix}dir0/file0.txt"), - format!("{prefix}dir0/sdir0/file0.txt"), - format!("{prefix}dir0/sdir0/file1.txt"), + format!("{prefix}file0.txt"), + format!("{prefix}sdir0/file0.txt"), + format!("{prefix}sdir0/file1.txt"), ]; let object_size = 30; @@ -1700,29 +1736,97 @@ mod tests { serve_lookup_from_cache: true, dir_ttl: ttl, file_ttl: ttl, + ..Default::default() }, s3_personality: S3Personality::Standard, }, ); - let dir0 = superblock - .lookup(&client, FUSE_ROOT_INODE, &OsString::from("dir0")) - .await - .expect("should exist"); - let file0 = superblock - .lookup(&client, dir0.inode.ino(), &OsString::from("file0.txt")) - .await - .expect("should exist"); + let entries = ["file0.txt", "sdir0"]; + for entry in entries { + _ = superblock + .lookup(&client, FUSE_ROOT_INODE, entry.as_ref()) + .await + .expect("should exist"); + } - client.remove_object(file0.inode.full_key()); + for key in keys { + client.remove_object(key); + } - let file0 = superblock - .lookup(&client, dir0.inode.ino(), &OsString::from("file0.txt")) - .await; - if cached { - file0.expect("file0 inode should still be served from cache"); + for entry in entries { + let lookup = superblock.lookup(&client, FUSE_ROOT_INODE, entry.as_ref()).await; + if cached { + lookup.expect("inode should still be served from cache"); + } else { + lookup.expect_err("entry should have expired, and not be found in S3"); + } + } + } + + #[test_case(true; "cached")] + #[test_case(false; "not cached")] + #[tokio::test] + async fn test_negative_lookup_with_caching(cached: bool) { + let bucket = "test_bucket"; + let prefix = "prefix/"; + let client_config = MockClientConfig { + bucket: bucket.to_string(), + part_size: 1024 * 1024, + ..Default::default() + }; + let client = Arc::new(MockClient::new(client_config)); + + let prefix = Prefix::new(prefix).expect("valid prefix"); + let ttl = if cached { + std::time::Duration::from_secs(60 * 60 * 24 * 7) // 7 days should be enough } else { - file0.expect_err("file0 entry should have expired, and not be found in S3"); + std::time::Duration::ZERO + }; + let superblock = Superblock::new( + bucket, + &prefix, + SuperblockConfig { + cache_config: CacheConfig { + serve_lookup_from_cache: true, + dir_ttl: ttl, + file_ttl: ttl, + ..Default::default() + }, + s3_personality: S3Personality::Standard, + }, + ); + + let entries = ["file0.txt", "sdir0"]; + for entry in entries { + _ = superblock + .lookup(&client, FUSE_ROOT_INODE, entry.as_ref()) + .await + .expect_err("should not exist"); + } + + let keys = &[ + format!("{prefix}file0.txt"), + format!("{prefix}sdir0/file0.txt"), + format!("{prefix}sdir0/file1.txt"), + ]; + + let object_size = 30; + let mut last_modified = OffsetDateTime::UNIX_EPOCH; + for key in keys { + let mut obj = MockObject::constant(0xaa, object_size, ETag::for_tests()); + last_modified += Duration::days(1); + obj.set_last_modified(last_modified); + client.add_object(key, obj); + } + + for entry in entries { + let lookup = superblock.lookup(&client, FUSE_ROOT_INODE, entry.as_ref()).await; + if cached { + lookup.expect_err("negative entry should still be valid in the cache, so the new key should not have been looked up in S3"); + } else { + lookup.expect("new object should have been looked up in S3"); + } } } diff --git a/mountpoint-s3/src/inode/expirable_set.rs b/mountpoint-s3/src/inode/expirable_set.rs new file mode 100644 index 000000000..b3c3159dd --- /dev/null +++ b/mountpoint-s3/src/inode/expirable_set.rs @@ -0,0 +1,70 @@ +use std::time::Duration; + +use linked_hash_map::LinkedHashMap; + +use super::{expiry::Expiry, InodeNo}; + +/// A bounded set of (parent_ino, child_name) entries that expire after a fixed time. +#[derive(Debug)] +pub struct ExpirableSet { + map: LinkedHashMap, + size: usize, + max_size: usize, + validity: Duration, +} + +#[derive(Debug, Hash, PartialEq, Eq)] +struct Key { + parent_ino: InodeNo, + child_name: String, +} + +impl ExpirableSet { + pub fn new(max_size: usize, validity: Duration) -> Self { + Self { + map: Default::default(), + size: 0, + max_size, + validity, + } + } + + pub fn contains(&self, parent_ino: InodeNo, child_name: &str) -> bool { + let key = Key { + parent_ino, + child_name: child_name.to_owned(), + }; + let Some(value) = self.map.get(&key) else { + return false; + }; + value.is_valid() + } + + pub fn remove(&mut self, parent_ino: InodeNo, child_name: &str) { + let key = Key { + parent_ino, + child_name: child_name.to_owned(), + }; + if self.map.remove(&key).is_some() { + self.size -= 1; + } + } + + pub fn insert(&mut self, parent_ino: InodeNo, child_name: &str) { + let expiry = Expiry::from_now(self.validity); + let key = Key { + parent_ino, + child_name: child_name.to_owned(), + }; + if self.map.insert(key, expiry).is_none() { + self.size += 1; + + while self.size > self.max_size || self.map.front().is_some_and(|(_, e)| !e.is_valid()) { + if self.map.pop_front().is_none() { + break; + } + self.size -= 1; + } + } + } +} diff --git a/mountpoint-s3/src/main.rs b/mountpoint-s3/src/main.rs index 34cc7ea81..ce41f1f51 100644 --- a/mountpoint-s3/src/main.rs +++ b/mountpoint-s3/src/main.rs @@ -585,6 +585,7 @@ fn mount(args: CliArgs) -> anyhow::Result { serve_lookup_from_cache: true, dir_ttl: metadata_cache_ttl, file_ttl: metadata_cache_ttl, + ..Default::default() }; let cache_config = match args.max_cache_size { diff --git a/mountpoint-s3/tests/direct_io.rs b/mountpoint-s3/tests/direct_io.rs index 4586b6a29..7ddfac2b4 100644 --- a/mountpoint-s3/tests/direct_io.rs +++ b/mountpoint-s3/tests/direct_io.rs @@ -31,6 +31,7 @@ where serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, ..Default::default() }, diff --git a/mountpoint-s3/tests/fs.rs b/mountpoint-s3/tests/fs.rs index cf3a85d07..3316c5c7b 100644 --- a/mountpoint-s3/tests/fs.rs +++ b/mountpoint-s3/tests/fs.rs @@ -177,6 +177,7 @@ async fn test_lookup_negative_cached() { serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, ..Default::default() }; @@ -192,29 +193,52 @@ async fn test_lookup_negative_cached() { assert_eq!(head_counter.count(), 1); assert_eq!(list_counter.count(), 1); - // Check no negative caching + // Check negative caching let _ = fs .lookup(FUSE_ROOT_INODE, "file1.txt".as_ref()) .await .expect_err("should fail as no object exists"); - assert_eq!(head_counter.count(), 2); - assert_eq!(list_counter.count(), 2); + assert_eq!(head_counter.count(), 1); + assert_eq!(list_counter.count(), 1); client.add_object("file1.txt", MockObject::constant(0xa1, 15, ETag::for_tests())); let _ = fs .lookup(FUSE_ROOT_INODE, "file1.txt".as_ref()) .await - .expect("should succeed as object exists and no negative cache"); - assert_eq!(head_counter.count(), 3); - assert_eq!(list_counter.count(), 3); + .expect_err("should fail as mountpoint should use negative cache"); + assert_eq!(head_counter.count(), 1); + assert_eq!(list_counter.count(), 1); + + // Use readdirplus to discover the new file + { + let dir_handle = fs.opendir(FUSE_ROOT_INODE, 0).await.unwrap().fh; + let mut reply = Default::default(); + let _reply = fs + .readdirplus(FUSE_ROOT_INODE, dir_handle, 0, &mut reply) + .await + .unwrap(); + + assert_eq!(reply.entries.len(), 2 + 1); + + let mut entries = reply.entries.iter(); + let _ = entries.next().expect("should have current directory"); + let _ = entries.next().expect("should have parent directory"); + + let entry = entries.next().expect("should have file1.txt in entries"); + let expected = OsString::from("file1.txt"); + assert_eq!(entry.name, expected); + assert_eq!(entry.attr.kind, FileType::RegularFile); + } + assert_eq!(head_counter.count(), 1); + assert_eq!(list_counter.count(), 2); let _ = fs .lookup(FUSE_ROOT_INODE, "file1.txt".as_ref()) .await .expect("should succeed as object is cached and exists"); - assert_eq!(head_counter.count(), 3); - assert_eq!(list_counter.count(), 3); + assert_eq!(head_counter.count(), 1); + assert_eq!(list_counter.count(), 2); } #[tokio::test] @@ -224,6 +248,7 @@ async fn test_lookup_then_open_cached() { serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, ..Default::default() }; @@ -289,6 +314,7 @@ async fn test_readdir_then_open_cached() { serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, ..Default::default() }; @@ -339,6 +365,7 @@ async fn test_unlink_cached() { serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, allow_delete: true, ..Default::default() @@ -388,6 +415,7 @@ async fn test_mknod_cached() { serve_lookup_from_cache: true, dir_ttl: Duration::from_secs(600), file_ttl: Duration::from_secs(600), + ..Default::default() }, ..Default::default() }; diff --git a/mountpoint-s3/tests/fuse_tests/lookup_test.rs b/mountpoint-s3/tests/fuse_tests/lookup_test.rs index bcdab723d..6718701df 100644 --- a/mountpoint-s3/tests/fuse_tests/lookup_test.rs +++ b/mountpoint-s3/tests/fuse_tests/lookup_test.rs @@ -114,6 +114,7 @@ where serve_lookup_from_cache: false, file_ttl: Duration::ZERO, dir_ttl: Duration::ZERO, + ..Default::default() }, ..Default::default() }; diff --git a/mountpoint-s3/tests/reftests/harness.rs b/mountpoint-s3/tests/reftests/harness.rs index 235f73386..9141ba23e 100644 --- a/mountpoint-s3/tests/reftests/harness.rs +++ b/mountpoint-s3/tests/reftests/harness.rs @@ -878,6 +878,7 @@ mod mutations { serve_lookup_from_cache: false, dir_ttl: Duration::ZERO, file_ttl: Duration::ZERO, + ..Default::default() }, ..Default::default() };