From 83ee20e7b1084b18dad8497a8b087ba46988a75e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jo=C3=A3o=20M=2E=20Bezerra?=
Date: Sun, 2 May 2021 17:44:00 -0300
Subject: [PATCH] Ignore nested files when calculating the total

For the nested files:
- folder/ (5 MB)
- folder/big_file (15 MB)

The --total now outputs 15 MB instead of the previous 20 MB, because the
inner file is inside of the folder that was also passed as an argument.

Implemented with the Trie data structure, made of BTreeMap and PathBufs
that represent each path component of the canonicalized file paths.

Fixes #12.
---
 src/lib.rs  | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/main.rs |   4 +-
 2 files changed, 106 insertions(+), 2 deletions(-)

diff --git a/src/lib.rs b/src/lib.rs
index 16bf1bc..0bcc720 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -3,6 +3,7 @@ use number_prefix::NumberPrefix;
 use walkdir::WalkDir;
 
 use std::{
+    collections::BTreeMap,
     fmt::Display,
     path::{Path, PathBuf},
 };
@@ -31,6 +32,7 @@ impl ResultExt for Result {
     }
 }
 
+#[derive(PartialEq, Eq, PartialOrd, Ord)]
 pub struct Entry {
     pub path: PathBuf,
     pub size: u64,
@@ -91,3 +93,105 @@ pub fn format_size(size: u64, binary: bool) -> String {
         NumberPrefix::Prefixed(prefix, number) => format!("{:.2} {}B", number, prefix),
     }
 }
+
+/// Calculate the sum of sizes of all entries
+///
+/// Ignore nested files when calculating the total
+///
+/// For the nested files:
+/// - `folder/ (5 MB)`
+/// - `folder/big_file (15 MB)`
+///
+/// The total is 15 MB instead of 20 MB because the inner file is inside of the
+/// folder that was also received as an argument
+///
+/// Implemented with the Trie data structure, made of BTreeMap and PathBufs
+/// that represent each path component of the canonicalized file paths
+pub fn calculate_unique_total_size(entries: &[Entry]) -> u64 {
+    // Entries, but with canonicalized paths
+    let entries = {
+        let mut new_entries: Vec<(PathBuf, &Entry)> = vec![];
+
+        for entry in entries {
+            // Log errors and ignore them in the total sum
+            let canonical_path = entry.path.canonicalize().log_err(Some(&entry.path));
+            if let Ok(path) = canonical_path {
+                new_entries.push((path, entry));
+            }
+        }
+        new_entries
+    };
+
+    #[derive(PartialEq, Eq, PartialOrd, Ord)]
+    struct TriePathNode {
+        // Children nodes of this current path, accessed by path
+        children: BTreeMap<PathBuf, TriePathNode>,
+        // Size of the file that ends at this node
+        node_size: u64,
+    }
+
+    let mut trie_root = TriePathNode {
+        children: BTreeMap::new(),
+        node_size: 0,
+    };
+
+    // For each entry/path, add it to the Trie if it wasn't already inserted
+    //
+    // If the Trie receives a folder that is parent of a previously added file, then just consider
+    // the parent folder, removing the children; this way, we do not count them twice towards the
+    // final total
+    for (path, entry) in entries {
+        // Necessary because we need to check when it's the last path piece
+        let mut path_iter = path.iter().peekable();
+        // Pointer to traverse the tree
+        let mut current_trie_node = &mut trie_root;
+        // Size to be added at the end if the current entry isn't a child of any other
+        let size_of_current_file = entry.size;
+
+        while let Some(piece) = path_iter.next() {
+            // Query for the node in the Trie which matches the current path piece
+            let entry = current_trie_node.children.entry(PathBuf::from(piece));
+
+            let mut is_current_node_size_zero = true;
+            // Keeps track if the current entry is a child of another previously found
+            let next_trie_node = entry
+                .and_modify(|next_node| {
+                    // If we are in this block, it means that this node was already present in the
+                    // trie tree
+                    is_current_node_size_zero = next_node.node_size == 0;
+                })
+                // Add a node with 0 size, which is only changed afterwards if it's the last piece
+                .or_insert(TriePathNode {
+                    children: BTreeMap::new(),
+                    node_size: 0,
+                });
+
+            // Skipping current entry, because it's nested inside an already accounted file, or is
+            // a repeated file
+            if !is_current_node_size_zero {
+                break;
+            }
+
+            // If we are at the last piece of the current entry path, it means that this is the tip
+            // that finally represents the file, and which path is the full file path
+            let is_the_last_piece = path_iter.peek().is_none();
+            if is_the_last_piece {
+                // Update the size of the last trie node for this piece
+                next_trie_node.node_size = size_of_current_file;
+                // Drop all the children so that their sizes won't be added twice
+                next_trie_node.children.clear();
+            }
+
+            // Update the pointer to keep traversing the trie
+            current_trie_node = next_trie_node;
+        }
+    }
+
+    fn trie_recursive_sum(node: &TriePathNode) -> u64 {
+        let children_sum: u64 = node.children.values().map(trie_recursive_sum).sum();
+        node.node_size + children_sum
+    }
+
+    // Traverse the trie tree to calculate the sum
+    trie_recursive_sum(&trie_root)
+}
diff --git a/src/main.rs b/src/main.rs
index b5ebd82..a58c9cb 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -4,7 +4,7 @@ use wild;
 mod cli;
 use cli::Cli;
 
-use durt::{format_size, Entry};
+use durt::{calculate_unique_total_size, format_size, Entry};
 
 fn main() {
     #[cfg(windows)]
@@ -60,7 +60,7 @@ fn main() {
         Table::new(" {:>} {:<}")
     };
 
-    let total_size = entries.iter().map(|e| e.size).sum();
+    let total_size = calculate_unique_total_size(&entries);
     let mut omitted_entries = 0;
 
     for entry in entries {