# Patch summary: add a `dvc du` command that reports disk usage of a DVC repo.
#
# dvc/cli/parser.py
#   Registers the new `du` command module in the import list and command list.
#
# dvc/commands/du.py (new)
#   CmdDU.run() forwards url, path, rev, summarize, config, remote and
#   remote_config to Repo.du(), prints a table of (human-readable size, path)
#   rows and returns 0.  add_parser() declares the matching CLI arguments.
#   NOTE(review): `metavar=""` on `--rev` is empty as received; a placeholder
#   may have been lost before this copy was made -- confirm.
#
# dvc/fs/dvc.py
#   _merge_info() now also stores the raw filesystem info under "fs_info".
#   New du() walks breadth-first from info(path) using a deque:
#     * a non-directory contributes its "size" (None counts as 0);
#     * a directory that has dvc_info, no fs_info, and an entry with a known
#       size contributes entry.size and is not listed any further;
#     * any other directory is expanded with ls(..., detail=True).
#   maxdepth other than None raises NotImplementedError.  With total=True the
#   sum of both size tables is returned; otherwise only the `sizes` mapping.
#   NOTE(review): with total=False the sizes collected in `dus` are not part
#   of the returned mapping -- confirm that this is intended.
#
# dvc/repo/__init__.py
#   Exposes du on Repo as a staticmethod.
#
# dvc/repo/du.py (new)
#   Opens the repo and returns a list of (path, size) tuples: a single tuple
#   when `summarize` is set or `path` is not a directory, otherwise one tuple
#   per entry of fs.ls(path) followed by a final (path, total) tuple.
#   Changed in this revision: a `config` passed as a dict is now handed to
#   Repo.open() instead of being replaced by None (the signature accepts
#   Dict[str, Any]); the new-file hunk grew from 42 to 43 lines accordingly.
#
# tests/func/test_du.py, tests/unit/command/test_du.py (new)
#   Functional test of the reported sizes and a unit test of CLI wiring.
#
# NOTE(review): line breaks and indentation were restored from a
# whitespace-collapsed copy; blank context lines were inferred from the hunk
# line counts -- confirm against the original commit before applying.
diff --git a/dvc/cli/parser.py b/dvc/cli/parser.py
index 31e68ad4d0..eb0b456e89 100644
--- a/dvc/cli/parser.py
+++ b/dvc/cli/parser.py
@@ -19,6 +19,7 @@
     data_sync,
     destroy,
     diff,
+    du,
     experiments,
     freeze,
     gc,
@@ -95,6 +96,7 @@
     data,
     artifacts,
     studio,
+    du,
 ]
 
 
diff --git a/dvc/commands/du.py b/dvc/commands/du.py
new file mode 100644
index 0000000000..11f067c32b
--- /dev/null
+++ b/dvc/commands/du.py
@@ -0,0 +1,80 @@
+import argparse
+import logging
+
+from dvc.cli import completion
+from dvc.cli.command import CmdBaseNoRepo
+from dvc.cli.utils import DictAction, append_doc_link
+from dvc.ui import ui
+
+logger = logging.getLogger(__name__)
+
+
+class CmdDU(CmdBaseNoRepo):
+    def run(self):
+        from dvc.repo import Repo
+        from dvc.utils.humanize import naturalsize
+
+        entries = Repo.du(
+            self.args.url,
+            self.args.path,
+            rev=self.args.rev,
+            summarize=self.args.summarize,
+            config=self.args.config,
+            remote=self.args.remote,
+            remote_config=self.args.remote_config,
+        )
+        ui.table([(naturalsize(size), path) for path, size in entries])
+        return 0
+
+
+def add_parser(subparsers, parent_parser):
+    DU_HELP = "Show disk usage."
+    du_parser = subparsers.add_parser(
+        "du",
+        parents=[parent_parser],
+        description=append_doc_link(DU_HELP, "du"),
+        help=DU_HELP,
+        formatter_class=argparse.RawTextHelpFormatter,
+    )
+    du_parser.add_argument("url", help="Location of DVC repository")
+    du_parser.add_argument(
+        "--rev",
+        nargs="?",
+        help="Git revision (e.g. SHA, branch, tag)",
+        metavar="",
+    )
+    du_parser.add_argument(
+        "-s",
+        "--summarize",
+        action="store_true",
+        help="Show total disk usage.",
+    )
+    du_parser.add_argument(
+        "--config",
+        type=str,
+        help=(
+            "Path to a config file that will be merged with the config "
+            "in the target repository."
+        ),
+    )
+    du_parser.add_argument(
+        "--remote",
+        type=str,
+        help="Remote name to set as a default in the target repository.",
+    )
+    du_parser.add_argument(
+        "--remote-config",
+        type=str,
+        nargs="*",
+        action=DictAction,
+        help=(
+            "Remote config options to merge with a remote's config (default or one "
+            "specified by '--remote') in the target repository."
+        ),
+    )
+    du_parser.add_argument(
+        "path",
+        nargs="?",
+        help="Path to directory within the repository",
+    ).complete = completion.DIR
+    du_parser.set_defaults(func=CmdDU)
diff --git a/dvc/fs/dvc.py b/dvc/fs/dvc.py
index ccc2c944ab..25926c45fe 100644
--- a/dvc/fs/dvc.py
+++ b/dvc/fs/dvc.py
@@ -4,6 +4,7 @@
 import os
 import posixpath
 import threading
+from collections import deque
 from contextlib import ExitStack, suppress
 from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, Type, Union
 
@@ -60,6 +61,7 @@ def _merge_info(repo, key, fs_info, dvc_info):
     if fs_info:
         ret["type"] = fs_info["type"]
         ret["size"] = fs_info["size"]
+        ret["fs_info"] = fs_info
         isexec = False
         if fs_info["type"] == "file":
             isexec = utils.is_exec(fs_info["mode"])
@@ -421,6 +423,45 @@ def get_file(self, rpath, lpath, **kwargs):
         dvc_path = _get_dvc_path(dvc_fs, subkey)
         return dvc_fs.get_file(dvc_path, lpath, **kwargs)
 
+    def du(self, path, total=True, maxdepth=None, withdirs=False, **kwargs):
+        if maxdepth is not None:
+            raise NotImplementedError
+
+        sizes = {}
+        dus = {}
+        todo = deque([self.info(path)])
+        while todo:
+            info = todo.popleft()
+            isdir = info["type"] == "directory"
+            size = info["size"] or 0
+            name = info["name"]
+
+            if not isdir:
+                sizes[name] = size
+                continue
+
+            dvc_info = info.get("dvc_info") or {}
+            fs_info = info.get("fs_info")
+            entry = dvc_info.get("entry")
+            if (
+                dvc_info
+                and not fs_info
+                and entry is not None
+                and entry.size is not None
+            ):
+                dus[name] = entry.size
+                continue
+
+            if withdirs:
+                sizes[name] = size
+
+            todo.extend(self.ls(info["name"], detail=True))
+
+        if total:
+            return sum(sizes.values()) + sum(dus.values())
+
+        return sizes
+
     def close(self):
         self._repo_stack.close()
 
diff --git a/dvc/repo/__init__.py b/dvc/repo/__init__.py
index f51ec4d349..071e083af3 100644
--- a/dvc/repo/__init__.py
+++ b/dvc/repo/__init__.py
@@ -70,6 +70,7 @@ class Repo:
     from dvc.repo.commit import commit  # type: ignore[misc]
     from dvc.repo.destroy import destroy  # type: ignore[misc]
     from dvc.repo.diff import diff  # type: ignore[misc]
+    from dvc.repo.du import du as _du  # type: ignore[misc]
     from dvc.repo.fetch import fetch  # type: ignore[misc]
     from dvc.repo.freeze import freeze, unfreeze  # type: ignore[misc]
     from dvc.repo.gc import gc  # type: ignore[misc]
@@ -93,6 +94,7 @@ class Repo:
     from .cache import check_missing as cache_check_missing  # type: ignore[misc]
     from .data import status as data_status  # type: ignore[misc]
 
+    du = staticmethod(_du)
     ls = staticmethod(_ls)
     ls_url = staticmethod(_ls_url)
     get = staticmethod(_get)
diff --git a/dvc/repo/du.py b/dvc/repo/du.py
new file mode 100644
index 0000000000..f511480881
--- /dev/null
+++ b/dvc/repo/du.py
@@ -0,0 +1,43 @@
+from typing import Any, Dict, Optional, Union
+
+
+def du(
+    url: str,
+    path: Optional[str] = None,
+    rev: Optional[str] = None,
+    summarize: bool = False,
+    config: Union[None, Dict[str, Any], str] = None,
+    remote: Optional[str] = None,
+    remote_config: Optional[dict] = None,
+):
+    from dvc.config import Config
+
+    from . import Repo
+
+    if config and not isinstance(config, dict):
+        config_dict = Config.load_file(config)
+    else:
+        # A dict is already parsed config: pass it through instead of dropping it.
+        config_dict = config or None
+
+    with Repo.open(
+        url,
+        rev=rev,
+        subrepos=True,
+        uninitialized=True,
+        config=config_dict,
+        remote=remote,
+        remote_config=remote_config,
+    ) as repo:
+        path = path or ""
+
+        fs = repo.dvcfs
+
+        if summarize or not fs.isdir(path):
+            return [(path, fs.du(path, total=True))]
+
+        ret = [
+            (entry_path, fs.du(entry_path, total=True)) for entry_path in fs.ls(path)
+        ]
+        ret.append((path, sum(entry[1] for entry in ret)))
+        return ret
diff --git a/tests/func/test_du.py b/tests/func/test_du.py
new file mode 100644
index 0000000000..9496f8ab56
--- /dev/null
+++ b/tests/func/test_du.py
@@ -0,0 +1,49 @@
+import os
+
+
+def test_du(tmp_dir, dvc):
+    tmp_dir.gen(
+        {
+            "file": b"file",
+            "dvcfile": b"dvcfile",
+            "dir": {
+                "dirfile": b"dirfile",
+                "subdir": {
+                    "subdirfile": b"subdirfile",
+                },
+                "dvcsubdir": {
+                    "dvcsubdirfile": b"dvcsubdirfile",
+                },
+            },
+        }
+    )
+
+    dvc.add("dvcfile")
+    dvc.add(os.path.join("dir", "dvcsubdir"))
+
+    assert dvc.du(".", "file") == [("file", 4)]
+    assert dvc.du(".", "dvcfile") == [("dvcfile", 7)]
+    assert set(dvc.du(".", "dir/subdir")) == {
+        ("dir/subdir/subdirfile", 10),
+        ("dir/subdir", 10),
+    }
+    assert dvc.du(".", "dir/subdir", summarize=True) == [("dir/subdir", 10)]
+    assert set(dvc.du(".", "dir/dvcsubdir")) == {
+        ("dir/dvcsubdir/dvcsubdirfile", 13),
+        ("dir/dvcsubdir", 13),
+    }
+    assert dvc.du(".", "dir/dvcsubdir", summarize=True) == [("dir/dvcsubdir", 13)]
+    assert set(dvc.du(".", "dir")) == {
+        ("dir/dvcsubdir", 13),
+        ("dir/subdir", 10),
+        ("dir/dirfile", 7),
+        ("dir", 30),
+    }
+    assert dvc.du(".", "dir", summarize=True) == [("dir", 30)]
+    assert set(dvc.du(".", "/")) == {
+        ("/dvcfile", 7),
+        ("/dir", 30),
+        ("/file", 4),
+        ("/", 41),
+    }
+    assert dvc.du(".", "/", summarize=True) == [("/", 41)]
diff --git a/tests/unit/command/test_du.py b/tests/unit/command/test_du.py
new file mode 100644
index 0000000000..6ad99494e8
--- /dev/null
+++ b/tests/unit/command/test_du.py
@@ -0,0 +1,21 @@
+from dvc.cli import parse_args
+from dvc.commands.du import CmdDU
+
+
+def test_du(mocker):
+    cli_args = parse_args(["du", "myurl", "mypath", "--summarize", "--rev", "myrev"])
+    assert cli_args.func == CmdDU
+
+    cmd = cli_args.func(cli_args)
+    mock_du = mocker.patch("dvc.repo.Repo.du")
+
+    assert cmd.run() == 0
+    mock_du.assert_called_once_with(
+        "myurl",
+        "mypath",
+        rev="myrev",
+        summarize=True,
+        config=None,
+        remote=None,
+        remote_config=None,
+    )