diff --git a/docs/changelog/next_release/325.feature.rst b/docs/changelog/next_release/325.feature.rst
new file mode 100644
index 000000000..2ffd32725
--- /dev/null
+++ b/docs/changelog/next_release/325.feature.rst
@@ -0,0 +1 @@
+Introduce ``FileSizeRange(min=..., max=...)`` filter class. Now users can set ``FileDownloader`` / ``FileMover`` to download/move only files within a specific file size range.
diff --git a/docs/file/file_filters/file_size_filter.rst b/docs/file/file_filters/file_size_filter.rst
new file mode 100644
index 000000000..ba02b6bc4
--- /dev/null
+++ b/docs/file/file_filters/file_size_filter.rst
@@ -0,0 +1,9 @@
+.. _file-size-range:
+
+FileSizeRange
+=============
+
+.. currentmodule:: onetl.file.filter.file_size
+
+.. autoclass:: FileSizeRange
+    :members: match
diff --git a/docs/file/file_filters/index.rst b/docs/file/file_filters/index.rst
index a76fedf3c..575a31667 100644
--- a/docs/file/file_filters/index.rst
+++ b/docs/file/file_filters/index.rst
@@ -10,6 +10,7 @@ File Filters
     glob
     regexp
     exclude_dir
+    file_size_filter
 
 .. toctree::
     :maxdepth: 1
diff --git a/onetl/file/filter/__init__.py b/onetl/file/filter/__init__.py
index 88e2f8356..3449d3bb4 100644
--- a/onetl/file/filter/__init__.py
+++ b/onetl/file/filter/__init__.py
@@ -2,6 +2,16 @@
 # SPDX-License-Identifier: Apache-2.0
 from onetl.file.filter.exclude_dir import ExcludeDir
 from onetl.file.filter.file_hwm import FileHWMFilter
+from onetl.file.filter.file_size import FileSizeRange
 from onetl.file.filter.glob import Glob
 from onetl.file.filter.match_all_filters import match_all_filters
 from onetl.file.filter.regexp import Regexp
+
+__all__ = [
+    "ExcludeDir",
+    "FileHWMFilter",
+    "FileSizeRange",
+    "Glob",
+    "match_all_filters",
+    "Regexp",
+]
diff --git a/onetl/file/filter/file_size.py b/onetl/file/filter/file_size.py
new file mode 100644
index 000000000..7789f1b3b
--- /dev/null
+++ b/onetl/file/filter/file_size.py
@@ -0,0 +1,114 @@
+# SPDX-FileCopyrightText: 2021-2024 MTS PJSC
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+from typing import Optional
+
+from onetl.base.path_protocol import PathWithStatsProtocol
+
+try:
+    from pydantic.v1 import ByteSize, root_validator, validator
+except (ImportError, AttributeError):
+    from pydantic import ByteSize, root_validator, validator  # type: ignore[no-redef, assignment]
+
+from onetl.base import BaseFileFilter, PathProtocol
+from onetl.impl import FrozenModel
+
+
+class FileSizeRange(BaseFileFilter, FrozenModel):
+    """Filter files matching a specified size.
+
+    If file size doesn't match boundaries, it will be excluded.
+    Doesn't affect directories or paths without defined size.
+
+    .. versionadded:: 0.13.0
+
+    .. note::
+
+        SI unit prefixes mean that ``1KB`` == ``1 kilobyte`` == ``1000 bytes``.
+        If you need ``1024 bytes``, use ``1 KiB`` == ``1 kibibyte``.
+
+    Parameters
+    ----------
+
+    min : int or str, optional
+
+        Minimal allowed file size. ``None`` means no limit.
+
+        If file size is less than this value, it will be excluded.
+
+    max : int or str, optional
+
+        Maximum allowed file size. ``None`` means no limit.
+
+        If file size is greater than this value, it will be excluded.
+
+    Examples
+    --------
+
+    Specify min and max file sizes:
+
+    .. code:: python
+
+        from onetl.file.filter import FileSizeRange
+
+        file_size = FileSizeRange(min="1KiB", max="100MiB")
+
+    Specify only min file size:
+
+    .. code:: python
+
+        from onetl.file.filter import FileSizeRange
+
+        file_size = FileSizeRange(min="1KiB")
+
+    Specify only max file size:
+
+    .. code:: python
+
+        from onetl.file.filter import FileSizeRange
+
+        file_size = FileSizeRange(max="100MiB")
+    """
+
+    min: Optional[ByteSize] = None
+    max: Optional[ByteSize] = None
+
+    # skip_on_failure: do not report a misleading "Either min or max" error
+    # when a field validator has already rejected the only value passed
+    @root_validator(skip_on_failure=True)
+    def _validate_min_max(cls, values):
+        min_value = values.get("min")
+        max_value = values.get("max")
+
+        if min_value is None and max_value is None:
+            raise ValueError("Either min or max must be specified")
+
+        if min_value is not None and max_value is not None and min_value > max_value:
+            raise ValueError("Min size cannot be greater than max size")
+
+        return values
+
+    @validator("min", "max")
+    def _validate_non_negative(cls, value):
+        if value is not None and value < 0:
+            raise ValueError("size cannot be negative")
+        return value
+
+    def __repr__(self):
+        min_human_readable = self.min.human_readable() if self.min is not None else None
+        max_human_readable = self.max.human_readable() if self.max is not None else None
+        return f"{self.__class__.__name__}(min={min_human_readable!r}, max={max_human_readable!r})"
+
+    def match(self, path: PathProtocol) -> bool:
+        # directories and paths without stats are always passed through
+        if path.is_file() and isinstance(path, PathWithStatsProtocol):
+            file_size = path.stat().st_size
+
+            if self.min is not None and file_size < self.min:
+                return False
+
+            if self.max is not None and file_size > self.max:
+                return False
+
+        return True
diff --git a/setup.cfg b/setup.cfg
index 3fcecbbaa..3d6d37b61 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -281,7 +281,9 @@ ignore =
 # WPS412 Found `__init__.py` module with logic
     WPS412,
 # WPS413 Found bad magic module function: __getattr__
-    WPS413
+    WPS413,
+# WPS338 Found incorrect order of methods in a class
+    WPS338
 
 # http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores
 per-file-ignores =
diff --git a/tests/tests_unit/test_file/test_filter/test_file_size_range.py b/tests/tests_unit/test_file/test_filter/test_file_size_range.py
new file mode 100644
index 000000000..077e21095
--- /dev/null
+++ b/tests/tests_unit/test_file/test_filter/test_file_size_range.py
@@ -0,0 +1,64 @@
+import pytest
+
+from onetl.file.filter import FileSizeRange
+from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat
+
+
+def test_file_size_range_invalid():
+    with pytest.raises(ValueError, match="Either min or max must be specified"):
+        FileSizeRange()
+
+    with pytest.raises(ValueError, match="size cannot be negative"):
+        FileSizeRange(min=-1)
+
+    with pytest.raises(ValueError, match="size cannot be negative"):
+        FileSizeRange(max=-1)
+
+    with pytest.raises(ValueError, match="Min size cannot be greater than max size"):
+        FileSizeRange(min="10KB", max="1KB")
+
+    with pytest.raises(ValueError, match="could not parse value and unit from byte string"):
+        FileSizeRange(min="wtf")
+    with pytest.raises(ValueError, match="could not parse value and unit from byte string"):
+        FileSizeRange(max="wtf")
+
+
+def test_file_size_range_repr():
+    assert repr(FileSizeRange(min="10KiB", max="10GiB")) == "FileSizeRange(min='10.0KiB', max='10.0GiB')"
+
+
+@pytest.mark.parametrize(
+    ["size", "expected"],
+    [
+        ("10", 10),
+        ("10B", 10),
+        ("10KB", 10_000),
+        ("10KiB", 10 * 1024),
+        ("10MB", 10_000_000),
+        ("10MiB", 10 * 1024 * 1024),
+        ("10GB", 10_000_000_000),
+        ("10GiB", 10 * 1024 * 1024 * 1024),
+    ],
+)
+def test_file_size_range_parse_units(size: str, expected: int):
+    assert FileSizeRange(min=size.replace("B", "b")).min == expected
+    assert FileSizeRange(min=size).min == expected
+    assert FileSizeRange(max=size.replace("B", "b")).max == expected
+    assert FileSizeRange(max=size).max == expected
+
+
+@pytest.mark.parametrize(
+    "matched, path",
+    [
+        (False, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=1024, st_mtime=50))),
+        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=10 * 1024, st_mtime=50))),
+        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=15 * 1024, st_mtime=50))),
+        (True, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=20 * 1024, st_mtime=50))),
+        (False, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=30 * 1024, st_mtime=50))),
+        (True, RemoteDirectory("some")),
+    ],
+)
+def test_file_size_range_match(matched, path):
+    file_filter = FileSizeRange(min="10Kib", max="20Kib")
+
+    assert file_filter.match(path) == matched