Skip to content

Commit

Permalink
[DOP-22143] Introduce FileSizeRange
Browse files Browse the repository at this point in the history
  • Loading branch information
dolfinus committed Jan 13, 2025
1 parent dc1bc9a commit 6174840
Show file tree
Hide file tree
Showing 7 changed files with 197 additions and 1 deletion.
1 change: 1 addition & 0 deletions docs/changelog/next_release/325.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Introduce the ``FileSizeRange(min=..., max=...)`` filter class. Users can now configure ``FileDownloader`` / ``FileMover`` to download/move only files whose size falls within a specific range.
9 changes: 9 additions & 0 deletions docs/file/file_filters/file_size_filter.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
.. _file-size-range:

FileSizeRange
=============

.. currentmodule:: onetl.file.filter.file_size

.. autoclass:: FileSizeRange
:members: match
1 change: 1 addition & 0 deletions docs/file/file_filters/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ File Filters
glob
regexp
exclude_dir
file_size_filter

.. toctree::
:maxdepth: 1
Expand Down
10 changes: 10 additions & 0 deletions onetl/file/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,16 @@
# SPDX-License-Identifier: Apache-2.0
from onetl.file.filter.exclude_dir import ExcludeDir
from onetl.file.filter.file_hwm import FileHWMFilter
from onetl.file.filter.file_size import FileSizeRange
from onetl.file.filter.glob import Glob
from onetl.file.filter.match_all_filters import match_all_filters
from onetl.file.filter.regexp import Regexp

# Public API of this package: every name listed here is imported above.
__all__ = [
"ExcludeDir",
"FileHWMFilter",
"FileSizeRange",
"Glob",
"match_all_filters",
"Regexp",
]
109 changes: 109 additions & 0 deletions onetl/file/filter/file_size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC
# SPDX-License-Identifier: Apache-2.0
from __future__ import annotations

from typing import Optional

from onetl.base.path_protocol import PathWithStatsProtocol

try:
from pydantic.v1 import ByteSize, root_validator, validator
except (ImportError, AttributeError):
from pydantic import ByteSize, root_validator, validator # type: ignore[no-redef, assignment]

Check warning on line 12 in onetl/file/filter/file_size.py

View check run for this annotation

Codecov / codecov/patch

onetl/file/filter/file_size.py#L11-L12

Added lines #L11 - L12 were not covered by tests

from onetl.base import BaseFileFilter, PathProtocol
from onetl.impl import FrozenModel


class FileSizeRange(BaseFileFilter, FrozenModel):
    """Filter files whose size lies within the ``[min, max]`` range (bounds inclusive).

    If file size doesn't match boundaries, it will be excluded.
    Doesn't affect directories or paths without defined size.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes mean that ``1KB`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 KiB`` == ``1 kibibyte``.

    Parameters
    ----------

    min : int or str, optional

        Minimal allowed file size. ``None`` means no limit.
        If file size is less than this value, it will be excluded.

    max : int or str, optional

        Maximum allowed file size. ``None`` means no limit.
        If file size is greater than this value, it will be excluded.

    At least one of ``min`` and ``max`` must be specified, neither may be negative,
    and ``min`` cannot be greater than ``max``; otherwise validation fails
    with ``ValueError``.

    Examples
    --------

    Specify min and max file sizes:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(min="1KiB", max="100MiB")

    Specify only min file size:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(min="1KiB")

    Specify only max file size:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(max="100MiB")
    """

    min: Optional[ByteSize] = None
    max: Optional[ByteSize] = None

    @root_validator
    def _validate_min_max(cls, values):
        """Ensure at least one bound is set and that ``min`` does not exceed ``max``."""
        min_value = values.get("min")
        max_value = values.get("max")

        if min_value is None and max_value is None:
            raise ValueError("Either min or max must be specified")

        # Compare against None explicitly: a bound of 0 is falsy, so a plain
        # truthiness check would silently accept e.g. min=10, max=0.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Min size cannot be greater than max size")

        return values

    @validator("min", "max")
    def _validate_min(cls, value):
        """Reject negative sizes; applied to both ``min`` and ``max``."""
        if value is not None and value < 0:
            # NOTE(review): the message contains a typo ("not ne", presumably
            # "cannot be"); it is kept verbatim because unit tests match on it.
            raise ValueError("size not ne negative")
        return value

    def __repr__(self):
        """Return ``FileSizeRange(min=..., max=...)`` with human-readable sizes."""
        min_human_readable = self.min.human_readable() if self.min is not None else None
        max_human_readable = self.max.human_readable() if self.max is not None else None
        return f"{self.__class__.__name__}(min={min_human_readable!r}, max={max_human_readable!r})"

    def match(self, path: PathProtocol) -> bool:
        """Return ``False`` only for files whose size is outside the range.

        Paths that are not files, or that do not implement
        ``PathWithStatsProtocol``, always match.
        """
        if not (path.is_file() and isinstance(path, PathWithStatsProtocol)):
            return True

        file_size = path.stat().st_size

        if self.min is not None and file_size < self.min:
            return False

        if self.max is not None and file_size > self.max:
            return False

        return True
4 changes: 3 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -281,7 +281,9 @@ ignore =
# WPS412 Found `__init__.py` module with logic
WPS412,
# WPS413 Found bad magic module function: __getattr__
WPS413
WPS413,
# WPS338 Found incorrect order of methods in a class
WPS338

# http://flake8.pycqa.org/en/latest/user/options.html?highlight=per-file-ignores#cmdoption-flake8-per-file-ignores
per-file-ignores =
Expand Down
64 changes: 64 additions & 0 deletions tests/tests_unit/test_file/test_filter/test_file_size_range.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import pytest

from onetl.file.filter import FileSizeRange
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat


def test_file_size_range_invalid():
    """Each invalid argument combination is rejected with the expected message."""
    negative_error = "size not ne negative"
    parse_error = "could not parse value and unit from byte string"

    # (constructor kwargs, regex the raised error must match), checked in order.
    rejected = [
        ({}, "Either min or max must be specified"),
        ({"min": -1}, negative_error),
        ({"max": -1}, negative_error),
        ({"min": "10KB", "max": "1KB"}, "Min size cannot be greater than max size"),
        ({"min": "wtf"}, parse_error),
        ({"max": "wtf"}, parse_error),
    ]

    for kwargs, message in rejected:
        with pytest.raises(ValueError, match=message):
            FileSizeRange(**kwargs)


def test_file_size_range_repr():
    """``repr()`` renders both bounds as human-readable sizes."""
    size_range = FileSizeRange(min="10KiB", max="10GiB")
    expected = "FileSizeRange(min='10.0KiB', max='10.0GiB')"

    assert repr(size_range) == expected


@pytest.mark.parametrize(
    "input, expected",
    [
        ("10", 10),
        ("10B", 10),
        ("10KB", 10 * 1000),
        ("10KiB", 10 * 1024),
        ("10MB", 10 * 1000**2),
        ("10MiB", 10 * 1024**2),
        ("10GB", 10 * 1000**3),
        ("10GiB", 10 * 1024**3),
    ],
)
def test_file_size_range_parse_units(input: str, expected: int):
    """Size strings parse to the expected byte count for both ``min`` and ``max``."""
    # Also try the same string with every "B" replaced by a lowercase "b".
    lowercase_unit = input.replace("B", "b")

    for bound in ("min", "max"):
        for raw_size in (lowercase_unit, input):
            size_range = FileSizeRange(**{bound: raw_size})
            assert getattr(size_range, bound) == expected


@pytest.mark.parametrize(
    ["matched", "path"],
    [
        # Files of 1, 10, 15, 20 and 30 KiB checked against a 10..20 KiB range.
        *(
            (is_match, RemoteFile(path="file.csv", stats=RemotePathStat(st_size=size_kib * 1024, st_mtime=50)))
            for is_match, size_kib in [(False, 1), (True, 10), (True, 15), (True, 20), (False, 30)]
        ),
        # A directory is expected to match.
        (True, RemoteDirectory("some")),
    ],
)
def test_file_size_range_match(matched, path):
    """Files match only when their size is inside the range, bounds included."""
    size_filter = FileSizeRange(min="10Kib", max="20Kib")
    outcome = size_filter.match(path)

    assert outcome == matched

0 comments on commit 6174840

Please sign in to comment.