-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
7 changed files
with
197 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
Introduce ``FileSizeRange(min=..., max=...)`` filter class. Users can now configure ``FileDownloader`` / ``FileMover`` to download/move only files whose size falls within a specific range. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
.. _file-size-range: | ||
|
||
FileSizeRange | ||
============= | ||
|
||
.. currentmodule:: onetl.file.filter.file_size | ||
|
||
.. autoclass:: FileSizeRange | ||
:members: match |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,6 +10,7 @@ File Filters | |
glob | ||
regexp | ||
exclude_dir | ||
file_size_filter | ||
|
||
.. toctree:: | ||
:maxdepth: 1 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,109 @@ | ||
# SPDX-FileCopyrightText: 2021-2024 MTS PJSC | ||
# SPDX-License-Identifier: Apache-2.0 | ||
from __future__ import annotations | ||
|
||
from typing import Optional | ||
|
||
from onetl.base.path_protocol import PathWithStatsProtocol | ||
|
||
try: | ||
from pydantic.v1 import ByteSize, root_validator, validator | ||
except (ImportError, AttributeError): | ||
from pydantic import ByteSize, root_validator, validator # type: ignore[no-redef, assignment] | ||
|
||
from onetl.base import BaseFileFilter, PathProtocol | ||
from onetl.impl import FrozenModel | ||
|
||
|
||
class FileSizeRange(BaseFileFilter, FrozenModel):
    """Filter files by size range.

    Files whose size is outside of the ``[min, max]`` range are excluded;
    both boundaries are inclusive. At least one of ``min`` / ``max`` must be set.

    Doesn't affect directories or paths without defined size.

    .. versionadded:: 0.13.0

    .. note::

        SI unit prefixes means that ``1KB`` == ``1 kilobyte`` == ``1000 bytes``.
        If you need ``1024 bytes``, use ``1 KiB`` == ``1 kibibyte``.

    Parameters
    ----------

    min : int or str, optional

        Minimal allowed file size. ``None`` means no limit.
        If file size is less than this value, it will be excluded.

    max : int or str, optional

        Maximum allowed file size. ``None`` means no limit.
        If file size is greater than this value, it will be excluded.

    Examples
    --------

    Specify min and max file sizes:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(min="1KiB", max="100MiB")

    Specify only min file size:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(min="1KiB")

    Specify only max file size:

    .. code:: python

        from onetl.file.filter import FileSizeRange

        file_size = FileSizeRange(max="100MiB")
    """

    min: Optional[ByteSize] = None
    max: Optional[ByteSize] = None

    @root_validator
    def _validate_min_max(cls, values):
        """Ensure at least one bound is set and that ``min`` does not exceed ``max``."""
        min_value = values.get("min")
        max_value = values.get("max")

        if min_value is None and max_value is None:
            raise ValueError("Either min or max must be specified")

        # Compare against None explicitly: a bound of 0 bytes is falsy, so a plain
        # truthiness check would silently accept e.g. min=5, max=0.
        if min_value is not None and max_value is not None and min_value > max_value:
            raise ValueError("Min size cannot be greater than max size")

        return values

    @validator("min", "max")
    def _validate_min(cls, value):
        """Reject negative sizes; applied to both ``min`` and ``max``."""
        if value is not None and value < 0:
            # NOTE(review): the message has a typo ("not ne"); it is kept verbatim
            # because callers/tests may match on this exact text.
            raise ValueError("size not ne negative")
        return value

    def __repr__(self):
        """Return ``ClassName(min=..., max=...)`` with human-readable sizes (or ``None``)."""
        min_human_readable = self.min.human_readable() if self.min is not None else None
        max_human_readable = self.max.human_readable() if self.max is not None else None
        return f"{self.__class__.__name__}(min={min_human_readable!r}, max={max_human_readable!r})"

    def match(self, path: PathProtocol) -> bool:
        """Return ``False`` only for files whose size is outside of the configured range.

        Paths which are not files, or which do not expose stats, always match.
        """
        if path.is_file() and isinstance(path, PathWithStatsProtocol):
            file_size = path.stat().st_size

            if self.min is not None and file_size < self.min:
                return False

            if self.max is not None and file_size > self.max:
                return False

        return True
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
64 changes: 64 additions & 0 deletions
64
tests/tests_unit/test_file/test_filter/test_file_size_range.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
import pytest | ||
|
||
from onetl.file.filter import FileSizeRange | ||
from onetl.impl import RemoteDirectory, RemoteFile, RemotePathStat | ||
|
||
|
||
def test_file_size_range_invalid():
    """Invalid boundaries are rejected at construction time with a matching error."""
    unparsable = "could not parse value and unit from byte string"
    invalid_cases = [
        ({}, "Either min or max must be specified"),
        ({"min": -1}, "size not ne negative"),
        ({"max": -1}, "size not ne negative"),
        ({"min": "10KB", "max": "1KB"}, "Min size cannot be greater than max size"),
        ({"min": "wtf"}, unparsable),
        ({"max": "wtf"}, unparsable),
    ]

    # Cases are checked in order; the first one that does not raise fails the test.
    for kwargs, message in invalid_cases:
        with pytest.raises(ValueError, match=message):
            FileSizeRange(**kwargs)
|
||
|
||
def test_file_size_range_repr():
    """``repr`` shows both boundaries in human-readable units."""
    size_filter = FileSizeRange(min="10KiB", max="10GiB")
    expected = "FileSizeRange(min='10.0KiB', max='10.0GiB')"

    assert repr(size_filter) == expected
|
||
|
||
@pytest.mark.parametrize(
    ["size", "expected"],
    [
        ("10", 10),
        ("10B", 10),
        ("10KB", 10_000),
        ("10KiB", 10 * 1024),
        ("10MB", 10_000_000),
        ("10MiB", 10 * 1024 * 1024),
        ("10GB", 10_000_000_000),
        ("10GiB", 10 * 1024 * 1024 * 1024),
    ],
)
def test_file_size_range_parse_units(size: str, expected: int):
    """Size strings with SI and binary units are parsed to bytes for both ``min`` and ``max``.

    Each value is checked as written and with ``B`` replaced by a lower-case ``b``.
    """
    # The argument is named ``size`` rather than ``input`` to avoid shadowing the builtin.
    lowercase_size = size.replace("B", "b")

    assert FileSizeRange(min=lowercase_size).min == expected
    assert FileSizeRange(min=size).min == expected
    assert FileSizeRange(max=lowercase_size).max == expected
    assert FileSizeRange(max=size).max == expected
|
||
|
||
def _remote_file_of_size(size_bytes):
    """Build a ``file.csv`` remote file whose stats report the given size."""
    return RemoteFile(path="file.csv", stats=RemotePathStat(st_size=size_bytes, st_mtime=50))


@pytest.mark.parametrize(
    "matched, path",
    [
        (False, _remote_file_of_size(1024)),
        (True, _remote_file_of_size(10 * 1024)),
        (True, _remote_file_of_size(15 * 1024)),
        (True, _remote_file_of_size(20 * 1024)),
        (False, _remote_file_of_size(30 * 1024)),
        (True, RemoteDirectory("some")),
    ],
)
def test_file_size_range_match(matched, path):
    """Files match only within the inclusive size range; a directory matches regardless."""
    size_filter = FileSizeRange(min="10Kib", max="20Kib")
    outcome = size_filter.match(path)

    assert outcome == matched