From d02bdd06e87cb0ffa4f5e9066f6897a5572cfd96 Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 7 Feb 2025 10:34:58 +0100 Subject: [PATCH 1/6] Enhance verbosity handling --- environment_linux.yml | 104 ++++++++++++++++++ requirements-test.txt | 13 +++ short_output_test.py | 13 +++ src/cleanvision/dataset/fsspec_dataset.py | 8 +- src/cleanvision/dataset/utils.py | 5 +- src/cleanvision/imagelab.py | 3 + .../issue_managers/duplicate_issue_manager.py | 5 +- .../image_property_issue_manager.py | 5 +- src/cleanvision/utils/utils.py | 1 + 9 files changed, 148 insertions(+), 9 deletions(-) create mode 100644 environment_linux.yml create mode 100644 requirements-test.txt create mode 100644 short_output_test.py diff --git a/environment_linux.yml b/environment_linux.yml new file mode 100644 index 00000000..ff49616b --- /dev/null +++ b/environment_linux.yml @@ -0,0 +1,104 @@ +name: cleanvision +channels: + - conda-forge + - pytorch + - nvidia +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=h4bc722e_7 + - ca-certificates=2025.1.31=hbcca054_0 + - ld_impl_linux-64=2.43=h712a8e2_2 + - libexpat=2.6.4=h5888daf_0 + - libffi=3.4.2=h7f98852_5 + - libgcc=14.2.0=h77fa898_1 + - libgcc-ng=14.2.0=h69a702a_1 + - libgomp=14.2.0=h77fa898_1 + - liblzma=5.6.4=hb9d3cd8_0 + - libmpdec=4.0.0=h4bc722e_0 + - libsqlite=3.48.0=hee588c1_1 + - libuuid=2.38.1=h0b41bf4_0 + - libzlib=1.3.1=hb9d3cd8_2 + - ncurses=6.5=h2d0b736_3 + - openssl=3.4.0=h7b32b05_1 + - pip=25.0=pyh145f28c_0 + - python=3.13.1=ha99a958_105_cp313 + - python_abi=3.13=5_cp313 + - readline=8.2=h8228510_1 + - tk=8.6.13=noxft_h4845f30_101 + - pip: + - aiobotocore==2.19.0 + - aiohappyeyeballs==2.4.4 + - aiohttp==3.11.12 + - aioitertools==0.12.0 + - aiosignal==1.3.2 + - attrs==25.1.0 + - botocore==1.36.3 + - certifi==2025.1.31 + - charset-normalizer==3.4.1 + - cleanvision==0.3.7 + - contourpy==1.3.1 + - cycler==0.12.1 + - datasets==3.2.0 + - dill==0.3.8 + - filelock==3.17.0 + - fonttools==4.55.8 + - frozenlist==1.5.0 + - fsspec==2024.9.0 + - huggingface-hub==0.28.1 + - idna==3.10 + - imagehash==4.3.2 + - iniconfig==2.0.0 + - jinja2==3.1.5 + - jmespath==1.0.1 + - kiwisolver==1.4.8 + - markupsafe==3.0.2 + - matplotlib==3.10.0 + - mpmath==1.3.0 + - multidict==6.1.0 + - multiprocess==0.70.16 + - networkx==3.4.2 + - numpy==2.2.2 + - nvidia-cublas-cu12==12.4.5.8 + - nvidia-cuda-cupti-cu12==12.4.127 + - nvidia-cuda-nvrtc-cu12==12.4.127 + - nvidia-cuda-runtime-cu12==12.4.127 + - nvidia-cudnn-cu12==9.1.0.70 + - nvidia-cufft-cu12==11.2.1.3 + - nvidia-curand-cu12==10.3.5.147 + - nvidia-cusolver-cu12==11.6.1.9 + - nvidia-cusparse-cu12==12.3.1.170 + - nvidia-cusparselt-cu12==0.6.2 + - nvidia-nccl-cu12==2.21.5 + - nvidia-nvjitlink-cu12==12.4.127 + - nvidia-nvtx-cu12==12.4.127 + - packaging==24.2 + - pandas==2.2.3 + - pillow==11.1.0 + - pluggy==1.5.0 + - propcache==0.2.1 + - psutil==6.1.1 + - pyarrow==19.0.0 + - pyparsing==3.2.1 + - pytest==8.3.4 + - python-dateutil==2.9.0.post0 + - pytz==2025.1 + - pywavelets==1.8.0 + - pyyaml==6.0.2 + - requests==2.32.3 + - s3fs==2024.9.0 + - scipy==1.15.1 + - setuptools==75.8.0 + - six==1.17.0 + - sympy==1.13.1 + - tabulate==0.9.0 + - torch==2.6.0 + - torchvision==0.21.0 + - tqdm==4.67.1 + - triton==3.2.0 + - typing-extensions==4.12.2 + - tzdata==2025.1 + - urllib3==2.3.0 + - wrapt==1.17.2 + - xxhash==3.5.0 + - yarl==1.18.3 diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 00000000..01dd08a9 --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1,13 @@ +pandas +psutil +matplotlib +pillow +datasets +torchvision +fsspec +numpy +imagehash +tqdm +torch +pytest +s3fs diff --git a/short_output_test.py b/short_output_test.py new file mode 100644 index 00000000..c0989582 --- /dev/null +++ b/short_output_test.py @@ -0,0 +1,13 @@ +from cleanvision import Imagelab + +print("Running short output test with verbose=True") +# Specify path to folder containing the image files in your dataset +imagelab = Imagelab(data_path="image_files/", verbose=True) +# Automatically check for a predefined list of issues within your dataset +imagelab.find_issues(verbose=True) + +print("Running short output test with verbose=False") +# Specify path to folder containing the image files in your dataset +imagelab = Imagelab(data_path="image_files/", verbose=False) +# Automatically check for a predefined list of issues within your dataset +imagelab.find_issues(verbose=False) diff --git a/src/cleanvision/dataset/fsspec_dataset.py b/src/cleanvision/dataset/fsspec_dataset.py index c732c707..942dd05b 100644 --- a/src/cleanvision/dataset/fsspec_dataset.py +++ b/src/cleanvision/dataset/fsspec_dataset.py @@ -19,6 +19,7 @@ def __init__( data_folder: Optional[str] = None, filepaths: Optional[List[str]] = None, storage_opts: Dict[str, str] = {}, + verbose: bool = True, ) -> None: super().__init__() self.storage_opts = storage_opts @@ -32,7 +33,7 @@ def __init__( self.fs, dataset_path = fsspec.core.url_to_fs( data_folder, **self.storage_opts ) - self._filepaths = self.__get_filepaths(dataset_path) + self._filepaths = self.__get_filepaths(dataset_path, verbose) else: assert filepaths is not None if len(filepaths) != len(set(filepaths)): @@ -64,10 +65,11 @@ def get_name(self, item: Union[int, str]) -> str: assert isinstance(item, str) return item.split("/")[-1] - def __get_filepaths(self, dataset_path: str) -> List[str]: + def __get_filepaths(self, dataset_path: str, verbose: bool) -> List[str]: """See an issue here: https://github.com/fsspec/filesystem_spec/issues/1019 There's a problem with proper patterning on /**/ in fsspec""" - print(f"Reading images from {dataset_path}") + if verbose: + print(f"Reading images from {dataset_path}") filepaths = [] for ext in IMAGE_FILE_EXTENSIONS: # initial *.ext search, top level diff --git a/src/cleanvision/dataset/utils.py b/src/cleanvision/dataset/utils.py index 97873bdb..9b5fc6b9 100644 --- a/src/cleanvision/dataset/utils.py +++ b/src/cleanvision/dataset/utils.py @@ -19,11 +19,12 @@ def build_dataset( image_key: Optional[str] = None, torchvision_dataset: Optional["VisionDataset"] = None, storage_opts: Dict[str, str] = {}, + verbose: bool = True, ) -> Dataset: if data_path: - return FSDataset(data_folder=data_path, storage_opts=storage_opts) + return FSDataset(data_folder=data_path, storage_opts=storage_opts, verbose=verbose) elif filepaths: - return FSDataset(filepaths=filepaths, storage_opts=storage_opts) + return FSDataset(filepaths=filepaths, storage_opts=storage_opts, verbose=verbose) elif hf_dataset and image_key: return HFDataset(hf_dataset, image_key) elif torchvision_dataset: diff --git a/src/cleanvision/imagelab.py b/src/cleanvision/imagelab.py index 2c98f9e1..0e9319d7 100644 --- a/src/cleanvision/imagelab.py +++ b/src/cleanvision/imagelab.py @@ -124,6 +124,7 @@ def __init__( image_key: Optional[str] = None, torchvision_dataset: Optional["VisionDataset"] = None, storage_opts: Dict[str, Any] = {}, + verbose: bool = True, ) -> None: self._dataset = build_dataset( data_path, @@ -132,6 +133,7 @@ def __init__( image_key, torchvision_dataset, storage_opts=storage_opts, + verbose=verbose, ) if len(self._dataset) == 0: raise ValueError("No images found in the dataset specified") @@ -276,6 +278,7 @@ def find_issues( dataset=self._dataset, imagelab_info=self.info, n_jobs=n_jobs, + verbose=verbose, ) # update issues, issue_summary and info diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py b/src/cleanvision/issue_managers/duplicate_issue_manager.py index 2e7858f3..9db2bcd2 100644 --- a/src/cleanvision/issue_managers/duplicate_issue_manager.py +++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py @@ -107,6 +107,7 @@ def find_issues( dataset: Optional[Dataset] = None, imagelab_info: Optional[Dict[str, Any]] = None, n_jobs: Optional[int] = None, + verbose: Optional[bool] = None, **kwargs: Any, ) -> None: super().find_issues(**kwargs) @@ -125,7 +126,7 @@ def find_issues( results: List[Dict[str, Union[str, int]]] = [] if n_jobs == 1: - for idx in tqdm(dataset.index): + for idx in tqdm(dataset.index, leave=verbose, desc="Computing hashes", smoothing=0): results.append(compute_hash(idx, dataset, to_compute, self.params)) else: args = [ @@ -144,7 +145,7 @@ def find_issues( p.imap_unordered( compute_hash_wrapper, args, chunksize=chunksize ), - total=len(dataset), + total=len(dataset), leave=verbose, desc="Computing hashes", smoothing=0, ) ) diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py index facc92ab..293b5372 100644 --- a/src/cleanvision/issue_managers/image_property_issue_manager.py +++ b/src/cleanvision/issue_managers/image_property_issue_manager.py @@ -114,6 +114,7 @@ def find_issues( dataset: Optional[Dataset] = None, imagelab_info: Optional[Dict[str, Any]] = None, n_jobs: Optional[int] = None, + verbose: Optional[bool] = None, **kwargs: Any, ) -> None: super().find_issues(**kwargs) @@ -138,7 +139,7 @@ def find_issues( if to_be_computed: results: List[Dict[str, Union[int, float, str]]] = [] if n_jobs == 1: - for idx in tqdm(dataset.index): + for idx in tqdm(dataset.index, leave=verbose, desc="Computing scores", smoothing=0): results.append( compute_scores( idx, dataset, to_be_computed, self.image_properties @@ -161,7 +162,7 @@ def find_issues( p.imap_unordered( compute_scores_wrapper, args, chunksize=chunksize ), - total=len(dataset), + total=len(dataset), leave=verbose, desc="Computing scores", smoothing=0, ) ) diff --git a/src/cleanvision/utils/utils.py b/src/cleanvision/utils/utils.py index efe9c2aa..2a4f8f5e 100644 --- a/src/cleanvision/utils/utils.py +++ b/src/cleanvision/utils/utils.py @@ -51,6 +51,7 @@ def get_filepaths( """ abs_dir_path = os.path.abspath(os.path.expanduser(dir_path)) + # ToDo: Suppress print according to verbosity level print(f"Reading images from {abs_dir_path}") filepaths = [] for ext in IMAGE_FILE_EXTENSIONS: From 001eb6b19b8ee3022801c773fbdb46f48d24b7eb Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 7 Feb 2025 10:46:09 +0100 Subject: [PATCH 2/6] Allow verbose keyword in base_issue_manager --- src/cleanvision/utils/base_issue_manager.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/cleanvision/utils/base_issue_manager.py b/src/cleanvision/utils/base_issue_manager.py index 67a01c1e..f318f50f 100644 --- a/src/cleanvision/utils/base_issue_manager.py +++ b/src/cleanvision/utils/base_issue_manager.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from tabnanny import verbose from typing import Dict, Any import pandas as pd @@ -32,6 +33,7 @@ def check_params(**kwargs: Any) -> None: "dataset": Dataset, "imagelab_info": Dict[str, Any], "n_jobs": int, + "verbose": bool, } for name, value in kwargs.items(): From b4fe279b6f78ea1bd759df4c822a2a4caf59c30e Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 21 Feb 2025 09:02:44 +0100 Subject: [PATCH 3/6] Fulfill the proposed deletions --- src/cleanvision/dataset/utils.py | 8 ++++++-- .../issue_managers/duplicate_issue_manager.py | 9 +++++++-- .../issue_managers/image_property_issue_manager.py | 9 +++++++-- src/cleanvision/utils/base_issue_manager.py | 1 - 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/cleanvision/dataset/utils.py b/src/cleanvision/dataset/utils.py index 9b5fc6b9..f5f0ee1b 100644 --- a/src/cleanvision/dataset/utils.py +++ b/src/cleanvision/dataset/utils.py @@ -22,9 +22,13 @@ def build_dataset( verbose: bool = True, ) -> Dataset: if data_path: - return FSDataset(data_folder=data_path, storage_opts=storage_opts, verbose=verbose) + return FSDataset( + data_folder=data_path, storage_opts=storage_opts, verbose=verbose + ) elif filepaths: - return FSDataset(filepaths=filepaths, storage_opts=storage_opts, verbose=verbose) + return FSDataset( + filepaths=filepaths, storage_opts=storage_opts, verbose=verbose + ) elif hf_dataset and image_key: return HFDataset(hf_dataset, image_key) elif torchvision_dataset: diff --git a/src/cleanvision/issue_managers/duplicate_issue_manager.py b/src/cleanvision/issue_managers/duplicate_issue_manager.py index 9db2bcd2..81ea3811 100644 --- a/src/cleanvision/issue_managers/duplicate_issue_manager.py +++ b/src/cleanvision/issue_managers/duplicate_issue_manager.py @@ -126,7 +126,9 @@ def find_issues( results: List[Dict[str, Union[str, int]]] = [] if n_jobs == 1: - for idx in tqdm(dataset.index, leave=verbose, desc="Computing hashes", smoothing=0): + for idx in tqdm( + dataset.index, leave=verbose, desc="Computing hashes", smoothing=0 + ): results.append(compute_hash(idx, dataset, to_compute, self.params)) else: args = [ @@ -145,7 +147,10 @@ def find_issues( p.imap_unordered( compute_hash_wrapper, args, chunksize=chunksize ), - total=len(dataset), leave=verbose, desc="Computing hashes", smoothing=0, + total=len(dataset), + leave=verbose, + desc="Computing hashes", + smoothing=0, ) ) diff --git a/src/cleanvision/issue_managers/image_property_issue_manager.py b/src/cleanvision/issue_managers/image_property_issue_manager.py index 293b5372..a25efaf5 100644 --- a/src/cleanvision/issue_managers/image_property_issue_manager.py +++ b/src/cleanvision/issue_managers/image_property_issue_manager.py @@ -139,7 +139,9 @@ def find_issues( if to_be_computed: results: List[Dict[str, Union[int, float, str]]] = [] if n_jobs == 1: - for idx in tqdm(dataset.index, leave=verbose, desc="Computing scores", smoothing=0): + for idx in tqdm( + dataset.index, leave=verbose, desc="Computing scores", smoothing=0 + ): results.append( compute_scores( idx, dataset, to_be_computed, self.image_properties @@ -162,7 +164,10 @@ def find_issues( p.imap_unordered( compute_scores_wrapper, args, chunksize=chunksize ), - total=len(dataset), leave=verbose, desc="Computing scores", smoothing=0, + total=len(dataset), + leave=verbose, + desc="Computing scores", + smoothing=0, ) ) diff --git a/src/cleanvision/utils/base_issue_manager.py b/src/cleanvision/utils/base_issue_manager.py index f318f50f..b3977f08 100644 --- a/src/cleanvision/utils/base_issue_manager.py +++ b/src/cleanvision/utils/base_issue_manager.py @@ -1,5 +1,4 @@ from abc import ABC, abstractmethod -from tabnanny import verbose from typing import Dict, Any import pandas as pd From 4444bb688271a5115ea881d8b61a01f4bdd46c00 Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 21 Feb 2025 09:10:46 +0100 Subject: [PATCH 4/6] Delete environment_linux.yml --- environment_linux.yml | 104 ------------------------------------------ 1 file changed, 104 deletions(-) delete mode 100644 environment_linux.yml diff --git a/environment_linux.yml b/environment_linux.yml deleted file mode 100644 index ff49616b..00000000 --- a/environment_linux.yml +++ /dev/null @@ -1,104 +0,0 @@ -name: cleanvision -channels: - - conda-forge - - pytorch - - nvidia -dependencies: - - _libgcc_mutex=0.1=conda_forge - - _openmp_mutex=4.5=2_gnu - - bzip2=1.0.8=h4bc722e_7 - - ca-certificates=2025.1.31=hbcca054_0 - - ld_impl_linux-64=2.43=h712a8e2_2 - - libexpat=2.6.4=h5888daf_0 - - libffi=3.4.2=h7f98852_5 - - libgcc=14.2.0=h77fa898_1 - - libgcc-ng=14.2.0=h69a702a_1 - - libgomp=14.2.0=h77fa898_1 - - liblzma=5.6.4=hb9d3cd8_0 - - libmpdec=4.0.0=h4bc722e_0 - - libsqlite=3.48.0=hee588c1_1 - - libuuid=2.38.1=h0b41bf4_0 - - libzlib=1.3.1=hb9d3cd8_2 - - ncurses=6.5=h2d0b736_3 - - openssl=3.4.0=h7b32b05_1 - - pip=25.0=pyh145f28c_0 - - python=3.13.1=ha99a958_105_cp313 - - python_abi=3.13=5_cp313 - - readline=8.2=h8228510_1 - - tk=8.6.13=noxft_h4845f30_101 - - pip: - - aiobotocore==2.19.0 - - aiohappyeyeballs==2.4.4 - - aiohttp==3.11.12 - - aioitertools==0.12.0 - - aiosignal==1.3.2 - - attrs==25.1.0 - - botocore==1.36.3 - - certifi==2025.1.31 - - charset-normalizer==3.4.1 - - cleanvision==0.3.7 - - contourpy==1.3.1 - - cycler==0.12.1 - - datasets==3.2.0 - - dill==0.3.8 - - filelock==3.17.0 - - fonttools==4.55.8 - - frozenlist==1.5.0 - - fsspec==2024.9.0 - - huggingface-hub==0.28.1 - - idna==3.10 - - imagehash==4.3.2 - - iniconfig==2.0.0 - - jinja2==3.1.5 - - jmespath==1.0.1 - - kiwisolver==1.4.8 - - markupsafe==3.0.2 - - matplotlib==3.10.0 - - mpmath==1.3.0 - - multidict==6.1.0 - - multiprocess==0.70.16 - - networkx==3.4.2 - - numpy==2.2.2 - - nvidia-cublas-cu12==12.4.5.8 - - nvidia-cuda-cupti-cu12==12.4.127 - - nvidia-cuda-nvrtc-cu12==12.4.127 - - nvidia-cuda-runtime-cu12==12.4.127 - - nvidia-cudnn-cu12==9.1.0.70 - - nvidia-cufft-cu12==11.2.1.3 - - nvidia-curand-cu12==10.3.5.147 - - nvidia-cusolver-cu12==11.6.1.9 - - nvidia-cusparse-cu12==12.3.1.170 - - nvidia-cusparselt-cu12==0.6.2 - - nvidia-nccl-cu12==2.21.5 - - nvidia-nvjitlink-cu12==12.4.127 - - nvidia-nvtx-cu12==12.4.127 - - packaging==24.2 - - pandas==2.2.3 - - pillow==11.1.0 - - pluggy==1.5.0 - - propcache==0.2.1 - - psutil==6.1.1 - - pyarrow==19.0.0 - - pyparsing==3.2.1 - - pytest==8.3.4 - - python-dateutil==2.9.0.post0 - - pytz==2025.1 - - pywavelets==1.8.0 - - pyyaml==6.0.2 - - requests==2.32.3 - - s3fs==2024.9.0 - - scipy==1.15.1 - - setuptools==75.8.0 - - six==1.17.0 - - sympy==1.13.1 - - tabulate==0.9.0 - - torch==2.6.0 - - torchvision==0.21.0 - - tqdm==4.67.1 - - triton==3.2.0 - - typing-extensions==4.12.2 - - tzdata==2025.1 - - urllib3==2.3.0 - - wrapt==1.17.2 - - xxhash==3.5.0 - - yarl==1.18.3 From 7f01189277abca78e39272dfb27babcdc11e7c17 Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 21 Feb 2025 09:11:05 +0100 Subject: [PATCH 5/6] Delete requirements-test.txt --- requirements-test.txt | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 requirements-test.txt diff --git a/requirements-test.txt b/requirements-test.txt deleted file mode 100644 index 01dd08a9..00000000 --- a/requirements-test.txt +++ /dev/null @@ -1,13 +0,0 @@ -pandas -psutil -matplotlib -pillow -datasets -torchvision -fsspec -numpy -imagehash -tqdm -torch -pytest -s3fs From 5b172345ace00fae80638fac8d6b983677e117a5 Mon Sep 17 00:00:00 2001 From: saskra Date: Fri, 21 Feb 2025 09:11:26 +0100 Subject: [PATCH 6/6] Delete short_output_test.py --- short_output_test.py | 13 ------------- 1 file changed, 13 deletions(-) delete mode 100644 short_output_test.py diff --git a/short_output_test.py b/short_output_test.py deleted file mode 100644 index c0989582..00000000 --- a/short_output_test.py +++ /dev/null @@ -1,13 +0,0 @@ -from cleanvision import Imagelab - -print("Running short output test with verbose=True") -# Specify path to folder containing the image files in your dataset -imagelab = Imagelab(data_path="image_files/", verbose=True) -# Automatically check for a predefined list of issues within your dataset -imagelab.find_issues(verbose=True) - -print("Running short output test with verbose=False") -# Specify path to folder containing the image files in your dataset -imagelab = Imagelab(data_path="image_files/", verbose=False) -# Automatically check for a predefined list of issues within your dataset -imagelab.find_issues(verbose=False)