From 55d3e7fa4fa4babf6ce2cea835fcdf10069008d6 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 16:34:58 -0500 Subject: [PATCH 1/8] updated tests --- tests/test_data/test_annotations.py | 187 +++++++++++--------- tests/test_data/test_dataset.py | 93 ++++------ tests/test_data/test_dataset_integration.py | 14 +- tests/test_data/test_task_ingestion.py | 61 ++----- tests/test_data/utils.py | 15 ++ 5 files changed, 172 insertions(+), 198 deletions(-) create mode 100644 tests/test_data/utils.py diff --git a/tests/test_data/test_annotations.py b/tests/test_data/test_annotations.py index 6e7d28c8..5eddcb67 100644 --- a/tests/test_data/test_annotations.py +++ b/tests/test_data/test_annotations.py @@ -53,7 +53,6 @@ def compare_parquet_rows( ): rows = list(record.to_parquet_rows()) for row in rows: - del row["created_at"] # type: ignore row["file"] = Path(row["file"]) # type: ignore assert rows == expected_rows @@ -516,89 +515,111 @@ def test_detection(subtests: SubTests): } ) - with subtests.test("full"): - detection = Detection( - **{ - "class": "person", - "boundingbox": {"x": 0.1, "y": 0.2, "w": 0.5, "h": 0.5}, - "keypoints": { - "keypoints": [(0.2, 0.4, 2), (0.5, 0.8, 2)], - }, - "segmentation": { - "mask": np.array( - [ - [0, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 0, 0], - [0, 0, 1, 1], - ] - ), - }, - "instance_segmentation": { - "mask": np.array( - [ - [1, 1, 0, 0], - [1, 1, 0, 0], - [0, 0, 0, 0], - [0, 0, 0, 0], - ] - ), - }, - "metadata": {"age": 25}, - "sub_detections": { - "head": { - "boundingbox": { - "x": 0.2, - "y": 0.3, - "w": 0.1, - "h": 0.1, - }, - } - }, - } - ) - expected_rows = [ - { - "class_name": "person", - "instance_id": -1, - "task_type": "boundingbox", - "annotation": '{"x":0.1,"y":0.2,"w":0.5,"h":0.5}', - }, - { - "class_name": "person", - "instance_id": -1, - "task_type": "keypoints", - "annotation": '{"keypoints":[[0.2,0.4,2],[0.5,0.8,2]]}', - }, - { - "class_name": "person", - "instance_id": -1, - "task_type": "segmentation", - "annotation": '{"height":4,"width":4,"counts":"11213ON0"}', - }, - { - "class_name": "person", - "instance_id": -1, - "task_type": "instance_segmentation", - "annotation": '{"height":4,"width":4,"counts":"02208"}', + +def test_record(tempdir: Path): + detection = Detection( + **{ + "class": "person", + "boundingbox": {"x": 0.1, "y": 0.2, "w": 0.5, "h": 0.5}, + "keypoints": { + "keypoints": [(0.2, 0.4, 2), (0.5, 0.8, 2)], }, - { - "class_name": "person", - "instance_id": -1, - "task_type": "metadata/age", - "annotation": "25", + "segmentation": { + "mask": np.array( + [ + [0, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 0, 0], + [0, 0, 1, 1], + ] + ), }, - { - "class_name": "person", - "instance_id": -1, - "task_type": "classification", - "annotation": "{}", + "instance_segmentation": { + "mask": np.array( + [ + [1, 1, 0, 0], + [1, 1, 0, 0], + [0, 0, 0, 0], + [0, 0, 0, 0], + ] + ), }, - { - "class_name": None, - "instance_id": -1, - "task_type": "head/boundingbox", - "annotation": '{"x":0.2,"y":0.3,"w":0.1,"h":0.1}', + "metadata": {"age": 25}, + "sub_detections": { + "head": { + "boundingbox": { + "x": 0.2, + "y": 0.3, + "w": 0.1, + "h": 0.1, + }, + } }, - ] - assert list(detection.to_parquet_rows()) == expected_rows + } + ) + filename = str(tempdir / "image.jpg") + cv2.imwrite(filename, np.zeros((256, 256, 3), dtype=np.uint8)) + record = DatasetRecord( + **{ + "file": filename, + "annotation": detection, + "task": "test", + } + ) + common = { + "file": filename, + "source_name": "image", + "instance_id": -1, + } + expected_rows = [ + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "boundingbox", + "annotation": '{"x":0.1,"y":0.2,"w":0.5,"h":0.5}', + }, + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "keypoints", + "annotation": '{"keypoints":[[0.2,0.4,2],[0.5,0.8,2]]}', + }, + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "segmentation", + "annotation": '{"height":4,"width":4,"counts":"11213ON0"}', + }, + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "instance_segmentation", + "annotation": '{"height":4,"width":4,"counts":"02208"}', + }, + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "metadata/age", + "annotation": "25", + }, + { + **common, + "task_name": "test", + "class_name": "person", + "task_type": "classification", + "annotation": "{}", + }, + { + **common, + "task_name": "test/head", + "class_name": None, + "task_type": "boundingbox", + "annotation": '{"x":0.2,"y":0.3,"w":0.1,"h":0.1}', + }, + ] + assert list(record.to_parquet_rows()) == expected_rows diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 464e8832..894078bb 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -2,10 +2,10 @@ from pathlib import Path from typing import Any, Dict, List, Set -import cv2 import numpy as np import pytest from pytest_subtests.plugin import SubTests +from utils import create_image from luxonis_ml.data import ( BucketStorage, @@ -18,17 +18,6 @@ from luxonis_ml.enums import DatasetType -def create_image(i: int, dir: Path) -> Path: - path = dir / f"img_{i}.jpg" - if not path.exists(): - img = np.zeros((512, 512, 3), dtype=np.uint8) - img[0:10, 0:10] = np.random.randint( - 0, 255, (10, 10, 3), dtype=np.uint8 - ) - cv2.imwrite(str(path), img) - return path - - def compare_loader_output(loader: LuxonisLoader, tasks: Set[str]): all_labels = set() for _, labels in loader: @@ -60,8 +49,8 @@ def test_dataset( assert LuxonisDataset.exists( dataset_name, bucket_storage=bucket_storage ) - assert dataset.get_classes()[0] == ["person"] assert set(dataset.get_task_names()) == {"coco"} + assert dataset.get_classes().get("coco") == ["person"] assert dataset.get_skeletons() == { "coco": ( [ @@ -291,9 +280,6 @@ def generator(step=15): ), f"Split {split} has {len(split_data)} samples" -# TODO: Test array - - def test_metadata( bucket_storage: BucketStorage, dataset_name: str, tempdir: Path ): @@ -492,19 +478,9 @@ def generator(): ) -@pytest.mark.dependency(name="test_clone_dataset_local") -def test_clone_dataset_local(dataset_name: str, tempdir: Path): - _test_clone_dataset(BucketStorage.LOCAL, dataset_name, tempdir) - - -@pytest.mark.dependency( - name="test_clone_dataset_gcs", depends=["test_clone_dataset_local"] -) -def test_clone_dataset_gcs(dataset_name: str, tempdir: Path): - _test_clone_dataset(BucketStorage.GCS, dataset_name, tempdir) - - -def _test_clone_dataset(bucket_storage, dataset_name: str, tempdir: Path): +def test_clone_dataset( + bucket_storage: BucketStorage, dataset_name: str, tempdir: Path +): dataset = LuxonisDataset( dataset_name, bucket_storage=bucket_storage, @@ -540,22 +516,13 @@ def generator1(): assert df_cloned.equals(df_original) -@pytest.mark.dependency( - name="test_merge_datasets_local", depends=["test_clone_dataset_gcs"] -) -def test_merge_datasets_local(dataset_name: str, tempdir: Path): - _test_merge_datasets(BucketStorage.LOCAL, dataset_name, tempdir) - - -@pytest.mark.dependency( - name="test_merge_datasets_gcs", depends=["test_merge_datasets_local"] -) -def test_merge_datasets_gcs(dataset_name: str, tempdir: Path): - _test_merge_datasets(BucketStorage.GCS, dataset_name, tempdir) - - -def _test_merge_datasets(bucket_storage, dataset_name: str, tempdir: Path): - dataset1_name = dataset_name + "_1" +def test_merge_datasets( + bucket_storage: BucketStorage, + dataset_name: str, + tempdir: Path, + subtests: SubTests, +): + dataset1_name = f"{dataset_name}_1" dataset1 = LuxonisDataset( dataset1_name, bucket_storage=bucket_storage, @@ -577,7 +544,7 @@ def generator1(): dataset1.add(generator1()) dataset1.make_splits({"train": 0.6, "val": 0.4}) - dataset2_name = dataset_name + "_2" + dataset2_name = f"{dataset_name}_2" dataset2 = LuxonisDataset( dataset2_name, bucket_storage=bucket_storage, @@ -599,26 +566,26 @@ def generator2(): dataset2.add(generator2()) dataset2.make_splits({"train": 0.6, "val": 0.4}) - # Test in-place merge - cloned_dataset1 = dataset1.clone( - new_dataset_name=dataset1_name + "_cloned" - ) - cloned_dataset1_merged_with_dataset2 = cloned_dataset1.merge_with( - dataset2, inplace=True - ) + with subtests.test("test_inplace"): + cloned_dataset1 = dataset1.clone( + new_dataset_name=f"{dataset1_name}_cloned" + ) + cloned_dataset1_merged_with_dataset2 = cloned_dataset1.merge_with( + dataset2, inplace=True + ) - all_classes_inplace, _ = cloned_dataset1_merged_with_dataset2.get_classes() - assert set(all_classes_inplace) == {"person", "dog"} + classes = cloned_dataset1_merged_with_dataset2.get_classes() + assert set(classes[""]) == {"person", "dog"} - # Test out-of-place merge - dataset1_merged_with_dataset2 = dataset1.merge_with( - dataset2, - inplace=False, - new_dataset_name=dataset1_name + "_" + dataset2_name + "_merged", - ) + with subtests.test("test_out_of_place"): + dataset1_merged_with_dataset2 = dataset1.merge_with( + dataset2, + inplace=False, + new_dataset_name=dataset1_name + "_" + dataset2_name + "_merged", + ) - all_classes_out_of_place, _ = dataset1_merged_with_dataset2.get_classes() - assert set(all_classes_out_of_place) == {"person", "dog"} + classes = dataset1_merged_with_dataset2.get_classes() + assert set(classes[""]) == {"person", "dog"} df_merged = dataset1_merged_with_dataset2._load_df_offline() df_cloned_merged = dataset1.merge_with( diff --git a/tests/test_data/test_dataset_integration.py b/tests/test_data/test_dataset_integration.py index cdf752f5..10afa42f 100644 --- a/tests/test_data/test_dataset_integration.py +++ b/tests/test_data/test_dataset_integration.py @@ -1,7 +1,7 @@ import json import uuid from pathlib import Path -from typing import Any, Dict, List +from typing import Any, Dict, List, Set import cv2 import numpy as np @@ -12,6 +12,14 @@ setup_logging(use_rich=True, rich_print=True) +def gather_tasks(dataset: LuxonisDataset) -> Set[str]: + return { + f"{task_name}/{task_type}" + for task_name, task_types in dataset.get_tasks().items() + for task_type in task_types + } + + def get_annotations(sequence_path): frame_data = sequence_path / "step0.frame_data.json" with open(frame_data) as f: @@ -41,7 +49,7 @@ def test_parking_lot_generate( ) dataset.add(generator(data_path, tempdir)) dataset.make_splits((0.8, 0.1, 0.1)) - assert set(dataset.get_tasks()) == { + assert gather_tasks(dataset) == { "car/array", "car/boundingbox", "car/classification", @@ -71,7 +79,7 @@ def test_parking_lot_generate( for _, labels in loader: accumulated_tasks.update(labels.keys()) - assert accumulated_tasks == set(dataset.get_tasks()) + assert accumulated_tasks == gather_tasks(dataset) # TODO: Simplify the dataset so the code can be cleaner diff --git a/tests/test_data/test_task_ingestion.py b/tests/test_data/test_task_ingestion.py index 6f030c80..619b7a40 100644 --- a/tests/test_data/test_task_ingestion.py +++ b/tests/test_data/test_task_ingestion.py @@ -1,11 +1,8 @@ -import shutil from collections import defaultdict from pathlib import Path from typing import Dict -import cv2 -import numpy as np -import pytest +from utils import create_image from luxonis_ml.data import ( BucketStorage, @@ -15,30 +12,9 @@ ) from luxonis_ml.data.utils import get_task_name, get_task_type -DATA_DIR = Path("tests/data/test_task_ingestion") STEP = 10 -@pytest.fixture(autouse=True, scope="module") -def prepare_dir(): - DATA_DIR.mkdir(parents=True, exist_ok=True) - - yield - - shutil.rmtree(DATA_DIR) - - -def make_image(i) -> Path: - path = DATA_DIR / f"img_{i}.jpg" - if not path.exists(): - img = np.zeros((512, 512, 3), dtype=np.uint8) - img[0:10, 0:10] = np.random.randint( - 0, 255, (10, 10, 3), dtype=np.uint8 - ) - cv2.imwrite(str(path), img) - return path - - def compute_histogram(dataset: LuxonisDataset) -> Dict[str, int]: classes = defaultdict(int) loader = LuxonisLoader(dataset, update_mode=UpdateMode.ALWAYS) @@ -50,7 +26,9 @@ def compute_histogram(dataset: LuxonisDataset) -> Dict[str, int]: return dict(classes) -def test_task_ingestion(bucket_storage: BucketStorage, dataset_name: str): +def test_task_ingestion( + bucket_storage: BucketStorage, dataset_name: str, tempdir: Path +): dataset = LuxonisDataset( dataset_name, bucket_storage=bucket_storage, @@ -60,7 +38,7 @@ def test_task_ingestion(bucket_storage: BucketStorage, dataset_name: str): def generator1(): for i in range(STEP): - path = make_image(i) + path = create_image(i, tempdir) yield { "file": str(path), "task": "animals", @@ -110,9 +88,8 @@ def generator1(): dataset.add(generator1()).make_splits((1, 0, 0)) - classes_list, classes = dataset.get_classes() + classes = dataset.get_classes() - assert set(classes_list) == {"dog", "cat", "water", "grass"} assert set(classes["landmass"]) == {"water", "grass"} assert set(classes["animals"]) == {"dog", "cat"} @@ -120,7 +97,7 @@ def generator1(): def generator2(): for i in range(STEP, 2 * STEP): - path = make_image(i) + path = create_image(i, tempdir) yield { "file": str(path), "annotation": { @@ -137,9 +114,7 @@ def generator2(): } dataset.add(generator2()).make_splits((1, 0, 0)) - classes_list, classes = dataset.get_classes() - - assert set(classes_list) == {"background", "dog", "cat", "water", "grass"} + classes = dataset.get_classes() assert set(classes["landmass"]) == {"background", "water", "grass"} assert set(classes["animals"]) == {"dog", "cat"} @@ -150,7 +125,7 @@ def generator2(): def generator3(): for i in range(2 * STEP, 3 * STEP): - path = make_image(i) + path = create_image(i, tempdir) yield { "file": str(path), "task": "animals", @@ -177,9 +152,7 @@ def generator3(): } dataset.add(generator3()).make_splits((1, 0, 0)) - classes_list, classes = dataset.get_classes() - - assert set(classes_list) == {"background", "dog", "cat", "water", "grass"} + classes = dataset.get_classes() assert set(classes["landmass"]) == {"background", "water", "grass"} assert set(classes["animals"]) == {"dog", "cat"} @@ -190,7 +163,7 @@ def generator3(): def generator4(): for i in range(3 * STEP, 4 * STEP): - path = make_image(i) + path = create_image(i, tempdir) yield { "file": str(path), "task": "detection", @@ -235,18 +208,8 @@ def generator4(): } dataset.add(generator4()).make_splits((1, 0, 0)) - classes_list, classes = dataset.get_classes() + classes = dataset.get_classes() - print(classes) - assert set(classes_list) == { - "dog", - "cat", - "water", - "grass", - "bike", - "body", - "background", - } assert set(classes["landmass"]) == {"background", "water", "grass"} assert set(classes["animals"]) == {"dog", "cat"} assert set(classes["landmass-2"]) == {"water"} diff --git a/tests/test_data/utils.py b/tests/test_data/utils.py new file mode 100644 index 00000000..48d01c4c --- /dev/null +++ b/tests/test_data/utils.py @@ -0,0 +1,15 @@ +from pathlib import Path + +import cv2 +import numpy as np + + +def create_image(i: int, dir: Path) -> Path: + path = dir / f"img_{i}.jpg" + if not path.exists(): + img = np.zeros((512, 512, 3), dtype=np.uint8) + img[0:10, 0:10] = np.random.randint( + 0, 255, (10, 10, 3), dtype=np.uint8 + ) + cv2.imwrite(str(path), img) + return path From 07c8559f372ec74814fa719fbddd97911982d410 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 16:35:18 -0500 Subject: [PATCH 2/8] fixed task names, removed timestamp --- luxonis_ml/data/datasets/annotation.py | 112 ++++++++++++---------- luxonis_ml/data/loaders/luxonis_loader.py | 29 +++--- luxonis_ml/data/utils/__init__.py | 3 +- luxonis_ml/data/utils/parquet.py | 13 +-- 4 files changed, 79 insertions(+), 78 deletions(-) diff --git a/luxonis_ml/data/datasets/annotation.py b/luxonis_ml/data/datasets/annotation.py index 2f85b2ed..357a88df 100644 --- a/luxonis_ml/data/datasets/annotation.py +++ b/luxonis_ml/data/datasets/annotation.py @@ -1,7 +1,6 @@ import json import logging from abc import ABC, abstractmethod -from datetime import datetime, timezone from pathlib import Path from typing import Any, Dict, Iterable, List, Literal, Optional, Tuple, Union @@ -14,7 +13,7 @@ from typeguard import check_type from typing_extensions import Annotated, Self, TypeAlias, override -from luxonis_ml.data.utils.parquet import ParquetDetection, ParquetRecord +from luxonis_ml.data.utils.parquet import ParquetRecord from luxonis_ml.utils import BaseModelExtraForbid logger = logging.getLogger(__name__) @@ -41,43 +40,6 @@ class Detection(BaseModelExtraForbid): sub_detections: Dict[str, "Detection"] = {} - def to_parquet_rows(self) -> Iterable[ParquetDetection]: - yield from self._to_parquet_rows() - - def _to_parquet_rows(self, prefix: str = "") -> Iterable[ParquetDetection]: - for task_type in [ - "boundingbox", - "keypoints", - "segmentation", - "instance_segmentation", - "array", - ]: - label: Optional[Annotation] = getattr(self, task_type) - - if label is not None: - yield { - "class_name": self.class_name, - "instance_id": self.instance_id, - "task_type": f"{prefix}{task_type}", - "annotation": label.model_dump_json(), - } - for key, data in self.metadata.items(): - yield { - "class_name": self.class_name, - "instance_id": self.instance_id, - "task_type": f"{prefix}metadata/{key}", - "annotation": json.dumps(data), - } - if self.class_name is not None: - yield { - "class_name": self.class_name, - "instance_id": self.instance_id, - "task_type": f"{prefix}classification", - "annotation": "{}", - } - for name, detection in self.sub_detections.items(): - yield from detection._to_parquet_rows(f"{prefix}{name}/") - @model_validator(mode="after") def validate_names(self) -> Self: for name in self.sub_detections: @@ -533,28 +495,74 @@ def to_parquet_rows(self) -> Iterable[ParquetRecord]: @rtype: L{ParquetDict} @return: A dictionary of annotation data. """ - timestamp = datetime.now(timezone.utc) + yield from self._to_parquet_rows(self.annotation, self.task) + + def _to_parquet_rows( + self, annotation: Optional[Detection], task_name: str + ) -> Iterable[ParquetRecord]: + """Converts an annotation to a dictionary for writing to a + parquet file. + + @rtype: L{ParquetDict} + @return: A dictionary of annotation data. + """ for source, file_path in self.files.items(): - if self.annotation is not None: - for detection in self.annotation.to_parquet_rows(): - yield { - "file": str(file_path), - "source_name": source, - "task_name": self.task, - "created_at": timestamp, - **detection, - } - else: + if annotation is None: yield { "file": str(file_path), "source_name": source, - "task_name": self.task, - "created_at": timestamp, + "task_name": task_name, "class_name": None, "instance_id": None, "task_type": None, "annotation": None, } + else: + for task_type in [ + "boundingbox", + "keypoints", + "segmentation", + "instance_segmentation", + "array", + ]: + label: Optional[Annotation] = getattr( + annotation, task_type + ) + + if label is not None: + yield { + "file": str(file_path), + "source_name": source, + "task_name": task_name, + "class_name": annotation.class_name, + "instance_id": annotation.instance_id, + "task_type": task_type, + "annotation": label.model_dump_json(), + } + for key, data in annotation.metadata.items(): + yield { + "file": str(file_path), + "source_name": source, + "task_name": task_name, + "class_name": annotation.class_name, + "instance_id": annotation.instance_id, + "task_type": f"metadata/{key}", + "annotation": json.dumps(data), + } + if annotation.class_name is not None: + yield { + "file": str(file_path), + "source_name": source, + "task_name": task_name, + "class_name": annotation.class_name, + "instance_id": annotation.instance_id, + "task_type": "classification", + "annotation": "{}", + } + for name, detection in annotation.sub_detections.items(): + yield from self._to_parquet_rows( + detection, f"{task_name}/{name}" + ) def check_valid_identifier(name: str, *, label: str) -> None: diff --git a/luxonis_ml/data/loaders/luxonis_loader.py b/luxonis_ml/data/loaders/luxonis_loader.py index 1210cfac..d633f329 100644 --- a/luxonis_ml/data/loaders/luxonis_loader.py +++ b/luxonis_ml/data/loaders/luxonis_loader.py @@ -104,10 +104,7 @@ def __init__( view = [view] self.view = view - df = self.dataset._load_df_offline() - if df is None: - raise FileNotFoundError("No data found in the dataset.") - self.df = df + self.df = self.dataset._load_df_offline(raise_when_empty=True) if not self.dataset.is_remote: file_index = self.dataset._get_file_index() @@ -115,7 +112,7 @@ def __init__( raise FileNotFoundError("Cannot find file index") self.df = self.df.join(file_index, on="uuid").drop("file_right") - self.classes, self.classes_by_task = self.dataset.get_classes() + self.classes = self.dataset.get_classes() self.augmentations = self._init_augmentations( augmentation_engine, augmentation_config or [], @@ -147,7 +144,7 @@ def __init__( class_: i for i, class_ in enumerate( sorted( - self.classes_by_task.get(task, []), + self.classes.get(task, []), key=lambda x: {"background": -1}.get(x, 0), ) ) @@ -173,8 +170,8 @@ def __init__( "assigned to one class or rename your background class." ) self.tasks_without_background.add(task) - if "background" not in self.classes_by_task[task_name]: - self.classes_by_task[task_name].append("background") + if "background" not in self.classes[task_name]: + self.classes[task_name].append("background") self.class_mappings[task_name] = { class_: idx + 1 for class_, idx in self.class_mappings[ @@ -235,7 +232,7 @@ def _load_data(self, idx: int) -> Tuple[np.ndarray, Labels]: if not self.dataset.is_remote: img_path = ann_rows[0][-1] else: - uuid = ann_rows[0][8] + uuid = ann_rows[0][7] file_extension = ann_rows[0][0].rsplit(".", 1)[-1] img_path = self.dataset.media_path / f"{uuid}.{file_extension}" @@ -250,10 +247,10 @@ def _load_data(self, idx: int) -> Tuple[np.ndarray, Labels]: for annotation_data in ann_rows: task_name: str = annotation_data[2] - class_name: Optional[str] = annotation_data[4] - instance_id: int = annotation_data[5] - task_type: str = annotation_data[6] - ann_str: Optional[str] = annotation_data[7] + class_name: Optional[str] = annotation_data[3] + instance_id: int = annotation_data[4] + task_type: str = annotation_data[5] + ann_str: Optional[str] = annotation_data[6] if ann_str is None: continue @@ -302,7 +299,7 @@ def _load_data(self, idx: int) -> Tuple[np.ndarray, Labels]: array = anns[0].combine_to_numpy( anns, class_ids_by_task[task], - len(self.classes_by_task[task_name]), + len(self.classes[task_name]), ) if task in self.tasks_without_background: unassigned_pixels = ~np.any(array, axis=0) @@ -363,7 +360,9 @@ def _init_augmentations( return None targets = { - task: get_task_type(task) for task in self.dataset.get_tasks() + f"{task_name}/{task_type}": task_type + for task_name, task_types in self.dataset.get_tasks().items() + for task_type in task_types } return AUGMENTATION_ENGINES.get(augmentation_engine)( diff --git a/luxonis_ml/data/utils/__init__.py b/luxonis_ml/data/utils/__init__.py index 39967645..01f0a6ad 100644 --- a/luxonis_ml/data/utils/__init__.py +++ b/luxonis_ml/data/utils/__init__.py @@ -1,6 +1,6 @@ from .data_utils import infer_task, rgb_to_bool_masks, warn_on_duplicates from .enums import BucketStorage, BucketType, ImageType, MediaType, UpdateMode -from .parquet import ParquetDetection, ParquetFileManager, ParquetRecord +from .parquet import ParquetFileManager, ParquetRecord from .task_utils import ( get_task_name, get_task_type, @@ -18,7 +18,6 @@ "warn_on_duplicates", "rgb_to_bool_masks", "ParquetRecord", - "ParquetDetection", "ParquetFileManager", "MediaType", "ImageType", diff --git a/luxonis_ml/data/utils/parquet.py b/luxonis_ml/data/utils/parquet.py index c1deaa9e..70139d8d 100644 --- a/luxonis_ml/data/utils/parquet.py +++ b/luxonis_ml/data/utils/parquet.py @@ -1,4 +1,3 @@ -from datetime import datetime from pathlib import Path from typing import Optional, TypedDict @@ -7,20 +6,16 @@ from luxonis_ml.typing import PathType -class ParquetDetection(TypedDict): +class ParquetRecord(TypedDict): + file: str + source_name: str + task_name: str class_name: Optional[str] instance_id: Optional[int] task_type: Optional[str] annotation: Optional[str] -class ParquetRecord(ParquetDetection): - file: str - source_name: str - task_name: str - created_at: datetime - - class ParquetFileManager: def __init__(self, directory: PathType, num_rows: int = 100_000) -> None: """Manages the insertion of data into parquet files. From cff18dc09e5b4d9819f1ffae267c87adfee0e102 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 16:35:58 -0500 Subject: [PATCH 3/8] changed signature of get_tasks --- luxonis_ml/data/__main__.py | 57 +++++---- luxonis_ml/data/datasets/base_dataset.py | 11 +- luxonis_ml/data/datasets/luxonis_dataset.py | 125 ++++++++++++-------- 3 files changed, 113 insertions(+), 80 deletions(-) diff --git a/luxonis_ml/data/__main__.py b/luxonis_ml/data/__main__.py index 2bc79778..d2b5b83b 100644 --- a/luxonis_ml/data/__main__.py +++ b/luxonis_ml/data/__main__.py @@ -1,7 +1,7 @@ import logging import random from pathlib import Path -from typing import List, Optional, Tuple +from typing import List, Optional, Set, Tuple import cv2 import numpy as np @@ -16,7 +16,6 @@ from luxonis_ml.data import LuxonisDataset, LuxonisLoader, LuxonisParser from luxonis_ml.data.utils.constants import LDF_VERSION -from luxonis_ml.data.utils.task_utils import split_task, task_is_metadata from luxonis_ml.data.utils.visualizations import visualize from luxonis_ml.enums import DatasetType @@ -47,39 +46,47 @@ def check_exists(name: str): raise typer.Exit() -def get_dataset_info(name: str) -> Tuple[int, List[str], List[str]]: - dataset = LuxonisDataset(name) - size = len(dataset) - classes, _ = dataset.get_classes() - return size, classes, dataset.get_task_names() +def get_dataset_info(dataset: LuxonisDataset) -> Tuple[Set[str], List[str]]: + all_classes = { + c for classes in dataset.get_classes().values() for c in classes + } + return all_classes, dataset.get_task_names() def print_info(name: str) -> None: dataset = LuxonisDataset(name) - _, classes = dataset.get_classes() + classes = dataset.get_classes() class_table = Table( title="Classes", box=rich.box.ROUNDED, row_styles=["yellow", "cyan"] ) - class_table.add_column("Task Name", header_style="magenta i", max_width=30) - class_table.add_column("Classes", header_style="magenta i", max_width=50) + if len(classes) > 1 or next(iter(classes)): + class_table.add_column( + "Task Name", header_style="magenta i", max_width=30 + ) + class_table.add_column( + "Class Names", header_style="magenta i", max_width=50 + ) for task_name, c in classes.items(): - class_table.add_row(task_name, ", ".join(c)) + if not task_name: + class_table.add_row(", ".join(c)) + else: + class_table.add_row(task_name, ", ".join(c)) tasks = dataset.get_tasks() - tasks.sort(key=task_is_metadata) task_table = Table( title="Tasks", box=rich.box.ROUNDED, row_styles=["yellow", "cyan"] ) - task_table.add_column("Task Name", header_style="magenta i", max_width=30) - task_table.add_column("Task Type", header_style="magenta i", max_width=50) - separated = False - for task in tasks: - if task_is_metadata(task): - if not separated: - task_table.add_section() - separated = True - task_name, task_type = split_task(task) - task_table.add_row(task_name, task_type) + if len(tasks) > 1 or next(iter(tasks)): + task_table.add_column( + "Task Name", header_style="magenta i", max_width=30 + ) + task_table.add_column("Task Types", header_style="magenta i", max_width=50) + for task_name, task_types in tasks.items(): + task_types.sort() + if not task_name: + task_table.add_row(", ".join(task_types)) + else: + task_table.add_row(task_name, ", ".join(task_types)) splits = dataset.get_splits() @@ -150,7 +157,7 @@ def ls( size = -1 rows.append(str(size)) if full: - _, classes, tasks = get_dataset_info(name) + classes, tasks = get_dataset_info(dataset) rows.extend( [ ", ".join(classes) if classes else "[red][no red]", @@ -250,7 +257,7 @@ def inspect( if len(dataset) == 0: raise ValueError(f"Dataset '{name}' is empty.") - class_names = dataset.get_classes()[1] + classes = dataset.get_classes() for image, labels in loader: image = image.astype(np.uint8) image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) @@ -258,7 +265,7 @@ def inspect( h, w, _ = image.shape new_h, new_w = int(h * size_multiplier), int(w * size_multiplier) image = cv2.resize(image, (new_w, new_h)) - image = visualize(image, labels, class_names, blend_all=blend_all) + image = visualize(image, labels, classes, blend_all=blend_all) cv2.imshow("image", image) if cv2.waitKey() == ord("q"): break diff --git a/luxonis_ml/data/datasets/base_dataset.py b/luxonis_ml/data/datasets/base_dataset.py index edc93cbf..7f013bbf 100644 --- a/luxonis_ml/data/datasets/base_dataset.py +++ b/luxonis_ml/data/datasets/base_dataset.py @@ -6,7 +6,6 @@ from luxonis_ml.data.datasets.annotation import DatasetRecord from luxonis_ml.data.datasets.source import LuxonisSource -from luxonis_ml.data.utils.task_utils import get_task_name from luxonis_ml.typing import PathType from luxonis_ml.utils import AutoRegisterMeta, Registry @@ -41,11 +40,11 @@ def version(self) -> Version: ... @abstractmethod - def get_tasks(self) -> List[str]: - """Returns the list of tasks in the dataset. + def get_tasks(self) -> Dict[str, str]: + """Returns a dictionary mapping task names to task types. - @rtype: List[str] - @return: List of task names. + @rtype: Dict[str, str] + @return: A dictionary mapping task names to task types. """ ... @@ -202,4 +201,4 @@ def get_task_names(self) -> List[str]: @rtype: List[str] @return: List of task names. """ - return [get_task_name(task) for task in self.get_tasks()] + return list(self.get_tasks().keys()) diff --git a/luxonis_ml/data/datasets/luxonis_dataset.py b/luxonis_ml/data/datasets/luxonis_dataset.py index 5e9af4b1..f88ba89a 100644 --- a/luxonis_ml/data/datasets/luxonis_dataset.py +++ b/luxonis_ml/data/datasets/luxonis_dataset.py @@ -66,7 +66,7 @@ class Metadata(TypedDict): source: LuxonisSource.LuxonisSourceDocument ldf_version: str classes: Dict[str, List[str]] - tasks: List[str] + tasks: Dict[str, List[str]] skeletons: Dict[str, Skeletons] @@ -246,7 +246,7 @@ def _save_df_offline(self, pl_df: pl.DataFrame) -> None: data_dict = dict(row) data_dict.pop("uuid", None) - pfm.write(uuid_val, data_dict) + pfm.write(uuid_val, data_dict) # type: ignore logger.info( f"Saved merged DataFrame to Parquet files in '{annotations_path}'." @@ -273,14 +273,6 @@ def _merge_metadata_with(self, other: "LuxonisDataset") -> None: ) else: existing_val.update(value) - - elif ( - key == "tasks" - and isinstance(existing_val, list) - and isinstance(value, list) - ): - combined = set(existing_val).union(value) - self.metadata[key] = list(combined) else: self.metadata[key] = value self._write_metadata() @@ -359,17 +351,16 @@ def merge_with( @param new_dataset_name: The name of the new dataset to create if inplace is False. """ - if not inplace and not new_dataset_name: + if inplace: + target_dataset = self + elif new_dataset_name: + target_dataset = self.clone(new_dataset_name, push_to_cloud=False) + else: raise ValueError( - "You must specify a name for the new dataset when inplace is False." + "You must specify a name for the new dataset " + "when inplace is False" ) - target_dataset = ( - self - if inplace - else self.clone(new_dataset_name, push_to_cloud=False) - ) - if self.is_remote: other.sync_from_cloud(update_mode=UpdateMode.ALWAYS) self.sync_from_cloud( @@ -378,8 +369,8 @@ def merge_with( else UpdateMode.IF_EMPTY ) - df_self = self._load_df_offline() - df_other = other._load_df_offline() + df_self = self._load_df_offline(raise_when_empty=True) + df_other = other._load_df_offline(raise_when_empty=True) duplicate_uuids = set(df_self["uuid"]).intersection(df_other["uuid"]) if duplicate_uuids: df_other = df_other.filter( @@ -389,8 +380,8 @@ def merge_with( df_merged = pl.concat([df_self, df_other]) target_dataset._save_df_offline(df_merged) - file_index_self = self._get_file_index() - file_index_other = other._get_file_index() + file_index_self = self._get_file_index(raise_when_empty=True) + file_index_other = other._get_file_index(raise_when_empty=True) file_index_duplicates = set(file_index_self["uuid"]).intersection( file_index_other["uuid"] ) @@ -444,16 +435,28 @@ def _save_splits(self, splits: Dict[str, List[str]]) -> None: @overload def _load_df_offline( - self, lazy: Literal[False] = ... + self, + lazy: Literal[False] = ..., + raise_when_empty: Literal[False] = ..., ) -> Optional[pl.DataFrame]: ... @overload def _load_df_offline( - self, lazy: Literal[True] = ... + self, lazy: Literal[False] = ..., raise_when_empty: Literal[True] = ... + ) -> pl.DataFrame: ... + + @overload + def _load_df_offline( + self, lazy: Literal[True] = ..., raise_when_empty: Literal[False] = ... ) -> Optional[pl.LazyFrame]: ... + @overload + def _load_df_offline( + self, lazy: Literal[True] = ..., raise_when_empty: Literal[True] = ... + ) -> pl.LazyFrame: ... + def _load_df_offline( - self, lazy: bool = False + self, lazy: bool = False, raise_when_empty: bool = False ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]: """Loads the dataset DataFrame **always** from the local storage.""" @@ -467,6 +470,10 @@ def _load_df_offline( ) if not path.exists(): + if raise_when_empty: + raise FileNotFoundError( + f"Dataset '{self.dataset_name}' is empty." + ) return None if lazy: @@ -476,6 +483,9 @@ def _load_df_offline( dfs = [pl.read_parquet(file) for file in path.glob("*.parquet")] df = pl.concat(dfs) if dfs else None + if df is None and raise_when_empty: + raise FileNotFoundError(f"Dataset '{self.dataset_name}' is empty.") + if self.version == LDF_VERSION or df is None: return df @@ -493,7 +503,6 @@ def _load_df_offline( "file", "source_name", "task_name", - "created_at", "class_name", "instance_id", "task_type", @@ -505,16 +514,39 @@ def _load_df_offline( @overload def _get_file_index( - self, lazy: Literal[False] = ..., sync_from_cloud: bool = ... + self, + lazy: Literal[False] = ..., + sync_from_cloud: bool = ..., + raise_when_empty: Literal[False] = ..., ) -> Optional[pl.DataFrame]: ... + @overload + def _get_file_index( + self, + lazy: Literal[False] = ..., + sync_from_cloud: bool = ..., + raise_when_empty: Literal[True] = ..., + ) -> pl.DataFrame: ... @overload def _get_file_index( - self, lazy: Literal[True] = ..., sync_from_cloud: bool = ... + self, + lazy: Literal[True] = ..., + sync_from_cloud: bool = ..., + raise_when_empty: Literal[False] = ..., ) -> Optional[pl.LazyFrame]: ... + @overload + def _get_file_index( + self, + lazy: Literal[True] = ..., + sync_from_cloud: bool = ..., + raise_when_empty: Literal[True] = ..., + ) -> pl.LazyFrame: ... def _get_file_index( - self, lazy: bool = False, sync_from_cloud: bool = False + self, + lazy: bool = False, + sync_from_cloud: bool = False, + raise_when_empty: bool = False, ) -> Optional[Union[pl.DataFrame, pl.LazyFrame]]: """Loads the file index DataFrame from the local storage or the cloud if sync_from_cloud. @@ -540,6 +572,10 @@ def _get_file_index( return df.select(pl.all().exclude("^__index_level_.*$")) + if raise_when_empty: + raise FileNotFoundError( + f"File index for dataset '{self.dataset_name}' is empty." + ) return None def _write_index( @@ -596,7 +632,7 @@ def _get_metadata(self) -> Metadata: "source": LuxonisSource().to_document(), "ldf_version": str(LDF_VERSION), "classes": {}, - "tasks": [], + "tasks": {}, "skeletons": {}, } @@ -630,15 +666,8 @@ def set_classes( self._write_metadata() @override - def get_classes(self) -> Tuple[List[str], Dict[str, List[str]]]: - all_classes = list( - { - c - for classes in self.metadata["classes"].values() - for c in classes - } - ) - return sorted(all_classes), self.metadata["classes"] + def get_classes(self) -> Dict[str, List[str]]: + return self.metadata["classes"] @override def set_skeletons( @@ -671,8 +700,8 @@ def get_skeletons( } @override - def get_tasks(self) -> List[str]: - return self.metadata.get("tasks", []) + def get_tasks(self) -> Dict[str, List[str]]: + return self.metadata.get("tasks", {}) def sync_from_cloud( self, update_mode: UpdateMode = UpdateMode.IF_EMPTY @@ -858,7 +887,7 @@ def add( if ann is not None: if not explicit_task: record.task = infer_task( - record.task, ann.class_name, self.get_classes()[1] + record.task, ann.class_name, self.get_classes() ) if ann.class_name is not None: classes_per_task[record.task].add(ann.class_name) @@ -883,7 +912,7 @@ def add( with suppress(shutil.SameFileError): self.fs.put_dir(annotations_path, "") - _, curr_classes = self.get_classes() + curr_classes = self.get_classes() for task, classes in classes_per_task.items(): old_classes = set(curr_classes.get(task, [])) new_classes = list(classes - old_classes) @@ -912,14 +941,14 @@ def _save_tasks_to_metadata(self) -> None: df = self._load_df_offline() if df is None: return - tasks = [] + tasks = defaultdict(list) for task_name, task_type in ( df.select("task_name", "task_type") .unique() .drop_nulls() .iter_rows() ): - tasks.append(f"{task_name}/{task_type}") + tasks[task_name].append(task_type) self.metadata["tasks"] = tasks self._write_metadata() @@ -1026,9 +1055,7 @@ def make_splits( if definitions is None: ratios = ratios or {"train": 0.8, "val": 0.1, "test": 0.1} - df = self._load_df_offline() - if df is None: - raise FileNotFoundError("No data found in dataset") + df = self._load_df_offline(raise_when_empty=True) ids = ( df.filter(~pl.col("uuid").is_in(defined_uuids)) .select("uuid") @@ -1146,7 +1173,7 @@ def list_datasets( if not fs.exists(): return [] - def process_directory(path: str) -> Optional[str]: + def process_directory(path: PathType) -> Optional[str]: path = Path(path) metadata_path = path / "metadata" / "metadata.json" if fs.exists(metadata_path): From 176859ef0875c889fc3ae8cd0005ac8afa1f1c09 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 16:46:06 -0500 Subject: [PATCH 4/8] added string formatting --- tests/test_data/test_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 894078bb..9142ea47 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -517,8 +517,8 @@ def generator1(): def test_merge_datasets( - bucket_storage: BucketStorage, dataset_name: str, + bucket_storage: BucketStorage, tempdir: Path, subtests: SubTests, ): @@ -581,7 +581,7 @@ def generator2(): dataset1_merged_with_dataset2 = dataset1.merge_with( dataset2, inplace=False, - new_dataset_name=dataset1_name + "_" + dataset2_name + "_merged", + new_dataset_name=f"{dataset1_name}_{dataset2_name}_merged", ) classes = dataset1_merged_with_dataset2.get_classes() From 12b168715c0f004b482e5ab172abc4291c60738f Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 16:49:57 -0500 Subject: [PATCH 5/8] updated doc --- luxonis_ml/data/datasets/base_dataset.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/luxonis_ml/data/datasets/base_dataset.py b/luxonis_ml/data/datasets/base_dataset.py index 7f013bbf..3d086339 100644 --- a/luxonis_ml/data/datasets/base_dataset.py +++ b/luxonis_ml/data/datasets/base_dataset.py @@ -74,13 +74,12 @@ def set_classes( ... @abstractmethod - def get_classes(self) -> Tuple[List[str], Dict[str, List[str]]]: - """Gets overall classes in the dataset and classes according to - computer vision task. + def get_classes(self) -> Dict[str, List[str]]: + """Get classes according to computer vision tasks. - @rtype: Tuple[List[str], Dict] - @return: A combined list of classes for all tasks and a - dictionary mapping tasks to the classes used in each task. + @rtype: Dict[str, List[str]] + @return: A dictionary mapping tasks to the classes used in each + task. """ ... From 7f144545c662ae02586d4edd27a2f3a2e5decfd2 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 17:58:39 -0500 Subject: [PATCH 6/8] removed print --- tests/test_data/test_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 9142ea47..b3e30a89 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -308,7 +308,6 @@ def generator(): dataset.make_splits() loader = LuxonisLoader(dataset) for _, labels in loader: - print(labels.keys()) labels = {get_task_type(k): v for k, v in labels.items()} assert { "metadata/color", From dd831b0246e995f8311d94c3568559c58ad7df82 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 19:25:14 -0500 Subject: [PATCH 7/8] missing doc --- luxonis_ml/data/loaders/luxonis_loader.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/luxonis_ml/data/loaders/luxonis_loader.py b/luxonis_ml/data/loaders/luxonis_loader.py index d633f329..25b752ea 100644 --- a/luxonis_ml/data/loaders/luxonis_loader.py +++ b/luxonis_ml/data/loaders/luxonis_loader.py @@ -85,6 +85,12 @@ def __init__( @type width: Optional[int] @param width: The width of the output images. Defaults to C{None}. + @type keep_aspect_ratio: bool + @param keep_aspect_ratio: Whether to keep the aspect ratio of the + images. Defaults to C{True}. + @type out_image_format: Literal["RGB", "BGR"] + @param out_image_format: The format of the output images. Defaults + to C{"RGB"}. @type update_mode: UpdateMode @param update_mode: Enum that determines the sync mode: - UpdateMode.ALWAYS: Force a fresh download From 37fed8158e70c051d9272b417440e8be25492be1 Mon Sep 17 00:00:00 2001 From: Martin Kozlovsky Date: Wed, 15 Jan 2025 20:01:31 -0500 Subject: [PATCH 8/8] fixed test --- tests/test_data/test_dataset.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index b3e30a89..b831b280 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -516,11 +516,12 @@ def generator1(): def test_merge_datasets( - dataset_name: str, bucket_storage: BucketStorage, + dataset_name: str, tempdir: Path, subtests: SubTests, ): + dataset_name = f"{dataset_name}_{bucket_storage.value}" dataset1_name = f"{dataset_name}_1" dataset1 = LuxonisDataset( dataset1_name, @@ -583,8 +584,8 @@ def generator2(): new_dataset_name=f"{dataset1_name}_{dataset2_name}_merged", ) - classes = dataset1_merged_with_dataset2.get_classes() - assert set(classes[""]) == {"person", "dog"} + classes = dataset1_merged_with_dataset2.get_classes() + assert set(classes[""]) == {"person", "dog"} df_merged = dataset1_merged_with_dataset2._load_df_offline() df_cloned_merged = dataset1.merge_with(