add tests and significantly speed up LuxonisDataset.list_datasets
JSabadin committed Jan 9, 2025
1 parent b31516e commit 75962c1
Showing 2 changed files with 105 additions and 13 deletions.
30 changes: 17 additions & 13 deletions luxonis_ml/data/datasets/luxonis_dataset.py
@@ -4,6 +4,7 @@
 import shutil
 import tempfile
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import suppress
 from functools import cached_property
 from pathlib import Path
@@ -1107,22 +1108,20 @@ def list_datasets(
         bucket_storage: BucketStorage = BucketStorage.LOCAL,
         bucket: Optional[str] = None,
     ) -> List[str]:
-        """Returns a dictionary of all datasets.
+        """Returns a list of all datasets.

         @type team_id: Optional[str]
         @param team_id: Optional team identifier
         @type bucket_storage: BucketStorage
-        @param bucket_storage: Underlying bucket storage from C{local},
-            C{S3}, or C{GCS}. Default is C{local}.
+        @param bucket_storage: Underlying bucket storage (local, S3, or
+            GCS). Default is local.
         @type bucket: Optional[str]
-        @param bucket: Name of the bucket. Default is C{None}.
+        @param bucket: Name of the bucket. Default is None.
         @rtype: List[str]
         @return: List of all dataset names.
         """
         base_path = environ.LUXONISML_BASE_PATH

         team_id = team_id or environ.LUXONISML_TEAM_ID
-        names = []
-
         if bucket_storage == BucketStorage.LOCAL:
             fs = LuxonisFileSystem(
@@ -1142,13 +1141,18 @@ def list_datasets(
         if not fs.exists():
             return []

-        for path in fs.walk_dir("", recursive=False, typ="directory"):
+        def process_directory(path: str) -> Optional[str]:
             path = Path(path)
             metadata_path = path / "metadata" / "metadata.json"
-            if not fs.exists(metadata_path):
-                continue
-            metadata_text = fs.read_text(metadata_path)
-            if isinstance(metadata_text, bytes):
-                metadata_text = metadata_text.decode()
-            names.append(path.name)
+            if fs.exists(metadata_path):
+                return path.name
+            return None
+
+        # Collect directory paths and process them in parallel
+        paths = list(fs.walk_dir("", recursive=False, typ="directory"))
+        with ThreadPoolExecutor() as executor:
+            names = [
+                name for name in executor.map(process_directory, paths) if name
+            ]
+
         return names
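
Why this is faster: the old loop read and decoded every metadata.json even though the contents were never used, and it did so sequentially. The new code performs a single fs.exists check per directory and fans the checks out over a thread pool; on S3 or GCS each check is a network round-trip during which the thread releases the GIL, so the round-trips overlap instead of queueing. Below is a minimal, self-contained sketch of the same pattern, assuming a plain local filesystem; list_dataset_names and the use of pathlib are illustrative stand-ins for the library's LuxonisFileSystem, not code from this commit.

# A minimal sketch of the commit's pattern, not library code: check for a
# marker file in each child directory concurrently and keep only the hits.
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import List, Optional


def list_dataset_names(base_path: Path) -> List[str]:
    def process_directory(path: Path) -> Optional[str]:
        # One existence check per directory; on remote storage this is a
        # network round-trip, which is why running the checks in threads
        # pays off.
        if (path / "metadata" / "metadata.json").exists():
            return path.name
        return None

    paths = [p for p in base_path.iterdir() if p.is_dir()]
    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so the listing stays
        # deterministic regardless of which check finishes first.
        return [
            name for name in executor.map(process_directory, paths) if name
        ]

Note that ThreadPoolExecutor() with no arguments sizes its pool automatically (min(32, os.cpu_count() + 4) on current CPython), a reasonable default for I/O-bound work like this.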
88 changes: 88 additions & 0 deletions tests/test_data/test_dataset.py
@@ -488,3 +488,91 @@ def generator():
             "detection/segmentation",
         },
     )
+
+
+def test_clone_and_merge_dataset(tempdir: Path, bucket_storage: BucketStorage):
+    dataset1_name = "test_merge_1"
+    dataset1 = LuxonisDataset(
+        dataset1_name,
+        bucket_storage=bucket_storage,
+        delete_existing=True,
+        delete_remote=True,
+    )
+
+    def generator1():
+        for i in range(3):
+            img = create_image(i, tempdir)
+            yield {
+                "file": img,
+                "annotation": {
+                    "class": "person",
+                    "boundingbox": {"x": 0.1, "y": 0.1, "w": 0.1, "h": 0.1},
+                },
+            }
+
+    dataset1.add(generator1())
+    dataset1.make_splits({"train": 0.6, "val": 0.4})
+
+    dataset2_name = "test_merge_2"
+    dataset2 = LuxonisDataset(
+        dataset2_name,
+        bucket_storage=bucket_storage,
+        delete_existing=True,
+        delete_remote=True,
+    )
+
+    def generator2():
+        for i in range(3, 6):
+            img = create_image(i, tempdir)
+            yield {
+                "file": img,
+                "annotation": {
+                    "class": "dog",
+                    "boundingbox": {"x": 0.2, "y": 0.2, "w": 0.2, "h": 0.2},
+                },
+            }
+
+    dataset2.add(generator2())
+    dataset2.make_splits({"train": 0.6, "val": 0.4})
+
+    cloned_dataset1 = dataset1.clone(new_dataset_name=dataset1_name + "_cloned")
+
+    assert cloned_dataset1.get_splits() == dataset1.get_splits()
+    assert cloned_dataset1.get_classes() == dataset1.get_classes()
+    assert cloned_dataset1.get_task_names() == dataset1.get_task_names()
+    assert cloned_dataset1.get_skeletons() == dataset1.get_skeletons()
+
+    df_cloned = cloned_dataset1._load_df_offline()
+    df_original = dataset1._load_df_offline()
+    assert df_cloned.equals(df_original)
+
+    cloned_dataset1_merged_with_dataset2 = cloned_dataset1.merge_with(
+        dataset2, inplace=True
+    )
+
+    dataset1_merged_with_dataset2 = dataset1.merge_with(
+        dataset2,
+        inplace=False,
+        new_dataset_name=dataset1_name + "_" + dataset2_name + "_merged",
+    )
+
+    assert (
+        dataset1_merged_with_dataset2.get_splits()
+        == cloned_dataset1_merged_with_dataset2.get_splits()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_classes()
+        == cloned_dataset1_merged_with_dataset2.get_classes()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_task_names()
+        == cloned_dataset1_merged_with_dataset2.get_task_names()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_skeletons()
+        == cloned_dataset1_merged_with_dataset2.get_skeletons()
+    )
+
+    df_merged = dataset1_merged_with_dataset2._load_df_offline()
+    df_cloned_merged = cloned_dataset1_merged_with_dataset2._load_df_offline()
+    assert df_merged.equals(df_cloned_merged)
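
A note on the new test's parameters: tempdir and bucket_storage are pytest fixtures defined elsewhere in the test suite, not in this diff. One plausible shape for the bucket_storage fixture, shown purely as an assumption (including the import path), is a parametrized fixture that runs the test once per storage backend:

# Hypothetical conftest.py fixture (an assumption, not part of this commit):
# each test that requests bucket_storage runs once per backend.
import pytest

from luxonis_ml.data import BucketStorage


@pytest.fixture(
    params=[BucketStorage.LOCAL, BucketStorage.S3, BucketStorage.GCS]
)
def bucket_storage(request: pytest.FixtureRequest) -> BucketStorage:
    return request.param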
