add tests and significantly speed up LuxonisDataset.list_datasets
JSabadin committed Jan 9, 2025
1 parent b31516e commit 75962c1
Showing 2 changed files with 105 additions and 13 deletions.
30 changes: 17 additions & 13 deletions luxonis_ml/data/datasets/luxonis_dataset.py
@@ -4,6 +4,7 @@
 import shutil
 import tempfile
 from collections import defaultdict
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import suppress
 from functools import cached_property
 from pathlib import Path
@@ -1107,22 +1108,20 @@ def list_datasets(
         bucket_storage: BucketStorage = BucketStorage.LOCAL,
         bucket: Optional[str] = None,
     ) -> List[str]:
-        """Returns a dictionary of all datasets.
+        """Returns a list of all datasets.

         @type team_id: Optional[str]
         @param team_id: Optional team identifier
         @type bucket_storage: BucketStorage
-        @param bucket_storage: Underlying bucket storage from C{local},
-            C{S3}, or C{GCS}. Default is C{local}.
+        @param bucket_storage: Underlying bucket storage (local, S3, or
+            GCS). Default is local.
         @type bucket: Optional[str]
-        @param bucket: Name of the bucket. Default is C{None}.
+        @param bucket: Name of the bucket. Default is None.
         @rtype: List[str]
         @return: List of all dataset names.
         """
         base_path = environ.LUXONISML_BASE_PATH

         team_id = team_id or environ.LUXONISML_TEAM_ID
-        names = []
-
         if bucket_storage == BucketStorage.LOCAL:
             fs = LuxonisFileSystem(
@@ -1142,13 +1141,18 @@ def list_datasets(
         if not fs.exists():
             return []

-        for path in fs.walk_dir("", recursive=False, typ="directory"):
+        def process_directory(path: str) -> Optional[str]:
             path = Path(path)
             metadata_path = path / "metadata" / "metadata.json"
-            if not fs.exists(metadata_path):
-                continue
-            metadata_text = fs.read_text(metadata_path)
-            if isinstance(metadata_text, bytes):
-                metadata_text = metadata_text.decode()
-            names.append(path.name)
+            if fs.exists(metadata_path):
+                return path.name
+            return None
+
+        # Collect directory paths and process them in parallel
+        paths = list(fs.walk_dir("", recursive=False, typ="directory"))
+        with ThreadPoolExecutor() as executor:
+            names = [
+                name for name in executor.map(process_directory, paths) if name
+            ]
+
         return names
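
Why this is faster: the old loop read and decoded every metadata.json even though the contents were never used, and it did so sequentially. The new code performs a single fs.exists check per directory and fans the checks out over a thread pool; on S3 or GCS each check is a network round-trip during which the thread releases the GIL, so the round-trips overlap instead of queueing. Below is a minimal, self-contained sketch of the same pattern, assuming a plain local filesystem; list_dataset_names and the use of pathlib are illustrative stand-ins for the library's LuxonisFileSystem, not code from this commit.

# A minimal sketch of the commit's pattern, not library code: check for a
# marker file in each child directory concurrently and keep only the hits.
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import List, Optional


def list_dataset_names(base_path: Path) -> List[str]:
    def process_directory(path: Path) -> Optional[str]:
        # One existence check per directory; on remote storage this is a
        # network round-trip, which is why running the checks in threads
        # pays off.
        if (path / "metadata" / "metadata.json").exists():
            return path.name
        return None

    paths = [p for p in base_path.iterdir() if p.is_dir()]
    with ThreadPoolExecutor() as executor:
        # executor.map preserves input order, so the listing stays
        # deterministic regardless of which check finishes first.
        return [
            name for name in executor.map(process_directory, paths) if name
        ]

Note that ThreadPoolExecutor() with no arguments sizes its pool automatically (min(32, os.cpu_count() + 4) on current CPython), a reasonable default for I/O-bound work like this.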
88 changes: 88 additions & 0 deletions tests/test_data/test_dataset.py
@@ -488,3 +488,91 @@ def generator():
             "detection/segmentation",
         },
     )
+
+
+def test_clone_and_merge_dataset(tempdir: Path, bucket_storage: BucketStorage):
+    dataset1_name = "test_merge_1"
+    dataset1 = LuxonisDataset(
+        dataset1_name,
+        bucket_storage=bucket_storage,
+        delete_existing=True,
+        delete_remote=True,
+    )
+
+    def generator1():
+        for i in range(3):
+            img = create_image(i, tempdir)
+            yield {
+                "file": img,
+                "annotation": {
+                    "class": "person",
+                    "boundingbox": {"x": 0.1, "y": 0.1, "w": 0.1, "h": 0.1},
+                },
+            }
+
+    dataset1.add(generator1())
+    dataset1.make_splits({"train": 0.6, "val": 0.4})
+
+    dataset2_name = "test_merge_2"
+    dataset2 = LuxonisDataset(
+        dataset2_name,
+        bucket_storage=bucket_storage,
+        delete_existing=True,
+        delete_remote=True,
+    )
+
+    def generator2():
+        for i in range(3, 6):
+            img = create_image(i, tempdir)
+            yield {
+                "file": img,
+                "annotation": {
+                    "class": "dog",
+                    "boundingbox": {"x": 0.2, "y": 0.2, "w": 0.2, "h": 0.2},
+                },
+            }
+
+    dataset2.add(generator2())
+    dataset2.make_splits({"train": 0.6, "val": 0.4})
+
+    cloned_dataset1 = dataset1.clone(new_dataset_name=dataset1_name + "_cloned")
+
+    assert cloned_dataset1.get_splits() == dataset1.get_splits()
+    assert cloned_dataset1.get_classes() == dataset1.get_classes()
+    assert cloned_dataset1.get_task_names() == dataset1.get_task_names()
+    assert cloned_dataset1.get_skeletons() == dataset1.get_skeletons()
+
+    df_cloned = cloned_dataset1._load_df_offline()
+    df_original = dataset1._load_df_offline()
+    assert df_cloned.equals(df_original)
+
+    cloned_dataset1_merged_with_dataset2 = cloned_dataset1.merge_with(
+        dataset2, inplace=True
+    )
+
+    dataset1_merged_with_dataset2 = dataset1.merge_with(
+        dataset2,
+        inplace=False,
+        new_dataset_name=dataset1_name + "_" + dataset2_name + "_merged",
+    )
+
+    assert (
+        dataset1_merged_with_dataset2.get_splits()
+        == cloned_dataset1_merged_with_dataset2.get_splits()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_classes()
+        == cloned_dataset1_merged_with_dataset2.get_classes()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_task_names()
+        == cloned_dataset1_merged_with_dataset2.get_task_names()
+    )
+    assert (
+        dataset1_merged_with_dataset2.get_skeletons()
+        == cloned_dataset1_merged_with_dataset2.get_skeletons()
+    )
+
+    df_merged = dataset1_merged_with_dataset2._load_df_offline()
+    df_cloned_merged = cloned_dataset1_merged_with_dataset2._load_df_offline()
+    assert df_merged.equals(df_cloned_merged)
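
A note on the new test's parameters: tempdir and bucket_storage are pytest fixtures defined elsewhere in the test suite, not in this diff. One plausible shape for the bucket_storage fixture, shown purely as an assumption (including the import path), is a parametrized fixture that runs the test once per storage backend:

# Hypothetical conftest.py fixture (an assumption, not part of this commit):
# each test that requests bucket_storage runs once per backend.
import pytest

from luxonis_ml.data import BucketStorage


@pytest.fixture(
    params=[BucketStorage.LOCAL, BucketStorage.S3, BucketStorage.GCS]
)
def bucket_storage(request: pytest.FixtureRequest) -> BucketStorage:
    return request.param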
