diff --git a/.github/workflows/ci-checks.yml b/.github/workflows/ci-checks.yml
index 88b95903..d5ef0b5e 100644
--- a/.github/workflows/ci-checks.yml
+++ b/.github/workflows/ci-checks.yml
@@ -28,7 +28,7 @@ jobs:
     uses: ./check-typing.yml
     with:
       actions-ref: main
-      import-name: "litdata"
+      import-name: "lightning_data"
      artifact-name: dist-packages-${{ github.sha }}
       testing-matrix: |
         {
diff --git a/.github/workflows/ci-testing.yml b/.github/workflows/ci-testing.yml
index 69b1c965..cd0aa818 100644
--- a/.github/workflows/ci-testing.yml
+++ b/.github/workflows/ci-testing.yml
@@ -70,7 +70,7 @@ jobs:

     - name: Tests
       run: |
-        coverage run --source litdata -m pytest tests -v
+        coverage run --source lightning_data -m pytest tests -v

     - name: Statistics
       if: success()
diff --git a/MANIFEST.in b/MANIFEST.in
index 1e0ea67f..20c087c6 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -5,10 +5,10 @@ recursive-exclude __pycache__ *.py[cod] *.orig

 # Include the README and CHANGELOG
 include *.md
-recursive-include litdata *.md
+recursive-include lightning_data *.md

 # Include the code
-recursive-include litdata *.py
+recursive-include lightning_data *.py

 # Include the license file
 include LICENSE
diff --git a/Makefile b/Makefile
index cf5250aa..377799ec 100644
--- a/Makefile
+++ b/Makefile
@@ -10,7 +10,7 @@ test: clean
	pip install -q -r requirements/test.txt

	# use this to run tests
-	python -m coverage run --source litdata -m pytest src -v --flake8
+	python -m coverage run --source lightning_data -m pytest src -v --flake8
	python -m coverage report

docs: clean
diff --git a/README.md b/README.md
index fbf7923e..ee41e9bf 100644
--- a/README.md
+++ b/README.md
@@ -45,7 +45,7 @@ Convert your raw dataset into Lightning Streaming format using the `optimize` op

 ```python
 import numpy as np
-from litdata import optimize
+from lightning_data import optimize
 from PIL import Image


@@ -84,7 +84,7 @@ Here is an example with [AWS S3](https://aws.amazon.com/s3).
 ### 3. Use StreamingDataset and DataLoader

 ```python
-from litdata import StreamingDataset
+from lightning_data import StreamingDataset
 from torch.utils.data import DataLoader

 # Remote path where full dataset is persistently stored
@@ -135,7 +135,7 @@ for i in range(1000):

 ```python
 import os
-from litdata import map
+from lightning_data import map
 from PIL import Image

 input_dir = "s3://my-bucket/my_images"
@@ -174,7 +174,7 @@ We have end-to-end free [Studios](https://lightning.ai) showing all the steps to
 To scale data processing, create a free account on [lightning.ai](https://lightning.ai/) platform. With the platform, the `optimize` and `map` can start multiple machines to make data processing drastically faster as follows:

 ```python
-from litdata import optimize, Machine
+from lightning_data import optimize, Machine

 optimize(
     ...
@@ -186,7 +186,7 @@ optimize(
 OR

 ```python
-from litdata import map, Machine
+from lightning_data import map, Machine

 map(
     ...
@@ -216,8 +216,8 @@ The `StreamingDataset` and `StreamingDataLoader` takes care of everything for yo
 You can easily experiment with dataset mixtures using the CombinedStreamingDataset.

 ```python
-from litdata import StreamingDataset, CombinedStreamingDataset
-from litdata.streaming.item_loader import TokensLoader
+from lightning_data import StreamingDataset, CombinedStreamingDataset
+from lightning_data.streaming.item_loader import TokensLoader
 from tqdm import tqdm
 import os
 from torch.utils.data import DataLoader
@@ -257,7 +257,7 @@ Note: The `StreamingDataLoader` is used by [Lit-GPT](https://github.com/Lightnin
 ```python
 import os
 import torch
-from litdata import StreamingDataset, StreamingDataLoader
+from lightning_data import StreamingDataset, StreamingDataLoader

 dataset = StreamingDataset("s3://my-bucket/my-data", shuffle=True)
 dataloader = StreamingDataLoader(dataset, num_workers=os.cpu_count(), batch_size=64)
@@ -280,7 +280,7 @@ for batch_idx, batch in enumerate(dataloader):
 The `StreamingDataLoader` supports profiling your data loading. Simply use the `profile_batches` argument as follows:

 ```python
-from litdata import StreamingDataset, StreamingDataLoader
+from lightning_data import StreamingDataset, StreamingDataLoader

 StreamingDataLoader(..., profile_batches=5)
 ```
@@ -292,7 +292,7 @@ This generates a Chrome trace called `result.json`. You can visualize this trace
 Access the data you need when you need it.

 ```python
-from litdata import StreamingDataset
+from lightning_data import StreamingDataset

 dataset = StreamingDataset(...)

@@ -304,7 +304,7 @@ print(dataset[42]) # show the 42th element of the dataset
 ## ✢ Use data transforms

 ```python
-from litdata import StreamingDataset, StreamingDataLoader
+from lightning_data import StreamingDataset, StreamingDataLoader
 import torchvision.transforms.v2.functional as F

 class ImagenetStreamingDataset(StreamingDataset):
@@ -326,7 +326,7 @@ for batch in dataloader:
 Limit the size of the cache holding the chunks.

 ```python
-from litdata import StreamingDataset
+from lightning_data import StreamingDataset

 dataset = StreamingDataset(..., max_cache_size="10GB")
 ```
@@ -338,7 +338,7 @@ When processing large files like compressed [parquet files](https://en.wikipedia
 ```python
 from pathlib import Path
 import pyarrow.parquet as pq
-from litdata import optimize
+from lightning_data import optimize
 from tokenizer import Tokenizer
 from functools import partial

diff --git a/docs/source/conf.py b/docs/source/conf.py
index 46fd79f3..dc679337 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -21,7 +21,7 @@ SPHINX_MOCK_REQUIREMENTS = int(os.environ.get("SPHINX_MOCK_REQUIREMENTS", True))

 # alternative https://stackoverflow.com/a/67692/4521646
-spec = spec_from_file_location("litdata/__about__.py", os.path.join(_PATH_ROOT, "litdata", "__about__.py"))
+spec = spec_from_file_location("lightning_data/__about__.py", os.path.join(_PATH_ROOT, "lightning_data", "__about__.py"))
 about = module_from_spec(spec)
 spec.loader.exec_module(about)


@@ -316,8 +316,8 @@ def find_source():
         fname = inspect.getsourcefile(obj)
         # https://github.com/rtfd/readthedocs.org/issues/5735
         if any(s in fname for s in ("readthedocs", "rtfd", "checkouts")):
-            # /home/docs/checkouts/readthedocs.org/user_builds/litdata/checkouts/
-            # devel/litdata/utilities/cls_experiment.py#L26-L176
+            # /home/docs/checkouts/readthedocs.org/user_builds/lightning_data/checkouts/
+            # devel/lightning_data/utilities/cls_experiment.py#L26-L176
             path_top = os.path.abspath(os.path.join("..", "..", ".."))
             fname = os.path.relpath(fname, start=path_top)
         else:
@@ -380,8 +380,8 @@ def find_source():
 import os
 import torch

-import litdata
-from litdata import StreamingDataset
+import lightning_data
+from lightning_data import StreamingDataset
 """

 coverage_skip_undoc_in_source = True
diff --git a/litdata/CHANGELOG.md b/lightning_data/CHANGELOG.md
similarity index 100%
rename from litdata/CHANGELOG.md
rename to lightning_data/CHANGELOG.md
diff --git a/litdata/__about__.py b/lightning_data/__about__.py
similarity index 100%
rename from litdata/__about__.py
rename to lightning_data/__about__.py
diff --git a/litdata/__init__.py b/lightning_data/__init__.py
similarity index 58%
rename from litdata/__init__.py
rename to lightning_data/__init__.py
index 79a22fda..0671e414 100644
--- a/litdata/__init__.py
+++ b/lightning_data/__init__.py
@@ -1,9 +1,9 @@
 from lightning_utilities.core.imports import RequirementCache

-from litdata.processing.functions import map, optimize, walk
-from litdata.streaming.combined import CombinedStreamingDataset
-from litdata.streaming.dataloader import StreamingDataLoader
-from litdata.streaming.dataset import StreamingDataset
+from lightning_data.processing.functions import map, optimize, walk
+from lightning_data.streaming.combined import CombinedStreamingDataset
+from lightning_data.streaming.dataloader import StreamingDataLoader
+from lightning_data.streaming.dataset import StreamingDataset

 __all__ = [
     "LightningDataset",
diff --git a/litdata/constants.py b/lightning_data/constants.py
similarity index 100%
rename from litdata/constants.py
rename to lightning_data/constants.py
diff --git a/litdata/processing/__init__.py b/lightning_data/processing/__init__.py
similarity index 100%
rename from litdata/processing/__init__.py
rename to lightning_data/processing/__init__.py
diff --git a/litdata/processing/data_processor.py b/lightning_data/processing/data_processor.py
similarity index 98%
rename from litdata/processing/data_processor.py
rename to lightning_data/processing/data_processor.py
index 6a2eb30e..fdaf83a9 100644
--- a/litdata/processing/data_processor.py
+++ b/lightning_data/processing/data_processor.py
@@ -20,7 +20,7 @@
 from lightning import seed_everything
 from tqdm.auto import tqdm as _tqdm

-from litdata.constants import (
+from lightning_data.constants import (
     _BOTO3_AVAILABLE,
     _DEFAULT_FAST_DEV_RUN_ITEMS,
     _INDEX_FILENAME,
@@ -28,13 +28,13 @@
     _LIGHTNING_CLOUD_LATEST,
     _TORCH_GREATER_EQUAL_2_1_0,
 )
-from litdata.processing.readers import BaseReader
-from litdata.streaming import Cache
-from litdata.streaming.cache import Dir
-from litdata.streaming.client import S3Client
-from litdata.streaming.resolver import _resolve_dir
-from litdata.utilities.broadcast import broadcast_object
-from litdata.utilities.packing import _pack_greedily
+from lightning_data.processing.readers import BaseReader
+from lightning_data.streaming import Cache
+from lightning_data.streaming.cache import Dir
+from lightning_data.streaming.client import S3Client
+from lightning_data.streaming.resolver import _resolve_dir
+from lightning_data.utilities.broadcast import broadcast_object
+from lightning_data.utilities.packing import _pack_greedily

 if _TORCH_GREATER_EQUAL_2_1_0:
     from torch.utils._pytree import tree_flatten, tree_unflatten, treespec_loads
diff --git a/litdata/processing/functions.py b/lightning_data/processing/functions.py
similarity index 97%
rename from litdata/processing/functions.py
rename to lightning_data/processing/functions.py
index 8b0e35a1..9b65eee3 100644
--- a/litdata/processing/functions.py
+++ b/lightning_data/processing/functions.py
@@ -22,11 +22,11 @@

 import torch

-from litdata.constants import _IS_IN_STUDIO, _TORCH_GREATER_EQUAL_2_1_0
-from litdata.processing.data_processor import DataChunkRecipe, DataProcessor, DataTransformRecipe
-from litdata.processing.readers import BaseReader
-from litdata.processing.utilities import optimize_dns_context
-from litdata.streaming.resolver import (
+from lightning_data.constants import _IS_IN_STUDIO, _TORCH_GREATER_EQUAL_2_1_0
+from lightning_data.processing.data_processor import DataChunkRecipe, DataProcessor, DataTransformRecipe
+from lightning_data.processing.readers import BaseReader
+from lightning_data.processing.utilities import optimize_dns_context
+from lightning_data.streaming.resolver import (
     Dir,
     _assert_dir_has_index_file,
     _assert_dir_is_empty,
diff --git a/litdata/processing/readers.py b/lightning_data/processing/readers.py
similarity index 100%
rename from litdata/processing/readers.py
rename to lightning_data/processing/readers.py
diff --git a/litdata/processing/utilities.py b/lightning_data/processing/utilities.py
similarity index 94%
rename from litdata/processing/utilities.py
rename to lightning_data/processing/utilities.py
index f8fc84ec..9e160839 100644
--- a/litdata/processing/utilities.py
+++ b/lightning_data/processing/utilities.py
@@ -5,7 +5,7 @@
 from subprocess import Popen  # noqa: S404
 from typing import Any, Callable, Optional, Tuple

-from litdata.constants import _IS_IN_STUDIO
+from lightning_data.constants import _IS_IN_STUDIO


 def get_worker_rank() -> Optional[str]:
@@ -66,7 +66,7 @@ def optimize_dns(enable: bool) -> None:
     ):
         cmd = (
             f"sudo /home/zeus/miniconda3/envs/cloudspace/bin/python"
-            f" -c 'from litdata.processing.utilities import _optimize_dns; _optimize_dns({enable})'"
+            f" -c 'from lightning_data.processing.utilities import _optimize_dns; _optimize_dns({enable})'"
         )
         Popen(cmd, shell=True).wait()  # E501

diff --git a/litdata/streaming/__init__.py b/lightning_data/streaming/__init__.py
similarity index 69%
rename from litdata/streaming/__init__.py
rename to lightning_data/streaming/__init__.py
index d245c37b..8c79bb98 100644
--- a/litdata/streaming/__init__.py
+++ b/lightning_data/streaming/__init__.py
@@ -11,11 +11,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from litdata.streaming.cache import Cache
-from litdata.streaming.combined import CombinedStreamingDataset
-from litdata.streaming.dataloader import StreamingDataLoader
-from litdata.streaming.dataset import StreamingDataset
-from litdata.streaming.item_loader import TokensLoader
+from lightning_data.streaming.cache import Cache
+from lightning_data.streaming.combined import CombinedStreamingDataset
+from lightning_data.streaming.dataloader import StreamingDataLoader
+from lightning_data.streaming.dataset import StreamingDataset
+from lightning_data.streaming.item_loader import TokensLoader

 __all__ = [
     "Cache",
diff --git a/litdata/streaming/cache.py b/lightning_data/streaming/cache.py
similarity index 91%
rename from litdata/streaming/cache.py
rename to lightning_data/streaming/cache.py
index 18d78e31..105a9a4d 100644
--- a/litdata/streaming/cache.py
+++ b/lightning_data/streaming/cache.py
@@ -15,19 +15,19 @@
 import os
 from typing import Any, Dict, List, Optional, Tuple, Union

-from litdata.constants import (
+from lightning_data.constants import (
     _INDEX_FILENAME,
     _LIGHTNING_CLOUD_LATEST,
     _TORCH_GREATER_EQUAL_2_1_0,
 )
-from litdata.streaming.item_loader import BaseItemLoader
-from litdata.streaming.reader import BinaryReader
-from litdata.streaming.resolver import Dir, _resolve_dir
-from litdata.streaming.sampler import ChunkedIndex
-from litdata.streaming.serializers import Serializer
-from litdata.streaming.writer import BinaryWriter
-from litdata.utilities.env import _DistributedEnv, _WorkerEnv
-from litdata.utilities.format import _convert_bytes_to_int
+from lightning_data.streaming.item_loader import BaseItemLoader
+from lightning_data.streaming.reader import BinaryReader
+from lightning_data.streaming.resolver import Dir, _resolve_dir
+from lightning_data.streaming.sampler import ChunkedIndex
+from lightning_data.streaming.serializers import Serializer
+from lightning_data.streaming.writer import BinaryWriter
+from lightning_data.utilities.env import _DistributedEnv, _WorkerEnv
+from lightning_data.utilities.format import _convert_bytes_to_int


 logger = logging.Logger(__name__)
diff --git a/litdata/streaming/client.py b/lightning_data/streaming/client.py
similarity index 97%
rename from litdata/streaming/client.py
rename to lightning_data/streaming/client.py
index 0a12616d..6bf52986 100644
--- a/litdata/streaming/client.py
+++ b/lightning_data/streaming/client.py
@@ -2,7 +2,7 @@
 from time import time
 from typing import Any, Optional

-from litdata.constants import _BOTO3_AVAILABLE
+from lightning_data.constants import _BOTO3_AVAILABLE

 if _BOTO3_AVAILABLE:
     import boto3
diff --git a/litdata/streaming/combined.py b/lightning_data/streaming/combined.py
similarity index 98%
rename from litdata/streaming/combined.py
rename to lightning_data/streaming/combined.py
index 3209c263..721e8398 100644
--- a/litdata/streaming/combined.py
+++ b/lightning_data/streaming/combined.py
@@ -16,8 +16,8 @@

 from torch.utils.data import IterableDataset

-from litdata.streaming.dataset import StreamingDataset
-from litdata.utilities.env import _WorkerEnv
+from lightning_data.streaming.dataset import StreamingDataset
+from lightning_data.utilities.env import _WorkerEnv

 __NUM_SAMPLES_YIELDED_KEY__ = "__NUM_SAMPLES_YIELDED__"
 __SAMPLES_KEY__ = "__SAMPLES__"
diff --git a/litdata/streaming/compression.py b/lightning_data/streaming/compression.py
similarity index 100%
rename from litdata/streaming/compression.py
rename to lightning_data/streaming/compression.py
diff --git a/litdata/streaming/config.py b/lightning_data/streaming/config.py
similarity index 94%
rename from litdata/streaming/config.py
rename to lightning_data/streaming/config.py
index befb3019..4d011237 100644
--- a/litdata/streaming/config.py
+++ b/lightning_data/streaming/config.py
@@ -15,11 +15,11 @@
 import os
 from typing import Any, Dict, List, Optional, Tuple

-from litdata.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
-from litdata.streaming.downloader import get_downloader_cls
-from litdata.streaming.item_loader import BaseItemLoader, PyTreeLoader, TokensLoader
-from litdata.streaming.sampler import ChunkedIndex
-from litdata.streaming.serializers import Serializer
+from lightning_data.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
+from lightning_data.streaming.downloader import get_downloader_cls
+from lightning_data.streaming.item_loader import BaseItemLoader, PyTreeLoader, TokensLoader
+from lightning_data.streaming.sampler import ChunkedIndex
+from lightning_data.streaming.serializers import Serializer

 if _TORCH_GREATER_EQUAL_2_1_0:
     from torch.utils._pytree import tree_unflatten, treespec_loads
diff --git a/litdata/streaming/dataloader.py b/lightning_data/streaming/dataloader.py
similarity index 97%
rename from litdata/streaming/dataloader.py
rename to lightning_data/streaming/dataloader.py
index 72c360d1..ab313c39 100644
--- a/litdata/streaming/dataloader.py
+++ b/lightning_data/streaming/dataloader.py
@@ -33,16 +33,16 @@
 )
 from torch.utils.data.sampler import BatchSampler, Sampler

-from litdata.constants import _DEFAULT_CHUNK_BYTES, _TORCH_GREATER_EQUAL_2_1_0, _VIZ_TRACKER_AVAILABLE
-from litdata.streaming import Cache
-from litdata.streaming.combined import (
+from lightning_data.constants import _DEFAULT_CHUNK_BYTES, _TORCH_GREATER_EQUAL_2_1_0, _VIZ_TRACKER_AVAILABLE
+from lightning_data.streaming import Cache
+from lightning_data.streaming.combined import (
     __NUM_SAMPLES_YIELDED_KEY__,
     __SAMPLES_KEY__,
     CombinedStreamingDataset,
 )
-from litdata.streaming.dataset import StreamingDataset
-from litdata.streaming.sampler import CacheBatchSampler
-from litdata.utilities.env import _DistributedEnv
+from lightning_data.streaming.dataset import StreamingDataset
+from lightning_data.streaming.sampler import CacheBatchSampler
+from lightning_data.utilities.env import _DistributedEnv

 if _TORCH_GREATER_EQUAL_2_1_0:
     from torch.utils._pytree import tree_flatten
@@ -105,7 +105,7 @@ def __getitem__(self, index: int) -> Any:
         if not _equal_items(data_1, data2):
             raise ValueError(
                 f"Your dataset items aren't deterministic. Found {data_1} and {data2} for index {index}."
-                " HINT: Use the `litdata.cache.Cache` directly within your dataset."
+                " HINT: Use the `lightning_data.cache.Cache` directly within your dataset."
             )
         self._is_deterministic = True
         self._cache[index] = data_1
@@ -180,7 +180,7 @@ def __call__(
     ) -> None:
         from torch.utils.data._utils import worker

-        from litdata.streaming.cache import Cache
+        from lightning_data.streaming.cache import Cache

         enable_profiling = self._global_rank == 0 and worker_id == 0 and _VIZ_TRACKER_AVAILABLE and self._profile

@@ -481,7 +481,7 @@ def _try_put_index(self) -> None:
 class StreamingDataLoader(DataLoader):
     r"""The StreamingDataLoader combines a dataset and a sampler, and provides an iterable over the given dataset.

-    The :class:`~litdata.streaming.dataloader.StreamingDataLoader` supports either a
+    The :class:`~lightning_data.streaming.dataloader.StreamingDataLoader` supports either a
     StreamingDataset and CombinedStreamingDataset datasets with single- or multi-process loading, customizing
     loading order and optional automatic batching (collation) and memory pinning.

diff --git a/litdata/streaming/dataset.py b/lightning_data/streaming/dataset.py
similarity index 97%
rename from litdata/streaming/dataset.py
rename to lightning_data/streaming/dataset.py
index 08744603..4f9dcfc5 100644
--- a/litdata/streaming/dataset.py
+++ b/lightning_data/streaming/dataset.py
@@ -20,17 +20,17 @@
 import numpy as np
 from torch.utils.data import IterableDataset

-from litdata.constants import (
+from lightning_data.constants import (
     _DEFAULT_CACHE_DIR,
     _INDEX_FILENAME,
 )
-from litdata.streaming import Cache
-from litdata.streaming.item_loader import BaseItemLoader
-from litdata.streaming.resolver import Dir, _resolve_dir
-from litdata.streaming.sampler import ChunkedIndex
-from litdata.streaming.serializers import Serializer
-from litdata.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
-from litdata.utilities.env import _DistributedEnv, _is_in_dataloader_worker, _WorkerEnv
+from lightning_data.streaming import Cache
+from lightning_data.streaming.item_loader import BaseItemLoader
+from lightning_data.streaming.resolver import Dir, _resolve_dir
+from lightning_data.streaming.sampler import ChunkedIndex
+from lightning_data.streaming.serializers import Serializer
+from lightning_data.streaming.shuffle import FullShuffle, NoShuffle, Shuffle
+from lightning_data.utilities.env import _DistributedEnv, _is_in_dataloader_worker, _WorkerEnv


 logger = Logger(__name__)
diff --git a/litdata/streaming/downloader.py b/lightning_data/streaming/downloader.py
similarity index 97%
rename from litdata/streaming/downloader.py
rename to lightning_data/streaming/downloader.py
index ea3a3571..288fc51a 100644
--- a/litdata/streaming/downloader.py
+++ b/lightning_data/streaming/downloader.py
@@ -19,8 +19,8 @@

 from filelock import FileLock, Timeout

-from litdata.constants import _INDEX_FILENAME
-from litdata.streaming.client import S3Client
+from lightning_data.constants import _INDEX_FILENAME
+from lightning_data.streaming.client import S3Client


 class Downloader(ABC):
diff --git a/litdata/streaming/item_loader.py b/lightning_data/streaming/item_loader.py
similarity index 98%
rename from litdata/streaming/item_loader.py
rename to lightning_data/streaming/item_loader.py
index b578e3bf..7a3f694d 100644
--- a/litdata/streaming/item_loader.py
+++ b/lightning_data/streaming/item_loader.py
@@ -19,11 +19,11 @@
 import numpy as np
 import torch

-from litdata.constants import (
+from lightning_data.constants import (
     _TORCH_DTYPES_MAPPING,
     _TORCH_GREATER_EQUAL_2_1_0,
 )
-from litdata.streaming.serializers import Serializer
+from lightning_data.streaming.serializers import Serializer

 if _TORCH_GREATER_EQUAL_2_1_0:
     from torch.utils._pytree import PyTree, tree_unflatten
diff --git a/litdata/streaming/reader.py b/lightning_data/streaming/reader.py
similarity index 96%
rename from litdata/streaming/reader.py
rename to lightning_data/streaming/reader.py
index ad63175c..5a2fe760 100644
--- a/litdata/streaming/reader.py
+++ b/lightning_data/streaming/reader.py
@@ -20,12 +20,12 @@
 from threading import Thread
 from typing import Any, Dict, List, Optional, Tuple, Union

-from litdata.constants import _TORCH_GREATER_EQUAL_2_1_0
-from litdata.streaming.config import ChunksConfig
-from litdata.streaming.item_loader import BaseItemLoader, PyTreeLoader
-from litdata.streaming.sampler import ChunkedIndex
-from litdata.streaming.serializers import Serializer, _get_serializers
-from litdata.utilities.env import _DistributedEnv, _WorkerEnv
+from lightning_data.constants import _TORCH_GREATER_EQUAL_2_1_0
+from lightning_data.streaming.config import ChunksConfig
+from lightning_data.streaming.item_loader import BaseItemLoader, PyTreeLoader
+from lightning_data.streaming.sampler import ChunkedIndex
+from lightning_data.streaming.serializers import Serializer, _get_serializers
+from lightning_data.utilities.env import _DistributedEnv, _WorkerEnv


 warnings.filterwarnings("ignore", message=".*The given buffer is not writable.*")
diff --git a/litdata/streaming/resolver.py b/lightning_data/streaming/resolver.py
similarity index 100%
rename from litdata/streaming/resolver.py
rename to lightning_data/streaming/resolver.py
diff --git a/litdata/streaming/sampler.py b/lightning_data/streaming/sampler.py
similarity index 100%
rename from litdata/streaming/sampler.py
rename to lightning_data/streaming/sampler.py
diff --git a/litdata/streaming/serializers.py b/lightning_data/streaming/serializers.py
similarity index 99%
rename from litdata/streaming/serializers.py
rename to lightning_data/streaming/serializers.py
index 700251d8..2170fba5 100644
--- a/litdata/streaming/serializers.py
+++ b/lightning_data/streaming/serializers.py
@@ -23,7 +23,7 @@
 import torch
 from lightning_utilities.core.imports import RequirementCache

-from litdata.constants import _NUMPY_DTYPES_MAPPING, _TORCH_DTYPES_MAPPING
+from lightning_data.constants import _NUMPY_DTYPES_MAPPING, _TORCH_DTYPES_MAPPING

 _PIL_AVAILABLE = RequirementCache("PIL")
 _TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
diff --git a/litdata/streaming/shuffle.py b/lightning_data/streaming/shuffle.py
similarity index 96%
rename from litdata/streaming/shuffle.py
rename to lightning_data/streaming/shuffle.py
index c8013ef5..cbe2500e 100644
--- a/litdata/streaming/shuffle.py
+++ b/lightning_data/streaming/shuffle.py
@@ -17,9 +17,9 @@

 import numpy as np

-from litdata.streaming import Cache
-from litdata.utilities.env import _DistributedEnv
-from litdata.utilities.shuffle import _associate_chunks_and_internals_to_ranks, _intra_node_chunk_shuffle
+from lightning_data.streaming import Cache
+from lightning_data.utilities.env import _DistributedEnv
+from lightning_data.utilities.shuffle import _associate_chunks_and_internals_to_ranks, _intra_node_chunk_shuffle


 class Shuffle(ABC):
diff --git a/litdata/streaming/writer.py b/lightning_data/streaming/writer.py
similarity index 97%
rename from litdata/streaming/writer.py
rename to lightning_data/streaming/writer.py
index 7586ed03..970fcbc7 100644
--- a/litdata/streaming/writer.py
+++ b/lightning_data/streaming/writer.py
@@ -21,12 +21,12 @@
 import numpy as np
 import torch

-from litdata.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
-from litdata.processing.utilities import get_worker_rank
-from litdata.streaming.compression import _COMPRESSORS, Compressor
-from litdata.streaming.serializers import Serializer, _get_serializers
-from litdata.utilities.env import _DistributedEnv, _WorkerEnv
-from litdata.utilities.format import _convert_bytes_to_int, _human_readable_bytes
+from lightning_data.constants import _INDEX_FILENAME, _TORCH_GREATER_EQUAL_2_1_0
+from lightning_data.processing.utilities import get_worker_rank
+from lightning_data.streaming.compression import _COMPRESSORS, Compressor
+from lightning_data.streaming.serializers import Serializer, _get_serializers
+from lightning_data.utilities.env import _DistributedEnv, _WorkerEnv
+from lightning_data.utilities.format import _convert_bytes_to_int, _human_readable_bytes

 if _TORCH_GREATER_EQUAL_2_1_0:
     from torch.utils._pytree import PyTree, tree_flatten, treespec_dumps
diff --git a/litdata/utilities/__init__.py b/lightning_data/utilities/__init__.py
similarity index 100%
rename from litdata/utilities/__init__.py
rename to lightning_data/utilities/__init__.py
diff --git a/litdata/utilities/broadcast.py b/lightning_data/utilities/broadcast.py
similarity index 100%
rename from litdata/utilities/broadcast.py
rename to lightning_data/utilities/broadcast.py
diff --git a/litdata/utilities/env.py b/lightning_data/utilities/env.py
similarity index 100%
rename from litdata/utilities/env.py
rename to lightning_data/utilities/env.py
diff --git a/litdata/utilities/format.py b/lightning_data/utilities/format.py
similarity index 100%
rename from litdata/utilities/format.py
rename to lightning_data/utilities/format.py
diff --git a/litdata/utilities/packing.py b/lightning_data/utilities/packing.py
similarity index 100%
rename from litdata/utilities/packing.py
rename to lightning_data/utilities/packing.py
diff --git a/litdata/utilities/shuffle.py b/lightning_data/utilities/shuffle.py
similarity index 98%
rename from litdata/utilities/shuffle.py
rename to lightning_data/utilities/shuffle.py
index 8b24a6de..430ac21e 100644
--- a/litdata/utilities/shuffle.py
+++ b/lightning_data/utilities/shuffle.py
@@ -2,7 +2,7 @@

 import numpy as np

-from litdata.utilities.env import _DistributedEnv
+from lightning_data.utilities.env import _DistributedEnv


 def _intra_node_chunk_shuffle(
diff --git a/pyproject.toml b/pyproject.toml
index 70892e85..c24a810d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@
 # limitations under the License.

 [metadata]
-name = "litdata"
+name = "lightning_data"
 author = "Lightning-AI et al."
 url = "https://github.com/Lightning-AI/lit-data"

@@ -87,7 +87,7 @@ lint.ignore-init-module-imports = true
     "S501", # Probable use of `requests` call with `verify=False` disabling SSL certificate checks
     "S108", # Probable insecure usage of temporary file or directory: "/tmp/data/MNIST"
 ]
-"litdata/**" = [
+"lightning_data/**" = [
     "S101", # todo: Use of `assert` detected
     "S105", "S106", "S107", # todo: Possible hardcoded password: ...
     "S113", # todo: Probable use of requests call without timeout
diff --git a/setup.py b/setup.py
index 3b5eaad2..e63bf7ee 100644
--- a/setup.py
+++ b/setup.py
@@ -11,7 +11,7 @@
 _PATH_REQUIRES = os.path.join(_PATH_ROOT, "requirements")


-def _load_py_module(fname, pkg="litdata"):
+def _load_py_module(fname, pkg="lightning_data"):
     spec = spec_from_file_location(os.path.join(pkg, fname), os.path.join(_PATH_ROOT, pkg, fname))
     py = module_from_spec(spec)
     spec.loader.exec_module(py)
@@ -51,7 +51,7 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple =
 # the goal of the project is simplicity for researchers, don't want to add too much
 # engineer specific practices
 setup(
-    name="litdata",
+    name="lightning_data",
     version=about.__version__,
     description=about.__docs__,
     author=about.__author__,
@@ -59,8 +59,8 @@ def _prepare_extras(requirements_dir: str = _PATH_REQUIRES, skip_files: tuple =
     url=about.__homepage__,
     download_url="https://github.com/Lightning-AI/lit-data",
     license=about.__license__,
-    packages=find_packages(where="litdata"),
-    package_dir={"": "litdata"},
+    packages=find_packages(where="lightning_data"),
+    package_dir={"": "lightning_data"},
     long_description=readme,
     long_description_content_type="text/markdown",
     include_package_data=True,
diff --git a/tests/processing/test_data_processor.py b/tests/processing/test_data_processor.py
index 950f70c1..6fb500b0 100644
--- a/tests/processing/test_data_processor.py
+++ b/tests/processing/test_data_processor.py
@@ -12,9 +12,9 @@
 from lightning import seed_everything
 from lightning_utilities.core.imports import RequirementCache

-from litdata.processing import data_processor as data_processor_module
-from litdata.processing import functions
-from litdata.processing.data_processor import (
+from lightning_data.processing import data_processor as data_processor_module
+from lightning_data.processing import functions
+from lightning_data.processing.data_processor import (
     DataChunkRecipe,
     DataProcessor,
     DataTransformRecipe,
@@ -27,9 +27,9 @@
     _wait_for_disk_usage_higher_than_threshold,
     _wait_for_file_to_exist,
 )
-from litdata.processing.functions import LambdaDataTransformRecipe, map, optimize
-from litdata.streaming import resolver
-from litdata.streaming.cache import Cache, Dir
+from lightning_data.processing.functions import LambdaDataTransformRecipe, map, optimize
+from lightning_data.streaming import resolver
+from lightning_data.streaming.cache import Cache, Dir

 _PIL_AVAILABLE = RequirementCache("PIL")

@@ -164,7 +164,7 @@ def fn(*_, **__):


 @pytest.mark.skipif(condition=sys.platform == "win32", reason="Not supported on windows")
-@mock.patch("litdata.processing.data_processor._wait_for_disk_usage_higher_than_threshold")
+@mock.patch("lightning_data.processing.data_processor._wait_for_disk_usage_higher_than_threshold")
 def test_download_data_target(wait_for_disk_usage_higher_than_threshold_mock, tmpdir):
     input_dir = os.path.join(tmpdir, "input_dir")
     os.makedirs(input_dir, exist_ok=True)
@@ -203,7 +203,7 @@ def fn(*_, **__):

 def test_wait_for_disk_usage_higher_than_threshold():
     disk_usage_mock = mock.Mock(side_effect=[mock.Mock(free=10e9), mock.Mock(free=10e9), mock.Mock(free=10e11)])
-    with mock.patch("litdata.processing.data_processor.shutil.disk_usage", disk_usage_mock):
+    with mock.patch("lightning_data.processing.data_processor.shutil.disk_usage", disk_usage_mock):
         _wait_for_disk_usage_higher_than_threshold("/", 10, sleep_time=0)
     assert disk_usage_mock.call_count == 3

diff --git a/tests/processing/test_functions.py b/tests/processing/test_functions.py
index a939c0e9..7847b96a 100644
--- a/tests/processing/test_functions.py
+++ b/tests/processing/test_functions.py
@@ -4,8 +4,8 @@

 import pytest

-from litdata import walk
-from litdata.processing.functions import _get_input_dir
+from lightning_data import walk
+from lightning_data.processing.functions import _get_input_dir


 @pytest.mark.skipif(sys.platform == "win32", reason="currently not supported for windows.")
diff --git a/tests/processing/test_readers.py b/tests/processing/test_readers.py
index ff4b33ee..fdf9396f 100644
--- a/tests/processing/test_readers.py
+++ b/tests/processing/test_readers.py
@@ -3,8 +3,8 @@

 import pytest

-from litdata import map
-from litdata.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader
+from lightning_data import map
+from lightning_data.processing.readers import _PYARROW_AVAILABLE, BaseReader, ParquetReader


 class DummyReader(BaseReader):
diff --git a/tests/processing/test_utilities.py b/tests/processing/test_utilities.py
index 436e6063..596b80b8 100644
--- a/tests/processing/test_utilities.py
+++ b/tests/processing/test_utilities.py
@@ -1,7 +1,7 @@
 from unittest.mock import MagicMock

-from litdata.processing import utilities as utilities_module
-from litdata.processing.utilities import optimize_dns_context
+from lightning_data.processing import utilities as utilities_module
+from lightning_data.processing.utilities import optimize_dns_context


 def test_optimize_dns_context(monkeypatch):
@@ -31,6 +31,6 @@ def readlines(self):
     cmd = popen_mock._mock_call_args_list[0].args[0]
     expected_cmd = (
         "sudo /home/zeus/miniconda3/envs/cloudspace/bin/python"
-        " -c 'from litdata.processing.utilities import _optimize_dns; _optimize_dns(True)'"
+        " -c 'from lightning_data.processing.utilities import _optimize_dns; _optimize_dns(True)'"
     )
     assert cmd == expected_cmd
diff --git a/tests/streaming/test_cache.py b/tests/streaming/test_cache.py
index c08c0a78..b2326353 100644
--- a/tests/streaming/test_cache.py
+++ b/tests/streaming/test_cache.py
@@ -25,12 +25,12 @@
 from lightning_utilities.test.warning import no_warning_call
 from torch.utils.data import Dataset

-from litdata.streaming import Cache
-from litdata.streaming.dataloader import CacheDataLoader
-from litdata.streaming.dataset import StreamingDataset
-from litdata.streaming.item_loader import TokensLoader
-from litdata.streaming.serializers import Serializer
-from litdata.utilities.env import _DistributedEnv
+from lightning_data.streaming import Cache
+from lightning_data.streaming.dataloader import CacheDataLoader
+from lightning_data.streaming.dataset import StreamingDataset
+from lightning_data.streaming.item_loader import TokensLoader
+from lightning_data.streaming.serializers import Serializer
+from lightning_data.utilities.env import _DistributedEnv

 _PIL_AVAILABLE = RequirementCache("PIL")
 _TORCH_VISION_AVAILABLE = RequirementCache("torchvision")
diff --git a/tests/streaming/test_client.py b/tests/streaming/test_client.py
index ca933604..260f1b92 100644
--- a/tests/streaming/test_client.py
+++ b/tests/streaming/test_client.py
@@ -4,7 +4,7 @@

 import pytest

-from litdata.streaming import client
+from lightning_data.streaming import client


 def test_s3_client_without_cloud_space_id(monkeypatch):
diff --git a/tests/streaming/test_combined.py b/tests/streaming/test_combined.py
index 40db3bf6..029c6b94 100644
--- a/tests/streaming/test_combined.py
+++ b/tests/streaming/test_combined.py
@@ -7,10 +7,10 @@
 from torch.utils.data import IterableDataset
 from torch.utils.data.dataloader import DataLoader

-from litdata.streaming.cache import Cache
-from litdata.streaming.combined import CombinedStreamingDataset
-from litdata.streaming.dataloader import StreamingDataLoader
-from litdata.streaming.dataset import Dir, StreamingDataset
+from lightning_data.streaming.cache import Cache
+from lightning_data.streaming.combined import CombinedStreamingDataset
+from lightning_data.streaming.dataloader import StreamingDataLoader
+from lightning_data.streaming.dataset import Dir, StreamingDataset


 class TestCombinedStreamingDataset(CombinedStreamingDataset):
diff --git a/tests/streaming/test_dataloader.py b/tests/streaming/test_dataloader.py
index e5bbd17e..b0aed53f 100644
--- a/tests/streaming/test_dataloader.py
+++ b/tests/streaming/test_dataloader.py
@@ -4,8 +4,8 @@
 import torch
 from torch import tensor

-from litdata.streaming import CombinedStreamingDataset, StreamingDataLoader
-from litdata.streaming import dataloader as streaming_dataloader_module
+from lightning_data.streaming import CombinedStreamingDataset, StreamingDataLoader
+from lightning_data.streaming import dataloader as streaming_dataloader_module


 class TestStatefulDataset:
diff --git a/tests/streaming/test_dataset.py b/tests/streaming/test_dataset.py
index b8d2bda0..a7f9607b 100644
--- a/tests/streaming/test_dataset.py
+++ b/tests/streaming/test_dataset.py
@@ -22,11 +22,11 @@
 from lightning import seed_everything
 from torch.utils.data import DataLoader

-from litdata.processing import functions
-from litdata.streaming import Cache
-from litdata.streaming import dataset as dataset_module
-from litdata.streaming.dataloader import StreamingDataLoader
-from litdata.streaming.dataset import (
+from lightning_data.processing import functions
+from lightning_data.streaming import Cache
+from lightning_data.streaming import dataset as dataset_module
+from lightning_data.streaming.dataloader import StreamingDataLoader
+from lightning_data.streaming.dataset import (
     _INDEX_FILENAME,
     Dir,
     StreamingDataset,
@@ -36,9 +36,9 @@
     _should_replace_path,
     _try_create_cache_dir,
 )
-from litdata.streaming.item_loader import TokensLoader
-from litdata.streaming.shuffle import FullShuffle, NoShuffle
-from litdata.utilities.env import _DistributedEnv, _WorkerEnv
+from lightning_data.streaming.item_loader import TokensLoader
+from lightning_data.streaming.shuffle import FullShuffle, NoShuffle
+from lightning_data.utilities.env import _DistributedEnv, _WorkerEnv


 def test_streaming_dataset(tmpdir, monkeypatch):
@@ -392,7 +392,7 @@ def test_try_create_cache_dir():
     # the cache dir creating at /cache requires root privileges, so we need to mock `os.makedirs()`
     with (
         mock.patch.dict("os.environ", {"LIGHTNING_CLUSTER_ID": "abc", "LIGHTNING_CLOUD_PROJECT_ID": "123"}),
-        mock.patch("litdata.streaming.dataset.os.makedirs") as makedirs_mock,
+        mock.patch("lightning_data.streaming.dataset.os.makedirs") as makedirs_mock,
     ):
         cache_dir_1 = _try_create_cache_dir("")
         cache_dir_2 = _try_create_cache_dir("ssdf")
diff --git a/tests/streaming/test_downloader.py b/tests/streaming/test_downloader.py
index 218d1bda..4f26bfbf 100644
--- a/tests/streaming/test_downloader.py
+++ b/tests/streaming/test_downloader.py
@@ -1,7 +1,7 @@
 import os
 from unittest.mock import MagicMock

-from litdata.streaming.downloader import S3Downloader, subprocess
+from lightning_data.streaming.downloader import S3Downloader, subprocess


 def test_s3_downloader_fast(tmpdir, monkeypatch):
diff --git a/tests/streaming/test_reader.py b/tests/streaming/test_reader.py
index 8fb4d542..8ab18ff9 100644
--- a/tests/streaming/test_reader.py
+++ b/tests/streaming/test_reader.py
@@ -4,13 +4,13 @@

 import numpy as np

-from litdata.streaming import reader
-from litdata.streaming.cache import Cache
-from litdata.streaming.config import ChunkedIndex
-from litdata.streaming.item_loader import PyTreeLoader
-from litdata.streaming.reader import _END_TOKEN, PrepareChunksThread, _get_folder_size
-from litdata.streaming.resolver import Dir
-from litdata.utilities.env import _DistributedEnv
+from lightning_data.streaming import reader
+from lightning_data.streaming.cache import Cache
+from lightning_data.streaming.config import ChunkedIndex
+from lightning_data.streaming.item_loader import PyTreeLoader
+from lightning_data.streaming.reader import _END_TOKEN, PrepareChunksThread, _get_folder_size
+from lightning_data.streaming.resolver import Dir
+from lightning_data.utilities.env import _DistributedEnv


 def test_reader_chunk_removal(tmpdir):
diff --git a/tests/streaming/test_resolver.py b/tests/streaming/test_resolver.py
index b4c7e177..56b1e872 100644
--- a/tests/streaming/test_resolver.py
+++ b/tests/streaming/test_resolver.py
@@ -16,7 +16,7 @@
     V1ListDataConnectionsResponse,
 )

-from litdata.streaming import resolver
+from lightning_data.streaming import resolver


 @pytest.mark.skipif(sys.platform == "win32", reason="windows isn't supported")
diff --git a/tests/streaming/test_sampler.py b/tests/streaming/test_sampler.py
index 1879771b..a78baf04 100644
--- a/tests/streaming/test_sampler.py
+++ b/tests/streaming/test_sampler.py
@@ -3,7 +3,7 @@
 import pytest
 from lightning import seed_everything

-from litdata.streaming.sampler import CacheBatchSampler
+from lightning_data.streaming.sampler import CacheBatchSampler


 @pytest.mark.parametrize(
diff --git a/tests/streaming/test_serializer.py b/tests/streaming/test_serializer.py
index bd5fb002..54db39b9 100644
--- a/tests/streaming/test_serializer.py
+++ b/tests/streaming/test_serializer.py
@@ -22,7 +22,7 @@
 from lightning import seed_everything
 from lightning_utilities.core.imports import RequirementCache

-from litdata.streaming.serializers import (
+from lightning_data.streaming.serializers import (
     _AV_AVAILABLE,
     _NUMPY_DTYPES_MAPPING,
     _SERIALIZERS,
diff --git a/tests/streaming/test_writer.py b/tests/streaming/test_writer.py
index 4a25d678..49bdc6c5 100644
--- a/tests/streaming/test_writer.py
+++ b/tests/streaming/test_writer.py
@@ -20,11 +20,11 @@
 from lightning import seed_everything
 from lightning_utilities.core.imports import RequirementCache

-from litdata.streaming.compression import _ZSTD_AVAILABLE
-from litdata.streaming.reader import BinaryReader
-from litdata.streaming.sampler import ChunkedIndex
-from litdata.streaming.writer import BinaryWriter
-from litdata.utilities.format import _FORMAT_TO_RATIO
+from lightning_data.streaming.compression import _ZSTD_AVAILABLE
+from lightning_data.streaming.reader import BinaryReader
+from lightning_data.streaming.sampler import ChunkedIndex
+from lightning_data.streaming.writer import BinaryWriter
+from lightning_data.utilities.format import _FORMAT_TO_RATIO


 _PIL_AVAILABLE = RequirementCache("PIL")
diff --git a/tests/utilities/test_broadcast.py b/tests/utilities/test_broadcast.py
index 175c638b..f6511946 100644
--- a/tests/utilities/test_broadcast.py
+++ b/tests/utilities/test_broadcast.py
@@ -1,7 +1,7 @@
 import os
 from unittest import mock

-from litdata.utilities.broadcast import broadcast_object, requests
+from lightning_data.utilities.broadcast import broadcast_object, requests


 @mock.patch.dict(
diff --git a/tests/utilities/test_format.py b/tests/utilities/test_format.py
index 91c48a40..e8dcd592 100644
--- a/tests/utilities/test_format.py
+++ b/tests/utilities/test_format.py
@@ -1,4 +1,4 @@
-from litdata.utilities.format import _human_readable_bytes
+from lightning_data.utilities.format import _human_readable_bytes


 def test_human_readable_bytes():
diff --git a/tests/utilities/test_packing.py b/tests/utilities/test_packing.py
index c9a7c133..5fddcc25 100644
--- a/tests/utilities/test_packing.py
+++ b/tests/utilities/test_packing.py
@@ -1,6 +1,6 @@
 import pytest

-from litdata.utilities.packing import _pack_greedily
+from lightning_data.utilities.packing import _pack_greedily


 def test_pack_greedily():
diff --git a/tests/utilities/test_shuffle.py b/tests/utilities/test_shuffle.py
index 6923d82b..db6084a1 100644
--- a/tests/utilities/test_shuffle.py
+++ b/tests/utilities/test_shuffle.py
@@ -1,5 +1,5 @@
-from litdata.utilities.env import _DistributedEnv
-from litdata.utilities.shuffle import _associate_chunks_and_internals_to_ranks, _intra_node_chunk_shuffle
+from lightning_data.utilities.env import _DistributedEnv
+from lightning_data.utilities.shuffle import _associate_chunks_and_internals_to_ranks, _intra_node_chunk_shuffle


 def test_intra_node_chunk_shuffle():