Feat: Using fsspec to download files (#348)
deependujha authored Sep 19, 2024
1 parent 895a829 commit 719bae2
Showing 16 changed files with 441 additions and 611 deletions.
31 changes: 22 additions & 9 deletions README.md
@@ -217,9 +217,8 @@ Additionally, you can inject client connection settings for [S3](https://boto3.a
 from litdata import StreamingDataset
 
 storage_options = {
-    "endpoint_url": "your_endpoint_url",
-    "aws_access_key_id": "your_access_key_id",
-    "aws_secret_access_key": "your_secret_access_key",
+    "key": "your_access_key_id",
+    "secret": "your_secret_access_key",
 }
 
 dataset = StreamingDataset('s3://my-bucket/my-data', storage_options=storage_options)
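With the move to fsspec, these option names follow s3fs conventions. If you previously relied on `endpoint_url` (for example, with MinIO or another S3-compatible store), s3fs accepts that parameter as well. A minimal sketch, assuming s3fs semantics; the endpoint and bucket below are hypothetical placeholders:

```python
from litdata import StreamingDataset

# s3fs-style options; "endpoint_url" targets S3-compatible stores such as MinIO.
# The endpoint and bucket are hypothetical placeholders, not values from this commit.
storage_options = {
    "key": "your_access_key_id",
    "secret": "your_secret_access_key",
    "endpoint_url": "http://localhost:9000",
}
dataset = StreamingDataset("s3://my-bucket/my-data", storage_options=storage_options)
```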
@@ -264,33 +263,47 @@ for batch in val_dataloader:
 
 
 
 The StreamingDataset supports reading optimized datasets from common cloud providers.
 
 ```python
 import os
 import litdata as ld
 
 # Read data from AWS S3
 aws_storage_options={
-    "AWS_ACCESS_KEY_ID": os.environ['AWS_ACCESS_KEY_ID'],
-    "AWS_SECRET_ACCESS_KEY": os.environ['AWS_SECRET_ACCESS_KEY'],
+    "key": os.environ['AWS_ACCESS_KEY_ID'],
+    "secret": os.environ['AWS_SECRET_ACCESS_KEY'],
 }
 dataset = ld.StreamingDataset("s3://my-bucket/my-data", storage_options=aws_storage_options)
 
 # Read data from GCS
 gcp_storage_options={
-    "project": os.environ['PROJECT_ID'],
+    "token": {
+        # dumped from cat ~/.config/gcloud/application_default_credentials.json
+        "account": "",
+        "client_id": "your_client_id",
+        "client_secret": "your_client_secret",
+        "quota_project_id": "your_quota_project_id",
+        "refresh_token": "your_refresh_token",
+        "type": "authorized_user",
+        "universe_domain": "googleapis.com",
+    }
 }
 dataset = ld.StreamingDataset("gs://my-bucket/my-data", storage_options=gcp_storage_options)
 
 # Read data from Azure
 azure_storage_options={
-    "account_url": f"https://{os.environ['AZURE_ACCOUNT_NAME']}.blob.core.windows.net",
-    "credential": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
+    "account_name": "azure_account_name",
+    "account_key": os.environ['AZURE_ACCOUNT_ACCESS_KEY']
 }
 dataset = ld.StreamingDataset("azure://my-bucket/my-data", storage_options=azure_storage_options)
 ```
+
+- For more details on which storage options are supported, please refer to:
+  - [AWS S3 storage options](https://github.com/fsspec/s3fs/blob/main/s3fs/core.py#L176)
+  - [GCS storage options](https://github.com/fsspec/gcsfs/blob/main/gcsfs/core.py#L154)
+  - [Azure storage options](https://github.com/fsspec/adlfs/blob/main/adlfs/spec.py#L124)
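Since litdata now forwards `storage_options` to fsspec, one way to sanity-check credentials before streaming is to hand the same options to fsspec directly. An illustrative snippet; the bucket and prefix are hypothetical:

```python
import os
import fsspec

# The same storage_options litdata receives can be passed straight to fsspec;
# listing the prefix verifies credentials and connectivity up front.
fs = fsspec.filesystem(
    "s3",
    key=os.environ["AWS_ACCESS_KEY_ID"],
    secret=os.environ["AWS_SECRET_ACCESS_KEY"],
)
print(fs.ls("my-bucket/my-data"))  # hypothetical bucket/prefix
```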

</details>

<details>
4 changes: 3 additions & 1 deletion requirements.txt
@@ -2,5 +2,7 @@ torch
 lightning-utilities
 filelock
 numpy
-boto3
+# boto3
 requests
+fsspec
+fsspec[s3] # aws s3
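boto3 is no longer imported directly; S3 access now goes through the s3fs backend that `fsspec[s3]` pulls in. A quick, illustrative way to confirm the backend resolves:

```python
import fsspec

# With fsspec[s3] installed, the "s3" protocol resolves to s3fs (which wraps
# aiobotocore) rather than litdata calling boto3 itself.
fs_cls = fsspec.get_filesystem_class("s3")
print(fs_cls.__module__)  # "s3fs.core" when s3fs is available
```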
2 changes: 2 additions & 0 deletions requirements/extras.txt
@@ -5,3 +5,5 @@ pyarrow
 tqdm
 lightning-sdk ==0.1.17 # Must be pinned to ensure compatibility
 google-cloud-storage
+fsspec[gs] # google cloud storage
+fsspec[abfs] # azure blob
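The `gs` and `abfs` extras install the gcsfs and adlfs backends, respectively. An illustrative check that the optional backends are importable; `get_filesystem_class` raises ImportError when the matching extra is missing:

```python
import fsspec

# get_filesystem_class only imports the backend for a protocol, so it is a
# lightweight way to verify the optional extras without touching credentials.
for protocol in ("gs", "abfs"):
    try:
        fsspec.get_filesystem_class(protocol)
        print(f"{protocol}: backend available")
    except ImportError as err:
        print(f"{protocol}: missing extra ({err})")
```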
1 change: 1 addition & 0 deletions src/litdata/constants.py
@@ -85,3 +85,4 @@
 _TIME_FORMAT = "%Y-%m-%d_%H-%M-%S.%fZ"
 _IS_IN_STUDIO = bool(os.getenv("LIGHTNING_CLOUD_PROJECT_ID", None)) and bool(os.getenv("LIGHTNING_CLUSTER_ID", None))
 _ENABLE_STATUS = bool(int(os.getenv("ENABLE_STATUS_REPORT", "0")))
+_SUPPORTED_CLOUD_PROVIDERS = ["s3", "gs", "azure", "abfs"]
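The new constant enumerates the URL schemes the fsspec-based downloader accepts. A hypothetical sketch (not the library's actual code) of how such a list can gate incoming dataset URLs:

```python
from urllib.parse import urlparse

_SUPPORTED_CLOUD_PROVIDERS = ["s3", "gs", "azure", "abfs"]

def _assert_supported_provider(url: str) -> None:
    # Hypothetical helper: reject URL schemes with no matching fsspec backend here.
    scheme = urlparse(url).scheme
    if scheme not in _SUPPORTED_CLOUD_PROVIDERS:
        raise ValueError(
            f"Unsupported cloud provider `{scheme}`. Expected one of {_SUPPORTED_CLOUD_PROVIDERS}."
        )

_assert_supported_provider("s3://my-bucket/my-data")  # passes silently
```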
