diff --git a/README.md b/README.md index 2f1464b0..20c556e4 100644 --- a/README.md +++ b/README.md @@ -49,20 +49,26 @@ This library is a Python client to interact with the [Polaris Hub](https://polar ```python import polaris as po -# Download a benchmark (the associated dataset will be transparently downloaded) -benchmark = po.load_benchmark("org_or_user/name") +# Load the benchmark from the Hub +benchmark = po.load_benchmark("polaris/hello_world_benchmark") -# Retrieve the splits +# Get the train and test data-loaders train, test = benchmark.get_train_test_split() -# Work your magic! -y_pred = ... +# Use the training data to train your model +# Get the input as an array with 'train.inputs' and 'train.targets' +# Or simply iterate over the train object. +for x, y in train: + ... -# Run the evaluation procedure -results = benchmark.evaluate(y_pred) +# Work your magic to accurately predict the test set +predictions = [0.0 for x in test] -# Upload your results to the hub -results.upload_to_hub() +# Evaluate your predictions +results = benchmark.evaluate(predictions) + +# Submit your results +results.upload_to_hub(owner="dummy-user") ``` ## Documentation diff --git a/docs/api/load.md b/docs/api/load.md new file mode 100644 index 00000000..f916b687 --- /dev/null +++ b/docs/api/load.md @@ -0,0 +1,8 @@ + +::: polaris.load_dataset + +--- + +::: polaris.load_benchmark + +--- diff --git a/docs/quickstart.md b/docs/quickstart.md index 8c46702c..2553d22e 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -24,13 +24,26 @@ If all you care about is to partake in a benchmark that is hosted on the hub, it ```python import polaris as po -benchmark = po.load_benchmark("org_or_user/name") +# Load the benchmark from the Hub +benchmark = po.load_benchmark("polaris/hello_world_benchmark") + +# Get the train and test data-loaders train, test = benchmark.get_train_test_split() -y_pred = ... # Work your magic! 
+# Use the training data to train your model +# Get the input as an array with 'train.inputs' and 'train.targets' +# Or simply iterate over the train object. +for x, y in train: + ... + +# Work your magic to accurately predict the test set +predictions = [0.0 for x in test] + +# Evaluate your predictions +results = benchmark.evaluate(predictions) -results = benchmark.evaluate(y_pred) -results.upload_to_hub() +# Submit your results +results.upload_to_hub(owner="dummy-user") ``` That's all there is to it to partake in a benchmark. No complicated, custom data-loaders or evaluation protocol. With just a few lines of code, you can feel confident that you are properly evaluating your model and focus on what you do best: Solving the hard problems in our domain! diff --git a/docs/tutorials/basics.ipynb b/docs/tutorials/basics.ipynb index 15ba4d33..8c91df01 100644 --- a/docs/tutorials/basics.ipynb +++ b/docs/tutorials/basics.ipynb @@ -63,7 +63,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "\u001b[32m2023-11-06 17:37:18.375\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as lu-valencelabs (lu@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" + "\u001b[32m2023-11-27 14:54:08.788\u001b[0m | \u001b[1mINFO \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mlogin\u001b[0m:\u001b[36m262\u001b[0m - \u001b[1mYou are already logged in to the Polaris Hub as cwognum (cas@valencediscovery.com). Set `overwrite=True` to force re-authentication.\u001b[0m\n" ] } ], @@ -285,7 +285,7 @@ { "data": { "text/html": [ - "
nameNone
description
tags
user_attributes
ownerNone
benchmark_namehello_world_benchmark
benchmark_owner
slugpolaris
organization_idorg_2WG9hRFgKNIRtGw4orsMPcr1F4S
user_idNone
ownerorg_2WG9hRFgKNIRtGw4orsMPcr1F4S
github_urlNone
paper_urlNone
contributorsNone
results
Test setTarget labelMetricScore
testSOLmean_squared_error2.6875139821
testSOLmean_absolute_error1.2735690161
" + "
nameNone
description
tags
user_attributes
ownerNone
benchmark_namehello_world_benchmark
benchmark_owner
slugpolaris
external_idorg_2WG9hRFgKNIRtGw4orsMPcr1F4S
typeorganization
github_urlNone
paper_urlNone
contributorsNone
artifact_idNone
benchmark_artifact_idpolaris/hello-world-benchmark
results
Test setTarget labelMetricScore
testSOLmean_squared_error2.6875139821
testSOLmean_absolute_error1.2735690161
" ], "text/plain": [ "{\n", @@ -297,13 +297,14 @@ " \"benchmark_name\": \"hello_world_benchmark\",\n", " \"benchmark_owner\": {\n", " \"slug\": \"polaris\",\n", - " \"organization_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n", - " \"user_id\": null,\n", - " \"owner\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\"\n", + " \"external_id\": \"org_2WG9hRFgKNIRtGw4orsMPcr1F4S\",\n", + " \"type\": \"organization\"\n", " },\n", " \"github_url\": null,\n", " \"paper_url\": null,\n", " \"contributors\": null,\n", + " \"artifact_id\": null,\n", + " \"benchmark_artifact_id\": \"polaris/hello-world-benchmark\",\n", " \"results\": [\n", " {\n", " \"Test set\": \"test\",\n", @@ -341,7 +342,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "a601f415-c563-4efe-94c3-0d44f3fd6576", "metadata": {}, "outputs": [], @@ -362,7 +363,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "60cbf4b9-8514-480d-beda-8a50e5f7c9a6", "metadata": { "scrolled": true @@ -372,16 +373,16 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/lu.zhu/miniconda3/envs/pov3/lib/python3.11/site-packages/pydantic/main.py:309: UserWarning: Pydantic serializer warnings:\n", + "/home/cas/micromamba/envs/polaris/lib/python3.12/site-packages/pydantic/main.py:308: UserWarning: Pydantic serializer warnings:\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " Expected `url` but got `str` - serialized value may not be as expected\n", " return self.__pydantic_serializer__.to_python(\n", - "\u001b[32m2023-11-06 17:38:06.152\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m413\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. 
View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/YYH033LKM1BaT8byAC5Jc\u001b[0m\n" + "\u001b[32m2023-11-27 14:54:46.649\u001b[0m | \u001b[32m\u001b[1mSUCCESS \u001b[0m | \u001b[36mpolaris.hub.client\u001b[0m:\u001b[36mupload_results\u001b[0m:\u001b[36m428\u001b[0m - \u001b[32m\u001b[1mYour result has been successfully uploaded to the Hub. View it here: https://polarishub.io/benchmarks/polaris/hello_world_benchmark/ns4JrC3hQNK9M1hbVPchy\u001b[0m\n" ] } ], "source": [ - "client.upload_results(results)\n", + "client.upload_results(results, owner=\"cwognum\")\n", "client.close()" ] }, @@ -396,14 +397,6 @@ "\n", "---" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0868ff53-7a42-4e4c-bae4-29fb04c513c7", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -422,7 +415,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.5" + "version": "3.12.0" } }, "nbformat": 4, diff --git a/mkdocs.yml b/mkdocs.yml index cabe3da0..e4ac884c 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -22,6 +22,7 @@ nav: - Custom Datasets and Benchmarks: tutorials/custom_dataset_benchmark.ipynb # - Creating Datasets with zarr: tutorials/dataset_zarr.ipynb - API Reference: + - Load: api/load.md - Core: - Dataset: api/dataset.md - Benchmark: api/benchmark.md diff --git a/polaris/benchmark/_base.py b/polaris/benchmark/_base.py index 87908a4a..45d6a25a 100644 --- a/polaris/benchmark/_base.py +++ b/polaris/benchmark/_base.py @@ -22,7 +22,7 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidBenchmarkError, PolarisChecksumError from polaris.utils.misc import listit -from polaris.utils.types import AccessType, DataFormat, PredictionsType, SplitType +from polaris.utils.types import AccessType, DataFormat, HubOwner, PredictionsType, SplitType ColumnsType = Union[str, list[str]] @@ -371,6 +371,7 @@ def upload_to_hub( settings: Optional[PolarisHubSettings] = 
None, cache_auth_token: bool = True, access: Optional[AccessType] = "private", + owner: Optional[Union[HubOwner, str]] = None, **kwargs: dict, ): """ @@ -382,7 +383,7 @@ def upload_to_hub( with PolarisHubClient( env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs ) as client: - return client.upload_benchmark(self, access) + return client.upload_benchmark(self, access=access, owner=owner) def to_json(self, destination: str) -> str: """Save the benchmark to a destination directory as a JSON file. diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index f0e3b53c..f5f658d7 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -23,7 +23,7 @@ from polaris.utils.dict2html import dict2html from polaris.utils.errors import InvalidDatasetError, PolarisChecksumError from polaris.utils.io import get_zarr_root, robust_copy -from polaris.utils.types import AccessType, HttpUrlString, License +from polaris.utils.types import AccessType, HttpUrlString, HubOwner, License # Constants _SUPPORTED_TABLE_EXTENSIONS = ["parquet"] @@ -201,6 +201,7 @@ def upload_to_hub( settings: Optional[PolarisHubSettings] = None, cache_auth_token: bool = True, access: Optional[AccessType] = "private", + owner: Optional[Union[HubOwner, str]] = None, **kwargs: dict, ): """ @@ -212,7 +213,7 @@ def upload_to_hub( with PolarisHubClient( env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs ) as client: - return client.upload_dataset(self, access) + return client.upload_dataset(self, access=access, owner=owner) @classmethod def from_zarr(cls, path: str) -> "Dataset": diff --git a/polaris/evaluate/_results.py b/polaris/evaluate/_results.py index 52464c11..9231f3db 100644 --- a/polaris/evaluate/_results.py +++ b/polaris/evaluate/_results.py @@ -182,6 +182,7 @@ def upload_to_hub( settings: Optional[PolarisHubSettings] = None, cache_auth_token: bool = True, access: Optional[AccessType] = "private", + owner: 
Optional[Union[HubOwner, str]] = None, **kwargs: dict, ): """ @@ -193,7 +194,7 @@ def upload_to_hub( with PolarisHubClient( env_file=env_file, settings=settings, cache_auth_token=cache_auth_token, **kwargs ) as client: - return client.upload_results(self, access) + return client.upload_results(self, access=access, owner=owner) def _repr_dict_(self) -> dict: """Utility function for pretty-printing to the command line and jupyter notebooks""" diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 3394a362..fe982365 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -373,7 +373,12 @@ def get_benchmark(self, owner: Union[str, HubOwner], name: str) -> BenchmarkSpec ) return benchmark_cls(**response) - def upload_results(self, results: BenchmarkResults, access: AccessType = "private"): + def upload_results( + self, + results: BenchmarkResults, + access: AccessType = "private", + owner: Optional[Union[HubOwner, str]] = None, + ): """Upload the results to the Polaris Hub. Info: Owner @@ -395,9 +400,19 @@ def upload_results(self, results: BenchmarkResults, access: AccessType = "privat Args: results: The results to upload. access: Grant public or private access to result + owner: Which Hub user or organization owns the artifact. + Optional if and only if the `results.owner` attribute is set. """ # Get the serialized model data-structure + + if results.owner is None: + if owner is None: + raise ValueError( + "The `owner` argument must be specified if the `results.owner` attribute is not set." 
+ ) + results.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner) + result_json = results.model_dump(by_alias=True, exclude_none=True) # Make a request to the hub @@ -414,7 +429,11 @@ def upload_results(self, results: BenchmarkResults, access: AccessType = "privat return response def upload_dataset( - self, dataset: Dataset, access: AccessType = "private", timeout: TimeoutTypes = (10, 200) + self, + dataset: Dataset, + access: AccessType = "private", + timeout: TimeoutTypes = (10, 200), + owner: Optional[Union[HubOwner, str]] = None, ): """Upload the dataset to the Polaris Hub. @@ -432,8 +451,21 @@ def upload_dataset( dataset: The dataset to upload. access: Grant public or private access to result timeout: Request timeout values. User can modify the value when uploading large dataset as needed. + This can be a single value with the timeout in seconds for all IO operations, or a more granular + tuple with (connect_timeout, write_timeout). The type of the timeout parameter comes from `httpx`. + Since datasets can get large, it might be needed to increase the write timeout for larger datasets. + See also: https://www.python-httpx.org/advanced/#timeout-configuration + owner: Which Hub user or organization owns the artifact. + Optional if and only if the `dataset.owner` attribute is set. """ + if dataset.owner is None: + if owner is None: + raise ValueError( + "The `owner` argument must be specified if the `dataset.owner` attribute is not set." 
+ ) + dataset.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner) + # Get the serialized data-model # We exclude the table as it handled separately and the cache_dir as it is user-specific dataset_json = dataset.model_dump(exclude={"cache_dir", "table"}, exclude_none=True, by_alias=True) @@ -500,7 +532,12 @@ def upload_dataset( return response - def upload_benchmark(self, benchmark: BenchmarkSpecification, access: AccessType = "private"): + def upload_benchmark( + self, + benchmark: BenchmarkSpecification, + access: AccessType = "private", + owner: Optional[Union[HubOwner, str]] = None, + ): """Upload the benchmark to the Polaris Hub. Info: Owner @@ -520,7 +557,15 @@ def upload_benchmark(self, benchmark: BenchmarkSpecification, access: AccessType Args: benchmark: The benchmark to upload. access: Grant public or private access to result + owner: Which Hub user or organization owns the artifact. + Optional if and only if the `benchmark.owner` attribute is set. """ + if benchmark.owner is None: + if owner is None: + raise ValueError( + "The `owner` argument must be specified if the `benchmark.owner` attribute is not set." + ) + benchmark.owner = owner if isinstance(owner, HubOwner) else HubOwner(slug=owner) # Get the serialized data-model # We exclude the dataset as we expect it to exist on the hub already. diff --git a/polaris/loader/__init__.py b/polaris/loader/__init__.py index 61162261..980f6dcf 100644 --- a/polaris/loader/__init__.py +++ b/polaris/loader/__init__.py @@ -1,54 +1,3 @@ -import fsspec -import yaml +from .load import load_benchmark, load_dataset -from polaris.benchmark._definitions import ( - MultiTaskBenchmarkSpecification, - SingleTaskBenchmarkSpecification, -) -from polaris.dataset._dataset import Dataset -from polaris.hub.client import PolarisHubClient -from polaris.utils import fs -from polaris.utils.errors import InvalidBenchmarkError, InvalidDatasetError - - -def load_dataset(path: str): - """ - Loads the dataset. 
Inspired by the HF API, this can either load from a remote or local path or from the Hub. - """ - - extension = fs.get_extension(path) - is_file = fs.is_file(path) or extension == "zarr" - - if not is_file: - # Load from the Hub - client = PolarisHubClient() - return client.get_dataset(*path.split("/")) - - if extension == "zarr": - return Dataset.from_zarr(path) - elif extension == "json": - return Dataset.from_json(path) - - raise NotImplementedError("This should not be reached.") - - -def load_benchmark(path: str): - """ - Loads a benchmark. - """ - - is_file = fs.is_file(path) or fs.get_extension(path) == "zarr" - - if not is_file: - # Load from the Hub - client = PolarisHubClient() - return client.get_benchmark(*path.split("/")) - - with fsspec.open(path, "r") as fd: - data = yaml.safe_load(fd) # type: ignore - - # TODO (cwognum): As this gets more complex, how do we effectivly choose which class we should use? - # e.g. we might end up with a single class per benchmark. - is_single_task = isinstance(data["target_cols"], str) or len(data["target_cols"]) == 1 - cls = SingleTaskBenchmarkSpecification if is_single_task else MultiTaskBenchmarkSpecification - return cls.from_json(path) +__all__ = ["load_benchmark", "load_dataset"] diff --git a/polaris/loader/load.py b/polaris/loader/load.py new file mode 100644 index 00000000..b6a3cc1a --- /dev/null +++ b/polaris/loader/load.py @@ -0,0 +1,84 @@ +import fsspec +import yaml + +from polaris.benchmark._definitions import ( + MultiTaskBenchmarkSpecification, + SingleTaskBenchmarkSpecification, +) +from polaris.dataset._dataset import Dataset +from polaris.hub.client import PolarisHubClient +from polaris.utils import fs + + +def load_dataset(path: str) -> Dataset: + """ + Loads a Polaris dataset. + + In Polaris, a dataset is a tabular data structure that stores data-points in a row-wise manner. + A dataset can have multiple modalities or targets, can be sparse + and can be part of _one or multiple benchmarks_. 
+ + The Polaris dataset can be loaded from the Hub or from a local or remote directory. + + - **Hub** (recommended): When loading the dataset from the Hub, you can simply + provide the `owner/name` slug. This can be easily copied from the relevant dataset + page on the Hub. + - **Directory**: When loading the dataset from a directory, you should provide the path + as returned by [`Dataset.to_json`][polaris.dataset.Dataset.to_json] or + [`Dataset.to_zarr`][polaris.dataset.Dataset.to_zarr]. The path can be local or remote. + + Warning: Loading from `.zarr` + Loading and saving datasets from and to `.zarr` is still experimental and currently not + supported by the Hub. + """ + + extension = fs.get_extension(path) + is_file = fs.is_file(path) or extension == "zarr" + + if not is_file: + # Load from the Hub + client = PolarisHubClient() + return client.get_dataset(*path.split("/")) + + if extension == "zarr": + return Dataset.from_zarr(path) + elif extension == "json": + return Dataset.from_json(path) + + raise NotImplementedError("This should not be reached.") + + +def load_benchmark(path: str): + """ + Loads a Polaris benchmark. + + In Polaris, a benchmark wraps a dataset with additional meta-data to specify the evaluation logic. + + The Polaris benchmark can be loaded from the Hub or from a local or remote directory. + + Note: Dataset is automatically loaded + The dataset underlying the benchmark is automatically loaded when loading the benchmark. + + - **Hub** (recommended): When loading the benchmark from the Hub, you can simply + provide the `owner/name` slug. This can be easily copied from the relevant benchmark + page on the Hub. + - **Directory**: When loading the benchmark from a directory, you should provide the path + as returned by [`BenchmarkSpecification.to_json`][polaris.benchmark._base.BenchmarkSpecification.to_json]. + The path can be local or remote. 
+ """ + + is_file = fs.is_file(path) or fs.get_extension(path) == "zarr" + + if not is_file: + # Load from the Hub + client = PolarisHubClient() + return client.get_benchmark(*path.split("/")) + + with fsspec.open(path, "r") as fd: + data = yaml.safe_load(fd) # type: ignore + + # TODO (cwognum): As this gets more complex, how do we effectively choose which class we should use? + # e.g. we might end up with a single class per benchmark. + is_single_task = isinstance(data["target_cols"], str) or len(data["target_cols"]) == 1 + cls = SingleTaskBenchmarkSpecification if is_single_task else MultiTaskBenchmarkSpecification + return cls.from_json(path)