fix: test failure fixes for v0.15.1 (#1358)
* fix: make FakeObject be the correct standard and robustify usage

* fix: get none valued study object from server

* add: test for minio download failures

* fix: skip test for WSL as it is not supported

* maint: rework if/else case workflow

* maint: ruff fix

* add/fix: log messages for no permission

* fix: make flow name unique and enable testing of avoiding duplicates
LennartPurucker authored Oct 14, 2024
1 parent c1911c7 commit d3bb775
Showing 7 changed files with 75 additions and 31 deletions.
2 changes: 2 additions & 0 deletions openml/_api_calls.py
@@ -208,6 +208,8 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
     for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         if file_object.object_name is None:
             raise ValueError(f"Object name is None for object {file_object!r}")
+        if file_object.etag is None:
+            raise ValueError(f"Object etag is None for object {file_object!r}")
 
         marker = destination / file_object.etag
         if marker.exists():
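
For context on why the new etag guard matters: the download cache writes a marker file named after each object's etag, so a None etag would crash the Path join right below the added check. A minimal sketch of the guarded logic (MinioObject and should_download are hypothetical names for illustration, not the library's API):

from __future__ import annotations

from pathlib import Path
from typing import NamedTuple


class MinioObject(NamedTuple):
    # Hypothetical stand-in for a minio listing entry.
    object_name: str | None
    etag: str | None


def should_download(obj: MinioObject, destination: Path) -> bool:
    # Mirrors the guards above: both fields must be non-None before the
    # etag can be used as the name of the "already downloaded" marker file.
    if obj.object_name is None:
        raise ValueError(f"Object name is None for object {obj!r}")
    if obj.etag is None:
        raise ValueError(f"Object etag is None for object {obj!r}")
    return not (destination / obj.etag).exists()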
26 changes: 9 additions & 17 deletions openml/config.py
@@ -345,7 +345,10 @@ def _setup(config: _Config | None = None) -> None:
         if not config_dir.exists():
             config_dir.mkdir(exist_ok=True, parents=True)
     except PermissionError:
-        pass
+        openml_logger.warning(
+            f"No permission to create OpenML directory at {config_dir}!"
+            " This can result in OpenML-Python not working properly."
+        )
 
     if config is None:
         config = _parse_config(config_file)
@@ -367,27 +370,16 @@ def _setup(config: _Config | None = None) -> None:
 
     try:
         cache_exists = _root_cache_directory.exists()
-    except PermissionError:
-        cache_exists = False
-
-    # create the cache subdirectory
-    try:
-        if not _root_cache_directory.exists():
+        # create the cache subdirectory
+        if not cache_exists:
             _root_cache_directory.mkdir(exist_ok=True, parents=True)
+        _create_log_handlers()
     except PermissionError:
         openml_logger.warning(
-            f"No permission to create openml cache directory at {_root_cache_directory}!"
-            " This can result in OpenML-Python not working properly.",
+            f"No permission to create OpenML directory at {_root_cache_directory}!"
+            " This can result in OpenML-Python not working properly."
         )
-
-    if cache_exists:
-        _create_log_handlers()
-    else:
         _create_log_handlers(create_file_handler=False)
-        openml_logger.warning(
-            f"No permission to create OpenML directory at {config_dir}! This can result in "
-            " OpenML-Python not working properly.",
-        )
 
 
 def set_field_in_config_file(field: str, value: Any) -> None:
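
The rework collapses the old exists-check/mkdir/if-else chain into a single try/except, so a PermissionError at any step falls through to console-only logging and exactly one warning. A condensed sketch of the new control flow (simplified, not the actual openml.config code; create_log_handlers is injected here to keep the sketch self-contained):

from pathlib import Path


def init_cache(root: Path, create_log_handlers) -> None:
    # One try/except now covers the existence check, the mkdir, and the
    # log-handler setup; any PermissionError falls back to console-only
    # logging plus a single warning.
    try:
        if not root.exists():
            root.mkdir(exist_ok=True, parents=True)
        create_log_handlers(create_file_handler=True)
    except PermissionError:
        print(f"No permission to create OpenML directory at {root}!")
        create_log_handlers(create_file_handler=False)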
3 changes: 0 additions & 3 deletions openml/runs/functions.py
@@ -206,9 +206,6 @@ def run_flow_on_task(  # noqa: C901, PLR0912, PLR0915, PLR0913
     avoid_duplicate_runs : bool, optional (default=True)
         If True, the run will throw an error if the setup/task combination is already present on
         the server. This feature requires an internet connection.
-    avoid_duplicate_runs : bool, optional (default=True)
-        If True, the run will throw an error if the setup/task combination is already present on
-        the server. This feature requires an internet connection.
     flow_tags : List[str], optional (default=None)
         A list of tags that the flow should have at creation.
     seed: int, optional (default=None)
8 changes: 7 additions & 1 deletion openml/study/functions.py
@@ -78,7 +78,7 @@ def get_study(
     return study
 
 
-def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
+def _get_study(id_: int | str, entity_type: str) -> BaseStudy:  # noqa: C901
     xml_string = openml._api_calls._perform_api_call(f"study/{id_}", "get")
     force_list_tags = (
         "oml:data_id",
@@ -93,6 +93,12 @@ def _get_study(id_: int | str, entity_type: str) -> BaseStudy:
     alias = result_dict.get("oml:alias", None)
     main_entity_type = result_dict["oml:main_entity_type"]
 
+    # Parses edge cases where the server returns a string with a newline character for empty values.
+    none_value_indicator = "\n "
+    for key in result_dict:
+        if result_dict[key] == none_value_indicator:
+            result_dict[key] = None
+
     if entity_type != main_entity_type:
         raise ValueError(
             f"Unexpected entity type '{main_entity_type}' reported by the server"
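
To make the edge case concrete: when a study field is empty, the server's XML contains only a newline plus indentation, which the XML-to-dict parsing hands back as a literal whitespace string rather than None. An illustration with made-up values (the exact indicator string depends on the server's XML formatting):

result_dict = {
    "oml:name": "my-study",
    "oml:description": "\n ",  # "empty" value as returned by the server
}

none_value_indicator = "\n "
for key in result_dict:
    if result_dict[key] == none_value_indicator:
        result_dict[key] = None

assert result_dict["oml:description"] is None
assert result_dict["oml:name"] == "my-study"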
30 changes: 29 additions & 1 deletion tests/test_openml/test_api_calls.py
@@ -36,8 +36,12 @@ def test_retry_on_database_error(self, Session_class_mock, _):
 
         assert Session_class_mock.return_value.__enter__.return_value.get.call_count == 20
 
+
 class FakeObject(NamedTuple):
     object_name: str
+    etag: str
+    """We use the etag of a Minio object as the name of a marker if we already downloaded it."""
+
 
 class FakeMinio:
     def __init__(self, objects: Iterable[FakeObject] | None = None):
@@ -60,7 +64,7 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
     some_url = f"https://not.real.com/bucket/{some_object_path}"
     mock_minio.return_value = FakeMinio(
         objects=[
-            FakeObject(some_object_path),
+            FakeObject(object_name=some_object_path, etag=str(hash(some_object_path))),
         ],
     )
 
@@ -71,3 +75,27 @@ def test_download_all_files_observes_cache(mock_minio, tmp_path: Path) -> None:
     time_modified = (tmp_path / some_filename).stat().st_mtime
 
     assert time_created == time_modified
+
+
+@mock.patch.object(minio, "Minio")
+def test_download_minio_failure(mock_minio, tmp_path: Path) -> None:
+    some_prefix, some_filename = "some/prefix", "dataset.arff"
+    some_object_path = f"{some_prefix}/{some_filename}"
+    some_url = f"https://not.real.com/bucket/{some_object_path}"
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name=None, etag="tmp"),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
+
+    mock_minio.return_value = FakeMinio(
+        objects=[
+            FakeObject(object_name="tmp", etag=None),
+        ],
+    )
+
+    with pytest.raises(ValueError):
+        _download_minio_bucket(source=some_url, destination=tmp_path)
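
Making etag a required field is presumably what "make FakeObject be the correct standard" refers to: the old one-argument construction FakeObject(some_object_path) no longer runs, so every test must supply both fields (None is still allowed at runtime, since NamedTuple annotations are not enforced). A quick standalone demonstration:

from typing import NamedTuple


class FakeObject(NamedTuple):
    object_name: str
    etag: str


FakeObject(object_name="a", etag=None)  # runs: annotations are not enforced
try:
    FakeObject("some/prefix/dataset.arff")  # the old one-argument call
except TypeError as err:
    print(err)  # missing 1 required positional argument: 'etag'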
7 changes: 6 additions & 1 deletion tests/test_openml/test_config.py
@@ -8,6 +8,7 @@
 from copy import copy
 from typing import Any, Iterator
 from pathlib import Path
+import platform
 
 import pytest
 
@@ -37,14 +38,18 @@ class TestConfig(openml.testing.TestBase):
     @unittest.mock.patch("openml.config.openml_logger.warning")
     @unittest.mock.patch("openml.config._create_log_handlers")
     @unittest.skipIf(os.name == "nt", "https://github.com/openml/openml-python/issues/1033")
+    @unittest.skipIf(
+        platform.uname().release.endswith(("-Microsoft", "microsoft-standard-WSL2")),
+        "WSL does not support chmod as we would need here, see https://github.com/microsoft/WSL/issues/81",
+    )
     def test_non_writable_home(self, log_handler_mock, warnings_mock):
         with tempfile.TemporaryDirectory(dir=self.workdir) as td:
             os.chmod(td, 0o444)
             _dd = copy(openml.config._defaults)
             _dd["cachedir"] = Path(td) / "something-else"
             openml.config._setup(_dd)
 
-            assert warnings_mock.call_count == 2
+            assert warnings_mock.call_count == 1
             assert log_handler_mock.call_count == 1
             assert not log_handler_mock.call_args_list[0][1]["create_file_handler"]
             assert openml.config._root_cache_directory == Path(td) / "something-else"
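
The new skip condition uses the usual kernel-release heuristic for detecting WSL, where chmod on the temporary directory would not actually revoke write permission. For example (release strings are illustrative):

import platform

release = platform.uname().release
# WSL1 kernels report e.g. "4.4.0-19041-Microsoft";
# WSL2 kernels report e.g. "5.15.90.1-microsoft-standard-WSL2".
on_wsl = release.endswith(("-Microsoft", "microsoft-standard-WSL2"))
print(f"running under WSL: {on_wsl}")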
30 changes: 22 additions & 8 deletions tests/test_runs/test_run_functions.py
@@ -119,7 +119,6 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
         # time.time() works in seconds
         start_time = time.time()
         while time.time() - start_time < max_waiting_time_seconds:
-
             try:
                 openml.runs.get_run_trace(run_id)
             except openml.exceptions.OpenMLServerException:
@@ -131,7 +130,9 @@ def _wait_for_processed_run(self, run_id, max_waiting_time_seconds):
                 time.sleep(10)
                 continue
 
-            assert len(run.evaluations) > 0, "Expect not-None evaluations to always contain elements."
+            assert (
+                len(run.evaluations) > 0
+            ), "Expect not-None evaluations to always contain elements."
             return
 
     raise RuntimeError(
@@ -557,7 +558,7 @@ def determine_grid_size(param_grid):
             fold_evaluations=run.fold_evaluations,
             num_repeats=1,
             num_folds=num_folds,
-            task_type=task_type
+            task_type=task_type,
         )
 
         # Check if run string and print representation do not run into an error
@@ -796,7 +797,9 @@ def test_run_and_upload_knn_pipeline(self, warnings_mock):
 
     @pytest.mark.sklearn()
     def test_run_and_upload_gridsearch(self):
-        estimator_name = "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        estimator_name = (
+            "base_estimator" if Version(sklearn.__version__) < Version("1.4") else "estimator"
+        )
         gridsearch = GridSearchCV(
             BaggingClassifier(**{estimator_name: SVC()}),
             {f"{estimator_name}__C": [0.01, 0.1, 10], f"{estimator_name}__gamma": [0.01, 0.1, 10]},
@@ -1826,7 +1829,9 @@ def test_joblib_backends(self, parallel_mock):
         num_instances = x.shape[0]
         line_length = 6 + len(task.class_labels)
 
-        backend_choice = "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        backend_choice = (
+            "loky" if Version(joblib.__version__) > Version("0.11") else "multiprocessing"
+        )
         for n_jobs, backend, call_count in [
             (1, backend_choice, 10),
             (2, backend_choice, 10),
@@ -1877,14 +1882,23 @@ def test_joblib_backends(self, parallel_mock):
         reason="SimpleImputer doesn't handle mixed type DataFrame as input",
     )
     def test_delete_run(self):
-        rs = 1
+        rs = np.random.randint(1, 2**32 - 1)
         clf = sklearn.pipeline.Pipeline(
-            steps=[("imputer", SimpleImputer()), ("estimator", DecisionTreeClassifier())],
+            steps=[
+                (f"test_server_imputer_{rs}", SimpleImputer()),
+                ("estimator", DecisionTreeClassifier()),
+            ],
         )
         task = openml.tasks.get_task(32)  # diabetes; crossvalidation
 
-        run = openml.runs.run_model_on_task(model=clf, task=task, seed=rs)
+        run = openml.runs.run_model_on_task(
+            model=clf, task=task, seed=rs, avoid_duplicate_runs=False
+        )
         run.publish()
+
+        with pytest.raises(openml.exceptions.OpenMLRunsExistError):
+            openml.runs.run_model_on_task(model=clf, task=task, seed=rs, avoid_duplicate_runs=True)
+
         TestBase._mark_entity_for_removal("run", run.run_id)
         TestBase.logger.info(f"collected from test_run_functions: {run.run_id}")
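
The random step name is what makes the flow unique per test execution: OpenML derives a flow's name from the pipeline structure, including step names, so each run of the test publishes a fresh flow and the OpenMLRunsExistError assertion can only be triggered by the second, identical run inside the same test. A sketch of the naming effect (assuming the scikit-learn extension; the exact generated name may differ):

from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

rs = 12345
clf = Pipeline(
    steps=[
        (f"test_server_imputer_{rs}", SimpleImputer()),
        ("estimator", DecisionTreeClassifier()),
    ]
)
# The flow built from this pipeline embeds the step name, yielding something
# like "sklearn.pipeline.Pipeline(test_server_imputer_12345=...,estimator=...)",
# which is distinct for every value of rs.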
