diff --git a/.github/workflows/test-inference.yaml b/.github/workflows/test-inference.yaml index ca4c45ff0..a68ef07e8 100644 --- a/.github/workflows/test-inference.yaml +++ b/.github/workflows/test-inference.yaml @@ -5,8 +5,6 @@ on: branches: - dev pull_request: - branches: - - dev jobs: test: diff --git a/.github/workflows/test-nodes.yaml b/.github/workflows/test-nodes.yaml index 99507571b..b10161724 100644 --- a/.github/workflows/test-nodes.yaml +++ b/.github/workflows/test-nodes.yaml @@ -5,8 +5,6 @@ on: branches: - dev pull_request: - branches: - - dev jobs: test: diff --git a/.github/workflows/test-optimization.yaml b/.github/workflows/test-optimization.yaml index ea1cf861e..4625f39d7 100644 --- a/.github/workflows/test-optimization.yaml +++ b/.github/workflows/test-optimization.yaml @@ -5,8 +5,6 @@ on: branches: - dev pull_request: - branches: - - dev jobs: test: diff --git a/.github/workflows/unit-tests.yaml b/.github/workflows/unit-tests.yaml index 3612d561f..5883080eb 100644 --- a/.github/workflows/unit-tests.yaml +++ b/.github/workflows/unit-tests.yaml @@ -5,8 +5,6 @@ on: branches: - dev pull_request: - branches: - - dev jobs: test: diff --git a/autointent/_pipeline/_pipeline.py b/autointent/_pipeline/_pipeline.py index e2711b677..3085b90ba 100644 --- a/autointent/_pipeline/_pipeline.py +++ b/autointent/_pipeline/_pipeline.py @@ -10,7 +10,7 @@ from autointent import Context, Dataset from autointent.configs import InferenceNodeConfig, LoggingConfig, VectorIndexConfig -from autointent.custom_types import ListOfGenericLabels, NodeType, ValidationScheme +from autointent.custom_types import ListOfGenericLabels, NodeType, SamplerType, ValidationScheme from autointent.metrics import PREDICTION_METRICS_MULTILABEL from autointent.nodes import InferenceNode, NodeOptimizer from autointent.nodes.schemes import OptimizationConfig @@ -87,7 +87,7 @@ def default_optimizer(cls, multilabel: bool, seed: int = 42) -> "Pipeline": """ return cls.from_search_space(search_space=load_default_search_space(multilabel), seed=seed) - def _fit(self, context: Context) -> None: + def _fit(self, context: Context, sampler: SamplerType = "brute") -> None: """ Optimize the pipeline. @@ -102,7 +102,7 @@ def _fit(self, context: Context) -> None: for node_type in NodeType: node_optimizer = self.nodes.get(node_type, None) if node_optimizer is not None: - node_optimizer.fit(context) # type: ignore[union-attr] + node_optimizer.fit(context, sampler) # type: ignore[union-attr] if not context.vector_index_config.save_db: self._logger.info("removing vector database from file system...") # TODO clear cache from appdirs @@ -117,7 +117,12 @@ def _is_inference(self) -> bool: return isinstance(self.nodes[NodeType.scoring], InferenceNode) def fit( - self, dataset: Dataset, scheme: ValidationScheme = "ho", n_folds: int = 3, refit_after: bool = False + self, + dataset: Dataset, + scheme: ValidationScheme = "ho", + n_folds: int = 3, + refit_after: bool = False, + sampler: SamplerType = "brute", ) -> Context: """ Optimize the pipeline from dataset. 
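A minimal usage sketch of the new `sampler` argument threaded from `Pipeline.fit` down into node optimization. The dataset path and the top-level `Pipeline` import location are assumptions for illustration; only `Dataset.from_json`, `Pipeline.default_optimizer`, and the `fit` signature shown above are taken from this changeset.

# Hedged sketch: optimize the default pipeline with the TPE sampler instead of
# the default exhaustive "brute" grid search. The JSON path is hypothetical.
from autointent import Dataset, Pipeline

dataset = Dataset.from_json("path/to/intents.json")
pipeline = Pipeline.default_optimizer(multilabel=False)
context = pipeline.fit(dataset, scheme="ho", sampler="tpe")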
@@ -135,7 +140,7 @@ def fit( context.configure_vector_index(self.vector_index_config) self.validate_modules(dataset) - self._fit(context) + self._fit(context, sampler) if context.is_ram_to_clear(): nodes_configs = context.optimization_info.get_inference_nodes_config() diff --git a/autointent/configs/_optimization.py b/autointent/configs/_optimization.py index 52c9823b4..8d925f8ae 100644 --- a/autointent/configs/_optimization.py +++ b/autointent/configs/_optimization.py @@ -2,9 +2,9 @@ from pathlib import Path -from pydantic import BaseModel, Field +from pydantic import BaseModel, Field, PositiveInt -from autointent.custom_types import ValidationScheme +from autointent.custom_types import SamplerType, ValidationScheme from ._name import get_run_name @@ -16,7 +16,7 @@ class DataConfig(BaseModel): """Path to the training data. Can be local path or HF repo.""" scheme: ValidationScheme """Hold-out or cross-validation.""" - n_folds: int = 3 + n_folds: PositiveInt = 3 """Number of folds in cross-validation.""" @@ -25,6 +25,7 @@ class TaskConfig(BaseModel): search_space_path: Path | None = None """Path to the search space configuration file. If None, the default search space will be used""" + sampler: SamplerType = "brute" class LoggingConfig(BaseModel): diff --git a/autointent/custom_types.py b/autointent/custom_types.py index 557150e05..b0318ee38 100644 --- a/autointent/custom_types.py +++ b/autointent/custom_types.py @@ -5,7 +5,9 @@ """ from enum import Enum -from typing import Literal, TypeAlias +from typing import Annotated, Literal, TypeAlias + +from annotated_types import Interval class LogLevel(Enum): @@ -71,4 +73,9 @@ class Split: INTENTS = "intents" +SamplerType = Literal["brute", "tpe", "random"] ValidationScheme = Literal["ho", "cv"] + + +FloatFromZeroToOne = Annotated[float, Interval(ge=0, le=1)] +"""Float value between 0 and 1, inclusive.""" diff --git a/autointent/modules/decision/_adaptive.py b/autointent/modules/decision/_adaptive.py index e8a7ceffd..f07c66e22 100644 --- a/autointent/modules/decision/_adaptive.py +++ b/autointent/modules/decision/_adaptive.py @@ -7,7 +7,7 @@ import numpy.typing as npt from autointent import Context -from autointent.custom_types import ListOfGenericLabels, ListOfLabelsWithOOS, MultiLabel +from autointent.custom_types import FloatFromZeroToOne, ListOfGenericLabels, ListOfLabelsWithOOS, MultiLabel from autointent.exceptions import MismatchNumClassesError from autointent.metrics import decision_f1 from autointent.modules.abc import DecisionModule @@ -58,7 +58,7 @@ class AdaptiveDecision(DecisionModule): supports_oos = False name = "adaptive" - def __init__(self, search_space: list[float] | None = None) -> None: + def __init__(self, search_space: list[FloatFromZeroToOne] | None = None) -> None: """ Initialize the AdaptiveDecision. @@ -68,7 +68,7 @@ def __init__(self, search_space: list[float] | None = None) -> None: self.search_space = search_space if search_space is not None else default_search_space @classmethod - def from_context(cls, context: Context, search_space: list[float] | None = None) -> "AdaptiveDecision": + def from_context(cls, context: Context, search_space: list[FloatFromZeroToOne] | None = None) -> "AdaptiveDecision": """ Create an AdaptiveDecision instance using a Context object. 
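The new `FloatFromZeroToOne` alias (`Annotated[float, Interval(ge=0, le=1)]`) does not add runtime checks to plain constructor calls such as `AdaptiveDecision(search_space=...)`; the bounds only take effect where pydantic consumes the annotation, e.g. the generated search-space models and the JSON schema updated later in this diff. A small sketch of that behavior, assuming pydantic v2's `TypeAdapter` (the adapter usage is illustrative and not part of the change):

# Illustrative only: validate values against the Interval-annotated alias.
from typing import Annotated

from annotated_types import Interval
from pydantic import TypeAdapter, ValidationError

FloatFromZeroToOne = Annotated[float, Interval(ge=0, le=1)]

adapter = TypeAdapter(list[FloatFromZeroToOne])
print(adapter.validate_python([0.1, 0.5, 1.0]))  # accepted: all within [0, 1]

try:
    adapter.validate_python([1.5])  # rejected: above the upper bound
except ValidationError as exc:
    print(exc.error_count(), "validation error")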
diff --git a/autointent/modules/decision/_jinoos.py b/autointent/modules/decision/_jinoos.py index e1bb992b7..f01bbe88b 100644 --- a/autointent/modules/decision/_jinoos.py +++ b/autointent/modules/decision/_jinoos.py @@ -6,7 +6,7 @@ import numpy.typing as npt from autointent import Context -from autointent.custom_types import ListOfGenericLabels +from autointent.custom_types import FloatFromZeroToOne, ListOfGenericLabels from autointent.exceptions import MismatchNumClassesError from autointent.modules.abc import DecisionModule from autointent.schemas import Tag @@ -55,7 +55,7 @@ class JinoosDecision(DecisionModule): def __init__( self, - search_space: list[float] | None = None, + search_space: list[FloatFromZeroToOne] | None = None, ) -> None: """ Initialize Jinoos predictor. @@ -65,7 +65,7 @@ def __init__( self.search_space = np.array(search_space) if search_space is not None else default_search_space @classmethod - def from_context(cls, context: Context, search_space: list[float] | None = None) -> "JinoosDecision": + def from_context(cls, context: Context, search_space: list[FloatFromZeroToOne] | None = None) -> "JinoosDecision": """ Initialize from context. diff --git a/autointent/modules/decision/_threshold.py b/autointent/modules/decision/_threshold.py index 76bf0f281..42dcbca20 100644 --- a/autointent/modules/decision/_threshold.py +++ b/autointent/modules/decision/_threshold.py @@ -7,7 +7,7 @@ import numpy.typing as npt from autointent import Context -from autointent.custom_types import ListOfGenericLabels, MultiLabel +from autointent.custom_types import FloatFromZeroToOne, ListOfGenericLabels, MultiLabel from autointent.exceptions import MismatchNumClassesError from autointent.modules.abc import DecisionModule from autointent.schemas import Tag @@ -75,7 +75,7 @@ class ThresholdDecision(DecisionModule): def __init__( self, - thresh: float | list[float], + thresh: FloatFromZeroToOne | list[FloatFromZeroToOne], ) -> None: """ Initialize threshold predictor. @@ -85,7 +85,9 @@ def __init__( self.thresh = thresh if isinstance(thresh, float) else np.array(thresh) @classmethod - def from_context(cls, context: Context, thresh: float | list[float] = 0.5) -> "ThresholdDecision": + def from_context( + cls, context: Context, thresh: FloatFromZeroToOne | list[FloatFromZeroToOne] = 0.5 + ) -> "ThresholdDecision": """ Initialize from context. diff --git a/autointent/modules/decision/_tunable.py b/autointent/modules/decision/_tunable.py index 8bb559d0d..f5bfdfb29 100644 --- a/autointent/modules/decision/_tunable.py +++ b/autointent/modules/decision/_tunable.py @@ -6,6 +6,7 @@ import numpy.typing as npt import optuna from optuna.trial import Trial +from pydantic import PositiveInt from autointent.context import Context from autointent.custom_types import ListOfGenericLabels @@ -76,7 +77,7 @@ class TunableDecision(DecisionModule): def __init__( self, - n_trials: int = 320, + n_trials: PositiveInt = 320, seed: int = 0, tags: list[Tag] | None = None, ) -> None: @@ -92,7 +93,7 @@ def __init__( self.tags = tags @classmethod - def from_context(cls, context: Context, n_trials: int = 320) -> "TunableDecision": + def from_context(cls, context: Context, n_trials: PositiveInt = 320) -> "TunableDecision": """ Initialize from context. 
diff --git a/autointent/modules/embedding/_logreg.py b/autointent/modules/embedding/_logreg.py index c0f4f18a9..2c6a4257a 100644 --- a/autointent/modules/embedding/_logreg.py +++ b/autointent/modules/embedding/_logreg.py @@ -4,6 +4,7 @@ import numpy as np from numpy.typing import NDArray +from pydantic import PositiveInt from sklearn.linear_model import LogisticRegression, LogisticRegressionCV from sklearn.multioutput import MultiOutputClassifier from sklearn.preprocessing import LabelEncoder @@ -51,7 +52,7 @@ class LogregAimedEmbedding(EmbeddingModule): def __init__( self, embedder_config: EmbedderConfig | str | dict[str, Any], - cv: int = 3, + cv: PositiveInt = 3, ) -> None: """ Initialize the LogregAimedEmbedding. @@ -67,7 +68,7 @@ def from_context( cls, context: Context, embedder_config: EmbedderConfig | str, - cv: int = 3, + cv: PositiveInt = 3, ) -> "LogregAimedEmbedding": """ Create a LogregAimedEmbedding instance using a Context object. diff --git a/autointent/modules/embedding/_retrieval.py b/autointent/modules/embedding/_retrieval.py index dc7816467..13226847a 100644 --- a/autointent/modules/embedding/_retrieval.py +++ b/autointent/modules/embedding/_retrieval.py @@ -2,6 +2,8 @@ from typing import Any +from pydantic import PositiveInt + from autointent import Context, VectorIndex from autointent.context.optimization_info import RetrieverArtifact from autointent.custom_types import ListOfLabels @@ -44,7 +46,7 @@ class RetrievalAimedEmbedding(EmbeddingModule): def __init__( self, - k: int, + k: PositiveInt, embedder_config: EmbedderConfig | str | dict[str, Any], ) -> None: """ @@ -64,7 +66,7 @@ def __init__( def from_context( cls, context: Context, - k: int, + k: PositiveInt, embedder_config: EmbedderConfig | str, ) -> "RetrievalAimedEmbedding": """ diff --git a/autointent/modules/scoring/_description/description.py b/autointent/modules/scoring/_description/description.py index 90368bb9c..19ebbac28 100644 --- a/autointent/modules/scoring/_description/description.py +++ b/autointent/modules/scoring/_description/description.py @@ -5,6 +5,7 @@ import numpy as np import scipy from numpy.typing import NDArray +from pydantic import PositiveFloat from sklearn.metrics.pairwise import cosine_similarity from autointent import Context, Embedder @@ -38,7 +39,7 @@ class DescriptionScorer(ScoringModule): def __init__( self, embedder_config: EmbedderConfig | str | dict[str, Any], - temperature: float = 1.0, + temperature: PositiveFloat = 1.0, ) -> None: """ Initialize the DescriptionScorer. @@ -53,7 +54,7 @@ def __init__( def from_context( cls, context: Context, - temperature: float, + temperature: PositiveFloat, embedder_config: EmbedderConfig | str | None = None, ) -> "DescriptionScorer": """ diff --git a/autointent/modules/scoring/_dnnc/dnnc.py b/autointent/modules/scoring/_dnnc/dnnc.py index f64e21e79..d1a9aedd6 100644 --- a/autointent/modules/scoring/_dnnc/dnnc.py +++ b/autointent/modules/scoring/_dnnc/dnnc.py @@ -6,6 +6,7 @@ import numpy as np import numpy.typing as npt +from pydantic import PositiveInt from autointent import Context, Ranker, VectorIndex from autointent.custom_types import ListOfLabels @@ -76,9 +77,9 @@ class DNNCScorer(ScoringModule): def __init__( self, + k: PositiveInt, cross_encoder_config: CrossEncoderConfig | str | dict[str, Any], embedder_config: EmbedderConfig | str | dict[str, Any], - k: int, ) -> None: """ Initialize the DNNCScorer. 
@@ -96,7 +97,7 @@ def from_context( cls, context: Context, cross_encoder_config: CrossEncoderConfig | str, - k: int, + k: PositiveInt, embedder_config: EmbedderConfig | str | None = None, ) -> "DNNCScorer": """ diff --git a/autointent/modules/scoring/_knn/knn.py b/autointent/modules/scoring/_knn/knn.py index 13c5f1be7..b64b0246a 100644 --- a/autointent/modules/scoring/_knn/knn.py +++ b/autointent/modules/scoring/_knn/knn.py @@ -4,6 +4,7 @@ import numpy as np import numpy.typing as npt +from pydantic import PositiveInt from autointent import Context, VectorIndex from autointent.custom_types import WEIGHT_TYPES, ListOfLabels @@ -58,7 +59,7 @@ class KNNScorer(ScoringModule): def __init__( self, embedder_config: EmbedderConfig | str | dict[str, Any], - k: int, + k: PositiveInt, weights: WEIGHT_TYPES = "distance", ) -> None: """ @@ -79,7 +80,7 @@ def __init__( def from_context( cls, context: Context, - k: int, + k: PositiveInt, weights: WEIGHT_TYPES, embedder_config: EmbedderConfig | str | None = None, ) -> "KNNScorer": diff --git a/autointent/modules/scoring/_mlknn/mlknn.py b/autointent/modules/scoring/_mlknn/mlknn.py index 044766d06..44e7c2597 100644 --- a/autointent/modules/scoring/_mlknn/mlknn.py +++ b/autointent/modules/scoring/_mlknn/mlknn.py @@ -4,6 +4,7 @@ import numpy as np from numpy.typing import NDArray +from pydantic import NonNegativeInt, PositiveFloat, PositiveInt from autointent import Context, VectorIndex from autointent.custom_types import ListOfLabels @@ -58,7 +59,7 @@ class MLKnnScorer(ScoringModule): def __init__( self, - k: int, + k: PositiveInt, embedder_config: EmbedderConfig | str | dict[str, Any], s: float = 1.0, ignore_first_neighbours: int = 0, @@ -80,9 +81,9 @@ def __init__( def from_context( cls, context: Context, - k: int, - s: float = 1.0, - ignore_first_neighbours: int = 0, + k: PositiveInt, + s: PositiveFloat = 1.0, + ignore_first_neighbours: NonNegativeInt = 0, embedder_config: EmbedderConfig | str | None = None, ) -> "MLKnnScorer": """ diff --git a/autointent/nodes/_optimization/_node_optimizer.py b/autointent/nodes/_optimization/_node_optimizer.py index 43bde1a59..ca8b9981e 100644 --- a/autointent/nodes/_optimization/_node_optimizer.py +++ b/autointent/nodes/_optimization/_node_optimizer.py @@ -1,20 +1,37 @@ """Node optimizer.""" import gc -import itertools as it import logging from copy import deepcopy +from functools import partial from pathlib import Path from typing import Any +import optuna import torch +from optuna.trial import Trial +from pydantic import BaseModel, Field from autointent import Dataset from autointent.context import Context -from autointent.custom_types import NodeType +from autointent.custom_types import NodeType, SamplerType from autointent.nodes._nodes_info import NODES_INFO +class ParamSpaceInt(BaseModel): + low: int = Field(..., description="Low boundary of the search space.") + high: int = Field(..., description="High boundary of the search space.") + step: int = Field(1, description="Step of the search space.") + log: bool = Field(False, description="Whether to use a logarithmic scale.") + + +class ParamSpaceFloat(BaseModel): + low: float = Field(..., description="Low boundary of the search space.") + high: float = Field(..., description="High boundary of the search space.") + step: float | None = Field(None, description="Step of the search space.") + log: bool = Field(False, description="Whether to use a logarithmic scale.") + + class NodeOptimizer: """Node optimizer class.""" @@ -43,7 +60,7 @@ def __init__( 
self.modules_search_spaces = search_space self._logger = logging.getLogger(__name__) # TODO solve duplicate logging messages problem - def fit(self, context: Context) -> None: + def fit(self, context: Context, sampler: SamplerType = "brute") -> None: """ Fit the node optimizer. @@ -52,54 +69,107 @@ def fit(self, context: Context) -> None: self._logger.info("starting %s node optimization...", self.node_info.node_type) for search_space in deepcopy(self.modules_search_spaces): + self._counter = 0 module_name = search_space.pop("module_name") - - for j_combination, params_combination in enumerate(it.product(*search_space.values())): - module_kwargs = dict(zip(search_space.keys(), params_combination, strict=False)) - - self._logger.debug("initializing %s module...", module_name) - context.callback_handler.start_module( - module_name=module_name, num=j_combination, module_kwargs=module_kwargs - ) - module = self.node_info.modules_available[module_name].from_context(context, **module_kwargs) - - embedder_config = module.get_embedder_config() - if embedder_config is not None: - module_kwargs["embedder_config"] = embedder_config - - self._logger.debug("scoring %s module...", module_name) - metrics_score = module.score(context, metrics=self.metrics) - metric_value = metrics_score[self.target_metric] - - context.callback_handler.log_metrics(metrics_score) - context.callback_handler.end_module() - - dump_dir = context.get_dump_dir() - - if dump_dir is not None: - module_dump_dir = self.get_module_dump_dir(dump_dir, module_name, j_combination) - module.dump(module_dump_dir) - else: - module_dump_dir = None - - context.optimization_info.log_module_optimization( - self.node_info.node_type, - module_name, - module_kwargs, - metric_value, - self.target_metric, - module.get_assets(), # retriever name / scores / predictions - module_dump_dir, - module=module if not context.is_ram_to_clear() else None, - ) - - if context.is_ram_to_clear(): - module.clear_cache() - gc.collect() - torch.cuda.empty_cache() + n_trials = None + if "n_trials" in search_space: + n_trials = search_space.pop("n_trials") + if sampler == "tpe": + sampler_instance = optuna.samplers.TPESampler(seed=context.seed) + n_trials = n_trials or 10 + elif sampler == "brute": + sampler_instance = optuna.samplers.BruteForceSampler(seed=context.seed) # type: ignore[assignment] + n_trials = None + elif sampler == "random": + sampler_instance = optuna.samplers.RandomSampler(seed=context.seed) # type: ignore[assignment] + n_trials = n_trials or 10 + else: + msg = f"Unexpected sampler: {sampler}" + raise ValueError(msg) + study = optuna.create_study(direction="maximize", sampler=sampler_instance) + optuna.logging.set_verbosity(optuna.logging.WARNING) + obj = partial(self.objective, module_name=module_name, search_space=search_space, context=context) + study.optimize(obj, n_trials=n_trials) self._logger.info("%s node optimization is finished!", self.node_info.node_type) + def objective( + self, + trial: Trial, + module_name: str, + search_space: dict[str, ParamSpaceInt | ParamSpaceFloat | list[Any]], + context: Context, + ) -> float: + config = self.suggest(trial, search_space) + + self._logger.debug("initializing %s module...", module_name) + module = self.node_info.modules_available[module_name].from_context(context, **config) + + embedder_config = module.get_embedder_config() + if embedder_config is not None: + config["embedder_config"] = embedder_config + + context.callback_handler.start_module(module_name=module_name, num=self._counter, 
module_kwargs=config) + + self._logger.debug("scoring %s module...", module_name) + all_metrics = module.score(context, metrics=self.metrics) + target_metric = all_metrics[self.target_metric] + + context.callback_handler.log_metrics(all_metrics) + context.callback_handler.end_module() + + dump_dir = context.get_dump_dir() + + if dump_dir is not None: + module_dump_dir = self.get_module_dump_dir(dump_dir, module_name, self._counter) + module.dump(module_dump_dir) + else: + module_dump_dir = None + + context.optimization_info.log_module_optimization( + self.node_info.node_type, + module_name, + config, + target_metric, + self.target_metric, + module.get_assets(), # retriever name / scores / predictions + module_dump_dir, + module=module if not context.is_ram_to_clear() else None, + ) + + if context.is_ram_to_clear(): + module.clear_cache() + gc.collect() + torch.cuda.empty_cache() + + self._counter += 1 + + return target_metric + + def suggest(self, trial: Trial, search_space: dict[str, Any | list[Any]]) -> dict[str, Any]: + res: dict[str, Any] = {} + + def is_valid_param_space( + param_space: dict[str, Any], space_type: type[ParamSpaceInt | ParamSpaceFloat] + ) -> bool: + try: + space_type(**param_space) + return True # noqa: TRY300 + except ValueError: + return False + + for param_name, param_space in search_space.items(): + if isinstance(param_space, list): + res[param_name] = trial.suggest_categorical(param_name, choices=param_space) + elif is_valid_param_space(param_space, ParamSpaceInt): + res[param_name] = trial.suggest_int(param_name, **param_space) + elif is_valid_param_space(param_space, ParamSpaceFloat): + res[param_name] = trial.suggest_float(param_name, **param_space) + else: + msg = f"Unsupported type of param search space: {param_space}" + raise TypeError(msg) + return res + def get_module_dump_dir(self, dump_dir: Path, module_name: str, j_combination: int) -> str: """ Get module dump directory. diff --git a/autointent/nodes/schemes.py b/autointent/nodes/schemes.py index 58cba623b..acfc71fab 100644 --- a/autointent/nodes/schemes.py +++ b/autointent/nodes/schemes.py @@ -2,13 +2,58 @@ import inspect from collections.abc import Iterator -from typing import Any, Literal, TypeAlias, Union, get_type_hints +from typing import Annotated, Any, Literal, TypeAlias, Union, get_args, get_origin, get_type_hints -from pydantic import BaseModel, Field, RootModel +from pydantic import BaseModel, Field, PositiveInt, RootModel from autointent.custom_types import NodeType from autointent.modules.abc import Module from autointent.nodes import DecisionNodeInfo, EmbeddingNodeInfo, ScoringNodeInfo +from autointent.nodes._optimization._node_optimizer import ParamSpaceFloat, ParamSpaceInt + + +def unwrap_annotated(tp: type) -> type: + """ + Unwrap the Annotated type to get the actual type. + + :param tp: Type to unwrap + :return: Unwrapped type + """ + return get_args(tp)[0] if get_origin(tp) is Annotated else tp + + +def type_matches(target: type, tp: type) -> bool: + """ + Recursively check if the target type is present in the given type. + + This function handles union types by unwrapping Annotated types where necessary. 
+ + :param target: Target type + :param tp: Given type + :return: If the target type is present in the given type + """ + origin = get_origin(tp) + + if origin is Union: # float | list[float] + return any(type_matches(target, arg) for arg in get_args(tp)) + return unwrap_annotated(tp) is target + + +def get_optuna_class(param_type: type) -> type[ParamSpaceInt | ParamSpaceFloat] | None: + """ + Get the Optuna class for the given parameter type. + + If the (possibly annotated or union) type includes int or float, this function + returns the corresponding search space class. + + :param param_type: Parameter type (could be a union, annotated type, or container) + :return: ParamSpaceInt if the type matches int, ParamSpaceFloat if it matches float, else None. + """ + if type_matches(int, param_type): + return ParamSpaceInt + if type_matches(float, param_type): + return ParamSpaceFloat + return None def generate_models_and_union_type_for_classes( @@ -20,9 +65,12 @@ def generate_models_and_union_type_for_classes( for cls in classes: init_signature = inspect.signature(cls.from_context) globalns = getattr(cls.from_context, "__globals__", {}) - type_hints = get_type_hints(cls.from_context, globalns, None) # Resolve forward refs + type_hints = get_type_hints(cls.from_context, globalns, None, include_extras=True) # Resolve forward refs - fields = {"module_name": (Literal[cls.name], Field(...))} + fields = { + "module_name": (Literal[cls.name], Field(...)), + "n_trials": (PositiveInt | None, Field(None, description="Number of trials")), + } for param_name, param in init_signature.parameters.items(): if param_name in ("self", "cls", "context"): @@ -30,8 +78,11 @@ def generate_models_and_union_type_for_classes( param_type: TypeAlias = type_hints.get(param_name, Any) # type: ignore[valid-type] # noqa: PYI042 field = Field(default=[param.default]) if param.default is not inspect.Parameter.empty else Field(...) 
- - fields[param_name] = (list[param_type], field) # type: ignore[assignment] + search_type = get_optuna_class(param_type) + if search_type is None: + fields[param_name] = (list[param_type], field) + else: + fields[param_name] = (list[param_type] | search_type, field) model_name = f"{cls.__name__}InitModel" models[cls.__name__] = type( diff --git a/docs/optimizer_config.schema.json b/docs/optimizer_config.schema.json index d213f8621..f51fa93a3 100644 --- a/docs/optimizer_config.schema.json +++ b/docs/optimizer_config.schema.json @@ -7,6 +7,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "search_space": { "default": [ null @@ -15,6 +29,8 @@ "anyOf": [ { "items": { + "maximum": 1.0, + "minimum": 0.0, "type": "number" }, "type": "array" @@ -40,6 +56,20 @@ "const": "argmax", "title": "Module Name", "type": "string" + }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" } }, "required": [ @@ -109,6 +139,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "cross_encoder_config": { "items": { "anyOf": [ @@ -124,11 +168,19 @@ "type": "array" }, "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "title": "K" }, "embedder_config": { "default": [ @@ -237,12 +289,34 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "temperature": { - "items": { - "type": "number" - }, - "title": "Temperature", - "type": "array" + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0.0, + "type": "number" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceFloat" + } + ], + "title": "Temperature" }, "embedder_config": { "default": [ @@ -513,6 +587,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "search_space": { "default": [ null @@ -521,6 +609,8 @@ "anyOf": [ { "items": { + "maximum": 1.0, + "minimum": 0.0, "type": "number" }, "type": "array" @@ -547,12 +637,34 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "title": "K" }, "weights": { "items": { @@ -602,6 +714,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": 
"null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "embedder_config": { "default": [ null @@ -636,6 +762,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "embedder_config": { "items": { "anyOf": [ @@ -651,14 +791,22 @@ "type": "array" }, "cv": { + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], "default": [ 3 ], - "items": { - "type": "integer" - }, - "title": "Cv", - "type": "array" + "title": "Cv" } }, "required": [ @@ -675,32 +823,70 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "title": "K" }, "s": { + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0.0, + "type": "number" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceFloat" + } + ], "default": [ 1.0 ], - "items": { - "type": "number" - }, - "title": "S", - "type": "array" + "title": "S" }, "ignore_first_neighbours": { + "anyOf": [ + { + "items": { + "minimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], "default": [ 0 ], - "items": { - "type": "integer" - }, - "title": "Ignore First Neighbours", - "type": "array" + "title": "Ignore First Neighbours" }, "embedder_config": { "default": [ @@ -741,6 +927,77 @@ "title": "NodeType", "type": "string" }, + "ParamSpaceFloat": { + "properties": { + "low": { + "description": "Low boundary of the search space.", + "title": "Low", + "type": "number" + }, + "high": { + "description": "High boundary of the search space.", + "title": "High", + "type": "number" + }, + "step": { + "anyOf": [ + { + "type": "number" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Step of the search space.", + "title": "Step" + }, + "log": { + "default": false, + "description": "Whether to use a logarithmic scale.", + "title": "Log", + "type": "boolean" + } + }, + "required": [ + "low", + "high" + ], + "title": "ParamSpaceFloat", + "type": "object" + }, + "ParamSpaceInt": { + "properties": { + "low": { + "description": "Low boundary of the search space.", + "title": "Low", + "type": "integer" + }, + "high": { + "description": "High boundary of the search space.", + "title": "High", + "type": "integer" + }, + "step": { + "default": 1, + "description": "Step of the search space.", + "title": "Step", + "type": "integer" + }, + "log": { + "default": false, + "description": "Whether to use a logarithmic scale.", + "title": "Log", + "type": "boolean" + } + }, + "required": [ + "low", + "high" + ], + "title": "ParamSpaceInt", + "type": "object" + }, "RerankScorerInitModel": { "properties": { "module_name": { @@ -748,12 +1005,33 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "k": { - 
"items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "items": { + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "title": "K" }, "weights": { "items": { @@ -802,38 +1080,52 @@ "type": "array" }, "m": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], "default": [ null ], - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "title": "M", - "type": "array" + "title": "M" }, "rank_threshold_cutoff": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "type": "integer" + }, + { + "type": "null" + } + ] + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], "default": [ null ], - "items": { - "anyOf": [ - { - "type": "integer" - }, - { - "type": "null" - } - ] - }, - "title": "Rank Threshold Cutoff", - "type": "array" + "title": "Rank Threshold Cutoff" } }, "required": [ @@ -852,12 +1144,34 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "k": { - "items": { - "type": "integer" - }, - "title": "K", - "type": "array" + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], + "title": "K" }, "embedder_config": { "items": { @@ -976,6 +1290,20 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "clf_name": { "default": [ "LogisticRegression" @@ -1037,25 +1365,50 @@ "title": "Module Name", "type": "string" }, + "n_trials": { + "anyOf": [ + { + "exclusiveMinimum": 0, + "type": "integer" + }, + { + "type": "null" + } + ], + "default": null, + "description": "Number of trials", + "title": "N Trials" + }, "thresh": { + "anyOf": [ + { + "items": { + "anyOf": [ + { + "maximum": 1.0, + "minimum": 0.0, + "type": "number" + }, + { + "items": { + "maximum": 1.0, + "minimum": 0.0, + "type": "number" + }, + "type": "array" + } + ] + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceFloat" + } + ], "default": [ 0.5 ], - "items": { - "anyOf": [ - { - "type": "number" - }, - { - "items": { - "type": "number" - }, - "type": "array" - } - ] - }, - "title": "Thresh", - "type": "array" + "title": "Thresh" } }, "required": [ @@ -1072,14 +1425,22 @@ "type": "string" }, "n_trials": { + "anyOf": [ + { + "items": { + "exclusiveMinimum": 0, + "type": "integer" + }, + "type": "array" + }, + { + "$ref": "#/$defs/ParamSpaceInt" + } + ], "default": [ 320 ], - "items": { - "type": "integer" - }, - "title": "N Trials", - "type": "array" + "title": "N Trials" } }, "required": [ diff --git a/tests/assets/configs/multiclass.yaml b/tests/assets/configs/multiclass.yaml index 69db79f8b..3fbf8948c 100644 --- a/tests/assets/configs/multiclass.yaml +++ b/tests/assets/configs/multiclass.yaml @@ -20,7 +20,7 @@ - avsolatorio/GIST-small-Embedding-v0 k: [1, 3] - module_name: sklearn - embedder_name: + embedder_config: - sergeyzh/rubert-tiny-turbo clf_name: - LogisticRegression @@ -36,6 +36,5 @@ search_space: - module_name: threshold thresh: [0.5, [0.5, 0.5, 0.5, 0.5]] - - module_name: tunable - 
module_name: argmax - module_name: jinoos diff --git a/tests/assets/configs/multilabel.yaml b/tests/assets/configs/multilabel.yaml index 879c31c6d..91742358a 100644 --- a/tests/assets/configs/multilabel.yaml +++ b/tests/assets/configs/multilabel.yaml @@ -32,5 +32,4 @@ search_space: - module_name: threshold thresh: [0.5, [0.5, 0.5, 0.5, 0.5]] - - module_name: tunable - module_name: adaptive diff --git a/tests/assets/configs/optuna.yaml b/tests/assets/configs/optuna.yaml new file mode 100644 index 000000000..b775ab3f6 --- /dev/null +++ b/tests/assets/configs/optuna.yaml @@ -0,0 +1,28 @@ +- node_type: embedding + target_metric: retrieval_hit_rate + search_space: + - module_name: retrieval + k: [10] + embedder_config: + - sentence-transformers/all-MiniLM-L6-v2 + - avsolatorio/GIST-small-Embedding-v0 +- node_type: scoring + target_metric: scoring_roc_auc + search_space: + - module_name: knn + n_trials: 3 + k: + low: 5 + high: 10 + step: 1 + weights: [uniform, distance, closest] + - module_name: linear +- node_type: decision + target_metric: decision_accuracy + search_space: + - module_name: threshold + thresh: + low: 0.1 + high: 0.9 + - module_name: argmax + - module_name: jinoos diff --git a/tests/callback/test_callback.py b/tests/callback/test_callback.py index 53b70491c..c9931a6b7 100644 --- a/tests/callback/test_callback.py +++ b/tests/callback/test_callback.py @@ -118,17 +118,78 @@ def test_pipeline_callbacks(dataset): ("end_module", {}), ( "start_module", - {"module_kwargs": {"embedder_config": None, "k": 1, "weights": "uniform"}, "module_name": "knn", "num": 0}, + { + "module_kwargs": { + "embedder_config": { + "batch_size": 32, + "classifier_prompt": None, + "cluster_prompt": None, + "default_prompt": None, + "device": None, + "max_length": None, + "model_name": "sergeyzh/rubert-tiny-turbo", + "passage_prompt": None, + "query_prompt": None, + "sts_prompt": None, + "use_cache": False, + }, + "k": 1, + "weights": "uniform", + }, + "module_name": "knn", + "num": 0, + }, ), ("log_metric", {"metrics": {"scoring_accuracy": 1.0, "scoring_roc_auc": 1.0}}), ("end_module", {}), ( "start_module", - {"module_kwargs": {"embedder_config": None, "k": 1, "weights": "distance"}, "module_name": "knn", "num": 1}, + { + "module_kwargs": { + "embedder_config": { + "batch_size": 32, + "classifier_prompt": None, + "cluster_prompt": None, + "default_prompt": None, + "device": None, + "max_length": None, + "model_name": "sergeyzh/rubert-tiny-turbo", + "passage_prompt": None, + "query_prompt": None, + "sts_prompt": None, + "use_cache": False, + }, + "k": 1, + "weights": "distance", + }, + "module_name": "knn", + "num": 1, + }, ), ("log_metric", {"metrics": {"scoring_accuracy": 1.0, "scoring_roc_auc": 1.0}}), ("end_module", {}), - ("start_module", {"module_kwargs": {"embedder_config": None}, "module_name": "linear", "num": 0}), + ( + "start_module", + { + "module_kwargs": { + "embedder_config": { + "batch_size": 32, + "classifier_prompt": None, + "cluster_prompt": None, + "default_prompt": None, + "device": None, + "max_length": None, + "model_name": "sergeyzh/rubert-tiny-turbo", + "passage_prompt": None, + "query_prompt": None, + "sts_prompt": None, + "use_cache": False, + }, + }, + "module_name": "linear", + "num": 0, + }, + ), ("log_metric", {"metrics": {"scoring_accuracy": 0.75, "scoring_roc_auc": 1.0}}), ("end_module", {}), ("start_module", {"module_kwargs": {"thresh": 0.5}, "module_name": "threshold", "num": 0}), diff --git a/tests/conftest.py b/tests/conftest.py index 1945b3426..002812907 100644 --- 
a/tests/conftest.py +++ b/tests/conftest.py @@ -27,7 +27,7 @@ def dataset_unsplitted(): return Dataset.from_json(path) -TaskType = Literal["multiclass", "multilabel", "description"] +TaskType = Literal["multiclass", "multilabel", "description", "optuna"] def get_search_space_path(task_type: TaskType): diff --git a/tests/pipeline/test_optimization.py b/tests/pipeline/test_optimization.py index 3df2918e9..de34bac5d 100644 --- a/tests/pipeline/test_optimization.py +++ b/tests/pipeline/test_optimization.py @@ -1,5 +1,4 @@ import os -from typing import Literal import pytest @@ -10,7 +9,21 @@ ) from tests.conftest import get_search_space, setup_environment -TaskType = Literal["multiclass", "multilabel", "description"] + +@pytest.mark.parametrize( + "sampler", + ["tpe", "random"], +) +def test_bayes(dataset, sampler): + project_dir = setup_environment() + search_space = get_search_space("optuna") + + pipeline_optimizer = Pipeline.from_search_space(search_space) + + pipeline_optimizer.set_config(LoggingConfig(project_dir=project_dir, dump_modules=True, clear_ram=True)) + pipeline_optimizer.set_config(VectorIndexConfig()) + + pipeline_optimizer.fit(dataset, scheme="ho", refit_after=False, sampler=sampler) @pytest.mark.parametrize(