Skip to content

Commit

Permalink
Parameter permutation (#341)
Browse files Browse the repository at this point in the history
This PR adds code to:
* sort the user-provided values before storing them as attributes in
discrete parameters (see also #336)
* sort the parameters stored in search spaces

If values and parameters are left unsorted, reproducibility can suffer in the
sense that the same parameter content provided in a different order can lead
to different optimization results. For instance, the `RandomRecommender`
randomly selects rows from the `comp_rep` dataframe for the discrete
subspace, and those rows are ordered differently if the parameter values come
in a different order. This can lead to rather surprising behavior, for example
when the parameter values are given as the output of Python's `set`
constructor, whose iteration order can depend on `PYTHONHASHSEED`.
  • Loading branch information
AdrianSosic authored Sep 3, 2024
2 parents e98b63d + e62bd6b commit c2dd164
Show file tree
Hide file tree
Showing 8 changed files with 39 additions and 49 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- `to_tensor` now also handles `numpy` arrays
- `MIN` mode of `NumericalTarget` is now implemented via the acquisition function
instead of negating the computational representation
- Search spaces now store their parameters in alphabetical order by name

### Fixed
- `CategoricalParameter` and `TaskParameter` no longer incorrectly coerce a single
Expand Down
9 changes: 7 additions & 2 deletions baybe/parameters/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,12 @@
from baybe.utils.numerical import DTypeFloatNumpy


def _convert_values(value, self, field) -> tuple[str, ...]:
    """Convert the values of a categorical parameter and return them sorted.

    The raw input is first passed through ``nonstring_to_tuple`` (together with
    the instance and the field the converter is invoked for); the resulting
    entries are then returned as a tuple in sorted order.
    """
    as_tuple = nonstring_to_tuple(value, self, field)
    ordered = sorted(as_tuple)
    return tuple(ordered)


@define(frozen=True, slots=False)
class CategoricalParameter(DiscreteParameter):
"""Parameter class for categorical parameters."""
Expand All @@ -26,8 +32,7 @@ class CategoricalParameter(DiscreteParameter):
# object variables
_values: tuple[str, ...] = field(
alias="values",
# FIXME[typing]: `attrs.Converter` is not yet supported by type checkers
converter=Converter(nonstring_to_tuple, takes_self=True, takes_field=True), # type: ignore
converter=Converter(_convert_values, takes_self=True, takes_field=True), # type: ignore
validator=( # type: ignore
min_len(2),
validate_unique_values,
Expand Down
3 changes: 2 additions & 1 deletion baybe/parameters/substance.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,13 @@ class SubstanceParameter(DiscreteParameter):

# object variables
data: dict[str, Smiles] = field(
converter=lambda x: dict(sorted(x.items())),
validator=deep_mapping(
mapping_validator=min_len(2),
# FIXME[typing]: https://github.com/python-attrs/attrs/issues/1206
key_validator=and_(instance_of(str), min_len(1)),
value_validator=lambda *x: None,
)
),
)
"""A mapping that provides the SMILES strings for all available parameter values."""

Expand Down
5 changes: 5 additions & 0 deletions baybe/parameters/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,8 @@ def get_parameters_from_dataframe(
)

return parameters


def sort_parameters(parameters: Collection[Parameter]) -> tuple[Parameter, ...]:
    """Return the given parameters as a tuple ordered by their names."""
    ordered = sorted(parameters, key=lambda param: param.name)
    return tuple(ordered)
5 changes: 3 additions & 2 deletions baybe/searchspace/continuous.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
)
from baybe.parameters import NumericalContinuousParameter
from baybe.parameters.base import ContinuousParameter
from baybe.parameters.utils import get_parameters_from_dataframe
from baybe.parameters.utils import get_parameters_from_dataframe, sort_parameters
from baybe.searchspace.validation import (
get_transform_parameters,
validate_parameter_names,
Expand All @@ -47,7 +47,8 @@ class SubspaceContinuous(SerialMixin):
"""

parameters: tuple[NumericalContinuousParameter, ...] = field(
converter=to_tuple, validator=lambda _, __, x: validate_parameter_names(x)
converter=sort_parameters,
validator=lambda _, __, x: validate_parameter_names(x),
)
"""The parameters of the subspace."""

Expand Down
5 changes: 3 additions & 2 deletions baybe/searchspace/discrete.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
TaskParameter,
)
from baybe.parameters.base import DiscreteParameter, Parameter
from baybe.parameters.utils import get_parameters_from_dataframe
from baybe.parameters.utils import get_parameters_from_dataframe, sort_parameters
from baybe.searchspace.validation import (
get_transform_parameters,
validate_parameter_names,
Expand Down Expand Up @@ -91,7 +91,8 @@ class SubspaceDiscrete(SerialMixin):
"""

parameters: tuple[DiscreteParameter, ...] = field(
converter=to_tuple, validator=lambda _, __, x: validate_parameter_names(x)
converter=sort_parameters,
validator=lambda _, __, x: validate_parameter_names(x),
)
"""The list of parameters of the subspace."""

Expand Down
58 changes: 17 additions & 41 deletions examples/Custom_Hooks/campaign_stopping.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,44 +59,28 @@

### Problem Definition and Lookup Functionality

# We load the dataframe containing the lookup data for the closed-loop simulation:

try:
lookup = pd.read_excel("./../Backtesting/lookup.xlsx")
except FileNotFoundError:
lookup = pd.read_excel("examples/Backtesting/lookup.xlsx")

# Following the setup described [here](../Backtesting/full_lookup.md), we create the
# building blocks for the optimization problem:

dict_solvent = {
"DMAc": r"CC(N(C)C)=O",
"Butyornitrile": r"CCCC#N",
"Butyl Ester": r"CCCCOC(C)=O",
"p-Xylene": r"CC1=CC=C(C)C=C1",
}
dict_base = {
"Potassium acetate": r"O=C([O-])C.[K+]",
"Potassium pivalate": r"O=C([O-])C(C)(C)C.[K+]",
"Cesium acetate": r"O=C([O-])C.[Cs+]",
"Cesium pivalate": r"O=C([O-])C(C)(C)C.[Cs+]",
}
dict_ligand = {
"BrettPhos": r"CC(C)C1=CC(C(C)C)=C(C(C(C)C)=C1)C2=C(P(C3CCCCC3)C4CCCCC4)C(OC)="
"CC=C2OC",
"Di-tert-butylphenylphosphine": r"CC(C)(C)P(C1=CC=CC=C1)C(C)(C)C",
"(t-Bu)PhCPhos": r"CN(C)C1=CC=CC(N(C)C)=C1C2=CC=CC=C2P(C(C)(C)C)C3=CC=CC=C3",
"Tricyclohexylphosphine": r"P(C1CCCCC1)(C2CCCCC2)C3CCCCC3",
"PPh3": r"P(C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3",
"XPhos": r"CC(C1=C(C2=CC=CC=C2P(C3CCCCC3)C4CCCCC4)C(C(C)C)=CC(C(C)C)=C1)C",
"P(2-furyl)3": r"P(C1=CC=CO1)(C2=CC=CO2)C3=CC=CO3",
"Methyldiphenylphosphine": r"CP(C1=CC=CC=C1)C2=CC=CC=C2",
"1268824-69-6": r"CC(OC1=C(P(C2CCCCC2)C3CCCCC3)C(OC(C)C)=CC=C1)C",
"JackiePhos": r"FC(F)(F)C1=CC(P(C2=C(C3=C(C(C)C)C=C(C(C)C)C=C3C(C)C)C(OC)=CC=C2OC)"
r"C4=CC(C(F)(F)F)=CC(C(F)(F)F)=C4)=CC(C(F)(F)F)=C1",
"SCHEMBL15068049": r"C[C@]1(O2)O[C@](C[C@]2(C)P3C4=CC=CC=C4)(C)O[C@]3(C)C1",
"Me2PPh": r"CP(C)C1=CC=CC=C1",
}
solvent_data = dict(set(zip(lookup.Solvent, lookup.Solvent_SMILES)))
base_data = dict(set(zip(lookup.Base, lookup.Base_SMILES)))
ligand_data = dict(set(zip(lookup.Ligand, lookup.Ligand_SMILES)))
temperature_values = set(lookup.Temp_C)
concentration_values = set(lookup.Concentration)

parameters = [
SubstanceParameter(name="Solvent", data=dict_solvent, encoding="MORDRED"),
SubstanceParameter(name="Base", data=dict_base, encoding="MORDRED"),
SubstanceParameter(name="Ligand", data=dict_ligand, encoding="MORDRED"),
NumericalDiscreteParameter(name="Temp_C", values=[90, 105, 120], tolerance=2),
NumericalDiscreteParameter(name="Concentration", values=[0.057, 0.1, 0.153]),
SubstanceParameter(name="Solvent", data=solvent_data, encoding="MORDRED"),
SubstanceParameter(name="Base", data=base_data, encoding="MORDRED"),
SubstanceParameter(name="Ligand", data=ligand_data, encoding="MORDRED"),
NumericalDiscreteParameter(name="Temp_C", values=temperature_values, tolerance=2),
NumericalDiscreteParameter(name="Concentration", values=concentration_values),
]

searchspace = SearchSpace.from_product(parameters=parameters)
Expand All @@ -107,14 +91,6 @@
initial_recommender=RandomRecommender(), recommender=BotorchRecommender()
)

# Also, we load the dataframe containing the lookup data for the closed-loop simulation:

try:
lookup = pd.read_excel("./../Backtesting/lookup.xlsx")
except FileNotFoundError:
lookup = pd.read_excel("examples/Backtesting/lookup.xlsx")


### Simulating the Uninterrupted Campaigns

# First, we run several Monte Carlo repetitions of the uninterrupted campaign to get a
Expand Down
2 changes: 1 addition & 1 deletion tests/test_searchspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def test_discrete_searchspace_creation_from_dataframe():
name="cat_unspecified", values=["d", "e", "f"]
)

all_params = (num_specified, num_unspecified, cat_specified, cat_unspecified)
all_params = (cat_specified, cat_unspecified, num_specified, num_unspecified)

df = pd.DataFrame({param.name: param.values for param in all_params})
searchspace = SearchSpace(
Expand Down

0 comments on commit c2dd164

Please sign in to comment.