Merge branch 'main' into integration-tests-increase-indulgence

# Conflicts: # test/integration/test_genetic_schemes.py
aimclub · Mar 30, 2024 · ac95d93 · ac95d93
2 parents 33770db + 68706be
commit ac95d93
Show file tree

Hide file tree

Showing 103 changed files with 2,357 additions and 920 deletions.
diff --git a/.github/workflows/publish_pypi.yml b/.github/workflows/publish_pypi.yml
@@ -29,4 +29,4 @@ jobs:
         with:
           user: __token__
           password: ${{ secrets.GOLEM_PYPI_PUBLISH }}
-          repository_url: https://upload.pypi.org/legacy/
+          repository-url: https://upload.pypi.org/legacy/
diff --git a/README.rst b/README.rst
@@ -7,7 +7,7 @@
 
     |sai| |itmo|
 
-    |python| |pypi| |build| |integration| |docs| |license| |tg| |eng| |mirror|
+    |python| |pypi| |build| |integration| |coverage| |docs| |license| |tg| |eng| |mirror|
 
 
 Оптимизация и обучение графовых моделей эволюционными методами
@@ -80,7 +80,7 @@ GOLEM можно установить с помощью ``pip``:
 Быстрый старт
 =============
 
-Следующий пример показывает поиск графа по графу-эталону с помощью метрики расстояния редактирования (Edit Distance). Оптимизатор настраивается с минимальным набором параметров и простыми одноточечными мутациями. Более подробные примеры можно найти в файлах `simple_run.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/simple_run.py>`_, `graph_search.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/graph_search.py>`_ и `tree_search.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/tree_search.py>`_ в директории `examples/synthetic_graph_evolution <https://github.com/aimclub/GOLEM/tree/main/examples/synthetic_graph_evolution>`_.
+Следующий пример показывает поиск графа по графу-эталону с помощью метрики редакционного расстояния (Edit Distance). Оптимизатор настраивается с минимальным набором параметров и простыми одноточечными мутациями. Более подробные примеры можно найти в файлах `simple_run.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/simple_run.py>`_, `graph_search.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/graph_search.py>`_ и `tree_search.py <https://github.com/aimclub/GOLEM/blob/main/examples/synthetic_graph_evolution/tree_search.py>`_ в директории `examples/synthetic_graph_evolution <https://github.com/aimclub/GOLEM/tree/main/examples/synthetic_graph_evolution>`_.
 
 .. code-block:: python
 
@@ -106,6 +106,13 @@ GOLEM можно установить с помощью ``pip``:
         optimiser.history.show.fitness_line()
         return found_graph
 
+Если проследить предков найденного графа, будет видно, как к нему один за другим применяются генетические операторы (мутации, скрещивания и т.д.), приводящие в конечном итоге к целевому графу:
+
+.. image:: /docs/source/img/evolution_process.gif
+   :alt: Процесс эволюции
+   :align: center
+
+Можно также заметить, что, несмотря на общее улучшение фитнеса вдоль генеалогического пути, оптимизатор иногда жертвует локальным уменьшением редакционного расстояния некоторых графов ради поддержания разнообразия и получения таким образом наилучшего решения в конце.
 
 Структура проекта
 =================
@@ -226,14 +233,14 @@ GOLEM можно установить с помощью ``pip``:
 .. |eng| image:: https://img.shields.io/badge/lang-en-red.svg
             :target: /README_en.rst
 
-.. |ITMO| image:: https://github.com/aimclub/open-source-ops/blob/add_badge/badges/ITMO_badge_rus.svg
+.. |ITMO| image:: https://raw.githubusercontent.com/aimclub/open-source-ops/43bb283758b43d75ec1df0a6bb4ae3eb20066323/badges/ITMO_badge_rus.svg
    :alt: Acknowledgement to ITMO
    :target: https://itmo.ru
 
-.. |SAI| image:: https://github.com/aimclub/open-source-ops/blob/add_badge/badges/SAI_badge.svg
+.. |SAI| image:: https://raw.githubusercontent.com/aimclub/open-source-ops/43bb283758b43d75ec1df0a6bb4ae3eb20066323/badges/SAI_badge.svg
    :alt: Acknowledgement to SAI
    :target: https://sai.itmo.ru/
 
-.. |mirror| image:: https://camo.githubusercontent.com/9bd7b8c5b418f1364e72110a83629772729b29e8f3393b6c86bff237a6b784f6/68747470733a2f2f62616467656e2e6e65742f62616467652f6769746c61622f6d6972726f722f6f72616e67653f69636f6e3d6769746c6162
+.. |mirror| image:: https://img.shields.io/badge/mirror-GitLab-orange
    :alt: GitLab mirror for this repository
-   :target: https://gitlab.actcognitive.org/itmo-nss-team/GOLEM
+   :target: https://gitlab.actcognitive.org/itmo-nss-team/GOLEM
diff --git a/README_en.rst b/README_en.rst
@@ -7,7 +7,7 @@
 
     |sai| |itmo|
 
-    |python| |pypi| |build| |integration| |docs| |license| |tg| |rus| |mirror|
+    |python| |pypi| |build| |integration| |coverage| |docs| |license| |tg| |rus| |mirror|
 
 
 Graph Optimization and Learning by Evolutionary Methods
@@ -105,6 +105,14 @@ Following example demonstrates graph search using reference graph & edit distanc
         return found_graph
 
 
+Tracing the lineage of ``found_graph`` reveals how genetic operators (mutations, crossovers, etc.) are applied to a random graph one after another, eventually leading to the target graph:
+
+.. image:: /docs/source/img/evolution_process.gif
+   :alt: Evolution process
+   :align: center
+
+One can also notice that, although the edit distance generally decreases along the genealogical path, the optimizer sometimes sacrifices a local fitness gain of some graphs in order to maintain diversity and thus obtain the best possible solution in the end.
+
 Project Structure
 =================
 
@@ -223,16 +231,16 @@ There are various cases solved with GOLEM's algorithms:
    :alt: Powered by GOLEM
 
 .. |rus| image:: https://img.shields.io/badge/lang-ru-yellow.svg
-            :target: /README.rst
+   :target: /README.rst
 
-.. |ITMO| image:: https://github.com/aimclub/open-source-ops/blob/add_badge/badges/ITMO_badge.svg
+.. |ITMO| image:: https://raw.githubusercontent.com/aimclub/open-source-ops/43bb283758b43d75ec1df0a6bb4ae3eb20066323/badges/ITMO_badge.svg
    :alt: Acknowledgement to ITMO
    :target: https://en.itmo.ru/en/
 
-.. |SAI| image:: https://github.com/aimclub/open-source-ops/blob/add_badge/badges/SAI_badge.svg
+.. |SAI| image:: https://raw.githubusercontent.com/aimclub/open-source-ops/43bb283758b43d75ec1df0a6bb4ae3eb20066323/badges/SAI_badge.svg
    :alt: Acknowledgement to SAI
    :target: https://sai.itmo.ru/
 
-.. |mirror| image:: https://camo.githubusercontent.com/9bd7b8c5b418f1364e72110a83629772729b29e8f3393b6c86bff237a6b784f6/68747470733a2f2f62616467656e2e6e65742f62616467652f6769746c61622f6d6972726f722f6f72616e67653f69636f6e3d6769746c6162
+.. |mirror| image:: https://img.shields.io/badge/mirror-GitLab-orange
    :alt: GitLab mirror for this repository
-   :target: https://gitlab.actcognitive.org/itmo-nss-team/GOLEM
+   :target: https://gitlab.actcognitive.org/itmo-nss-team/GOLEM
diff --git a/docs/source/api/tuning.rst b/docs/source/api/tuning.rst
@@ -52,11 +52,6 @@ You can tune all parameters of graph nodes simultaneously using ``SimultaneousTu
 .. note::
    ``IOptTuner`` implements a deterministic algorithm.
 
-   For now ``IOptTuner`` can not be constrained by time, so constrain execution by number of iterations.
-
-   Also ``IOptTuner`` can optimise only `continuous` and `discrete` parameters but not `categorical` ones.
-   `Categorical` parameters will be ignored while tuning.
-
    ``IOptTuner`` is implemented using `IOpt library`_. See the `documentation`_ (in Russian) to learn more about
    the optimisation algorithm.
 

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -24,7 +24,7 @@
 author = 'NSS Lab'
 
 # The full version, including alpha/beta/rc tags
-release = '0.3.3'
+release = '0.4.0'
 # -- General configuration ---------------------------------------------------
 
 # Add any Sphinx extension module names here, as strings. They can be

diff --git a/docs/source/img/evolution_process.gif b/docs/source/img/evolution_process.gif
diff --git a/examples/molecule_search/experiment.py b/examples/molecule_search/experiment.py
@@ -16,6 +16,8 @@
     normalized_logp, CLScorer
 from golem.core.dag.verification_rules import has_no_self_cycled_nodes, has_no_isolated_components, \
     has_no_isolated_nodes
+from golem.core.optimisers.adaptive.agent_trainer import AgentTrainer
+from golem.core.optimisers.adaptive.history_collector import HistoryReader
 from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
 from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer
 from golem.core.optimisers.genetic.gp_params import GPAlgorithmParameters
@@ -25,6 +27,7 @@
 from golem.core.optimisers.objective import Objective
 from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
 from golem.core.optimisers.optimizer import GraphGenerationParams, GraphOptimizer
+from golem.core.paths import project_root
 from golem.visualisation.opt_history.multiple_fitness_line import MultipleFitnessLines
 from golem.visualisation.opt_viz_extra import visualise_pareto
 
@@ -129,6 +132,16 @@ def visualize_results(molecules: Iterable[MolGraph],
         image.show()
 
 
+def pretrain_agent(optimizer: EvoGraphOptimizer, objective: Objective, results_dir: str) -> AgentTrainer:
+    agent = optimizer.mutation.agent
+    trainer = AgentTrainer(objective, optimizer.mutation, agent)
+    # load histories
+    history_reader = HistoryReader(Path(results_dir))
+    # train agent
+    trainer.fit(histories=history_reader.load_histories(), validate_each=1)
+    return trainer
+
+
 def run_experiment(optimizer_setup: Callable,
                    optimizer_cls: Type[GraphOptimizer] = EvoGraphOptimizer,
                    adaptive_kind: MutationAgentTypeEnum = MutationAgentTypeEnum.random,
@@ -143,13 +156,14 @@ def run_experiment(optimizer_setup: Callable,
                    trial_iterations: Optional[int] = None,
                    visualize: bool = False,
                    save_history: bool = True,
+                   pretrain_dir: Optional[str] = None,
                    ):
+    metrics = metrics or ['qed_score']
     optimizer_id = optimizer_cls.__name__.lower()[:3]
     experiment_id = f'Experiment [optimizer={optimizer_id} metrics={", ".join(metrics)} pop_size={pop_size}]'
     exp_name = f'{optimizer_id}_{adaptive_kind.value}_popsize{pop_size}_min{trial_timeout}_{"_".join(metrics)}'
 
     atom_types = atom_types or ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br']
-    metrics = metrics or ['qed_score']
     trial_results = []
     trial_histories = []
     trial_timedelta = timedelta(minutes=trial_timeout) if trial_timeout else None
@@ -165,6 +179,9 @@ def run_experiment(optimizer_setup: Callable,
                                                pop_size,
                                                metrics,
                                                initial_molecules)
+        if pretrain_dir:
+            pretrain_agent(optimizer, objective, pretrain_dir)
+
         found_graphs = optimizer.optimise(objective)
         history = optimizer.history
 
@@ -208,10 +225,11 @@ def plot_experiment_comparison(experiment_ids: Sequence[str], metric_id: int = 0
 
 if __name__ == '__main__':
     run_experiment(molecule_search_setup,
-                   adaptive_kind=MutationAgentTypeEnum.random,
+                   adaptive_kind=MutationAgentTypeEnum.bandit,
                    max_heavy_atoms=38,
-                   trial_timeout=15,
+                   trial_timeout=6,
                    pop_size=50,
-                   metrics=['qed_score', 'cl_score'],
                    visualize=True,
-                   num_trials=5)
+                   num_trials=5,
+                   pretrain_dir=os.path.join(project_root(), 'examples', 'molecule_search', 'histories')
+                   )
diff --git a/examples/molecule_search/experiment_with_api.py b/examples/molecule_search/experiment_with_api.py
@@ -0,0 +1,141 @@
+import os.path
+from datetime import timedelta
+from pathlib import Path
+from typing import Type, Optional, Sequence, List
+
+import numpy as np
+from rdkit.Chem.rdchem import BondType
+
+from examples.molecule_search.experiment import visualize_results, get_methane, get_all_mol_metrics
+from examples.molecule_search.mol_adapter import MolAdapter
+from examples.molecule_search.mol_advisor import MolChangeAdvisor
+from examples.molecule_search.mol_graph import MolGraph
+from examples.molecule_search.mol_mutations import CHEMICAL_MUTATIONS
+from golem.api.main import GOLEM
+from golem.core.dag.verification_rules import has_no_self_cycled_nodes, has_no_isolated_components, \
+    has_no_isolated_nodes
+from golem.core.optimisers.adaptive.operator_agent import MutationAgentTypeEnum
+from golem.core.optimisers.genetic.gp_optimizer import EvoGraphOptimizer
+from golem.core.optimisers.genetic.operators.crossover import CrossoverTypesEnum
+from golem.core.optimisers.genetic.operators.elitism import ElitismTypesEnum
+from golem.core.optimisers.genetic.operators.inheritance import GeneticSchemeTypesEnum
+from golem.core.optimisers.objective import Objective
+from golem.core.optimisers.opt_history_objects.opt_history import OptHistory
+from golem.core.optimisers.optimizer import GraphOptimizer
+from golem.core.paths import project_root
+from golem.visualisation.opt_history.multiple_fitness_line import MultipleFitnessLines
+
+
+def run_experiment(optimizer_cls: Type[GraphOptimizer] = EvoGraphOptimizer,
+                   adaptive_kind: MutationAgentTypeEnum = MutationAgentTypeEnum.random,
+                   max_heavy_atoms: int = 50,
+                   atom_types: Optional[List[str]] = None,
+                   bond_types: Sequence[BondType] = (BondType.SINGLE, BondType.DOUBLE, BondType.TRIPLE),
+                   initial_molecules: Optional[Sequence[MolGraph]] = None,
+                   pop_size: int = 20,
+                   metrics: Optional[List[str]] = None,
+                   num_trials: int = 1,
+                   trial_timeout: Optional[int] = None,
+                   trial_iterations: Optional[int] = None,
+                   visualize: bool = False,
+                   save_history: bool = True,
+                   pretrain_dir: Optional[str] = None,
+                   ):
+    metrics = metrics or ['qed_score']
+    optimizer_id = optimizer_cls.__name__.lower()[:3]
+    experiment_id = f'Experiment [optimizer={optimizer_id} metrics={", ".join(metrics)} pop_size={pop_size}]'
+    exp_name = f'{optimizer_id}_{adaptive_kind.value}_popsize{pop_size}_min{trial_timeout}_{"_".join(metrics)}'
+
+    atom_types = atom_types or ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br']
+    trial_results = []
+    trial_histories = []
+    trial_timedelta = timedelta(minutes=trial_timeout) if trial_timeout else None
+    all_metrics = get_all_mol_metrics()
+    objective = Objective(
+        quality_metrics={metric_name: all_metrics[metric_name] for metric_name in metrics},
+        is_multi_objective=len(metrics) > 1
+    )
+
+    for trial in range(num_trials):
+
+        metrics = metrics or ['qed_score']
+
+        initial_graphs = initial_molecules or [get_methane()]
+        initial_graphs = MolAdapter().adapt(initial_graphs)
+        golem = GOLEM(
+            n_jobs=1,
+            timeout=trial_timedelta,
+            objective=objective,
+            optimizer=optimizer_cls,
+            initial_graphs=initial_graphs,
+            pop_size=pop_size,
+            max_pop_size=pop_size,
+            multi_objective=True,
+            genetic_scheme_type=GeneticSchemeTypesEnum.steady_state,
+            elitism_type=ElitismTypesEnum.replace_worst,
+            mutation_types=CHEMICAL_MUTATIONS,
+            crossover_types=[CrossoverTypesEnum.none],
+            adaptive_mutation_type=adaptive_kind,
+            adapter=MolAdapter(),
+            rules_for_constraint=[has_no_self_cycled_nodes, has_no_isolated_components, has_no_isolated_nodes],
+            advisor=MolChangeAdvisor(),
+            max_heavy_atoms=max_heavy_atoms,
+            available_atom_types=atom_types or ['C', 'N', 'O', 'F', 'P', 'S', 'Cl', 'Br'],
+            bond_types=bond_types,
+            early_stopping_timeout=np.inf,
+            early_stopping_iterations=np.inf,
+            keep_n_best=4,
+            num_of_generations=trial_iterations,
+            keep_history=True,
+            history_dir=None,
+        )
+        found_graphs = golem.optimise()
+        history = golem.optimiser.history
+
+        if visualize:
+            molecules = [MolAdapter().restore(graph) for graph in found_graphs]
+            save_dir = Path('visualisations') / exp_name / f'trial_{trial}'
+            visualize_results(set(molecules), objective, history, save_dir)
+        if save_history:
+            result_dir = Path('results') / exp_name
+            result_dir.mkdir(parents=True, exist_ok=True)
+            history.save(result_dir / f'history_trial_{trial}.json')
+        trial_results.extend(history.final_choices)
+        trial_histories.append(history)
+
+    # Compute mean & std for metrics of trials
+    ff = objective.format_fitness
+    trial_metrics = np.array([ind.fitness.values for ind in trial_results])
+    trial_metrics_mean = trial_metrics.mean(axis=0)
+    trial_metrics_std = trial_metrics.std(axis=0)
+    print(f'Experiment {experiment_id}\n'
+          f'finished with metrics:\n'
+          f'mean={ff(trial_metrics_mean)}\n'
+          f' std={ff(trial_metrics_std)}')
+
+
+def plot_experiment_comparison(experiment_ids: Sequence[str], metric_id: int = 0, results_dir='./results'):
+    root = Path(results_dir)
+    histories = {}
+    for exp_name in experiment_ids:
+        trials = []
+        for history_filename in os.listdir(root / exp_name):
+            if history_filename.startswith('history'):
+                history = OptHistory.load(root / exp_name / history_filename)
+                trials.append(history)
+        histories[exp_name] = trials
+        print(f'Loaded {len(trials)} trial histories for experiment: {exp_name}')
+    # Visualize
+    MultipleFitnessLines.from_histories(histories).visualize(metric_id=metric_id)
+    return histories
+
+
+if __name__ == '__main__':
+    run_experiment(adaptive_kind=MutationAgentTypeEnum.bandit,
+                   max_heavy_atoms=38,
+                   trial_timeout=6,
+                   pop_size=50,
+                   visualize=True,
+                   num_trials=5,
+                   pretrain_dir=os.path.join(project_root(), 'examples', 'molecule_search', 'histories')
+                   )
diff --git a/examples/molecule_search/mol_adapter.py b/examples/molecule_search/mol_adapter.py
@@ -17,8 +17,10 @@ def __init__(self):
 
     def _restore(self, opt_graph: OptGraph, metadata: Optional[Dict[str, Any]] = None) -> MolGraph:
         digraph = self.nx_adapter.restore(opt_graph)
-        # return to previous node indexing
-        digraph = nx.relabel_nodes(digraph, dict(digraph.nodes(data='nxid')))
+        # to ensure backward compatibility with old individuals without 'nxid' field in nodes
+        if not any(x is None for x in list(dict(digraph.nodes(data='nxid')).values())):
+            # return to previous node indexing
+            digraph = nx.relabel_nodes(digraph, dict(digraph.nodes(data='nxid')))
         digraph = restore_edges_params_from_nodes(digraph)
         nx_graph = digraph.to_undirected()
         mol_graph = MolGraph.from_nx_graph(nx_graph)
@@ -50,7 +52,11 @@ def restore_edges_params_from_nodes(graph: nx.DiGraph) -> nx.DiGraph:
     all_edges_params = {}
     for node in graph.nodes():
         for predecessor in graph.predecessors(node):
-            edge_params = edge_params_by_node[node][predecessor]
-            all_edges_params.update({(predecessor, node): edge_params})
+            node_params = edge_params_by_node[node]
+            # for some unknown reason, some nodes are encoded as int and some as str;
+            # possibly deserialization is messing this up somewhere.
+            edge_params = node_params.get(predecessor) or node_params.get(str(predecessor))
+            if edge_params:
+                all_edges_params[(predecessor, node)] = edge_params
     nx.set_edge_attributes(graph, all_edges_params)
     return graph