From bdf21dc43b6045c5d363893404f72a18e6a8d64a Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 20 Jun 2022 15:16:07 -0700 Subject: [PATCH 01/50] bump version --- nequip/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/_version.py b/nequip/_version.py index 91faf40b..b02164d2 100644 --- a/nequip/_version.py +++ b/nequip/_version.py @@ -2,4 +2,4 @@ # See Python packaging guide # https://packaging.python.org/guides/single-sourcing-package-version/ -__version__ = "0.5.5" +__version__ = "0.5.6" From fd3a3f4bd2894757100833b6a900505bc642f481 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 20 Jun 2022 15:46:04 -0700 Subject: [PATCH 02/50] fix CI versions --- .github/workflows/tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index dea57ca1..b93d1ccd 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -15,8 +15,8 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.9] - torch-version: [1.10.0, 1.11.0] + python-version: [3.9] + torch-version: [1.10.1, 1.11.0] steps: - uses: actions/checkout@v2 From 2a444e376fb1291a99dc272d92ea63562a146c8a Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 6 Sep 2022 17:56:15 -0400 Subject: [PATCH 03/50] Report num (trainable) weights --- CHANGELOG.md | 2 ++ nequip/scripts/benchmark.py | 4 ++++ nequip/train/trainer.py | 3 +++ 3 files changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f13a642..4d8bbe93 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ Most recent change on the bottom. 
## [Unreleased] - 0.5.6 +### Added +- `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights ## [0.5.5] - 2022-06-20 ### Added diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 579e60ea..e1ac63a9 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -126,6 +126,10 @@ def main(args=None): model = model_from_config(config, initialize=True, dataset=dataset) model_time = time.time() - model_time print(f" building model took {model_time:.4f}s") + print(f" model has {sum(p.numel() for p in model.parameters())} weights") + print( + f" model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" + ) print("Compile...") # "Deploy" it model.eval() diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 7ce4d4e1..8a8f28ed 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -706,6 +706,9 @@ def init(self): self.num_weights = sum(p.numel() for p in self.model.parameters()) self.logger.info(f"Number of weights: {self.num_weights}") + self.logger.info( + f"Number of trainable weights: {sum(p.numel() for p in self.model.parameters() if p.requires_grad)}" + ) self.rescale_layers = [] outer_layer = self.model From b827bba56fd12e46cf14e581f37ac7e7a0b0d0c6 Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Mon, 12 Sep 2022 23:55:03 +0100 Subject: [PATCH 04/50] make dataset an optional argument for rescaling builder --- nequip/model/_scaling.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/nequip/model/_scaling.py b/nequip/model/_scaling.py index f5554d25..b3f0f4d0 100644 --- a/nequip/model/_scaling.py +++ b/nequip/model/_scaling.py @@ -12,7 +12,7 @@ def RescaleEnergyEtc( - model: GraphModuleMixin, config, dataset: AtomicDataset, initialize: bool + model: GraphModuleMixin, config, initialize: bool, dataset: Optional[AtomicDataset] = None ): return GlobalRescale( model=model, @@ -34,7 
+34,6 @@ def RescaleEnergyEtc( def GlobalRescale( model: GraphModuleMixin, config, - dataset: AtomicDataset, initialize: bool, module_prefix: str, default_scale: Union[str, float, list], @@ -43,6 +42,7 @@ def GlobalRescale( default_shift_keys: list, default_related_scale_keys: list, default_related_shift_keys: list, + dataset: Optional[AtomicDataset] = None, ): """Add global rescaling for energy(-based quantities). @@ -75,11 +75,12 @@ def GlobalRescale( raise ValueError(f"Invalid global scale `{value}`") # = Compute shifts and scales = - computed_stats = _compute_stats( - str_names=str_names, - dataset=dataset, - stride=config.dataset_statistics_stride, - ) + if len(str_names) > 0: + computed_stats = _compute_stats( + str_names=str_names, + dataset=dataset, + stride=config.dataset_statistics_stride, + ) if isinstance(global_scale, str): s = global_scale @@ -129,8 +130,8 @@ def GlobalRescale( def PerSpeciesRescale( model: GraphModuleMixin, config, - dataset: AtomicDataset, initialize: bool, + dataset: Optional[AtomicDataset] = None, ): """Add global rescaling for energy(-based quantities). @@ -199,12 +200,13 @@ def PerSpeciesRescale( ], "Requested to set either the shifts or scales of the per_species_rescale using dataset values, but chose to provide the other in non-dataset units. 
Please give the explictly specified shifts/scales in dataset units and set per_species_rescale_arguments_in_dataset_units" # = Compute shifts and scales = - computed_stats = _compute_stats( - str_names=str_names, - dataset=dataset, - stride=config.dataset_statistics_stride, - kwargs=config.get(module_prefix + "_kwargs", {}), - ) + if len(str_names) > 0: + computed_stats = _compute_stats( + str_names=str_names, + dataset=dataset, + stride=config.dataset_statistics_stride, + kwargs=config.get(module_prefix + "_kwargs", {}), + ) if isinstance(scales, str): s = scales From 8ad52e7fdf2f4d429e943ba69c8306dd06ede623 Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Mon, 12 Sep 2022 23:57:32 +0100 Subject: [PATCH 05/50] black --- nequip/model/_scaling.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nequip/model/_scaling.py b/nequip/model/_scaling.py index b3f0f4d0..8a7ffa46 100644 --- a/nequip/model/_scaling.py +++ b/nequip/model/_scaling.py @@ -12,7 +12,10 @@ def RescaleEnergyEtc( - model: GraphModuleMixin, config, initialize: bool, dataset: Optional[AtomicDataset] = None + model: GraphModuleMixin, + config, + initialize: bool, + dataset: Optional[AtomicDataset] = None, ): return GlobalRescale( model=model, From 4b2ef8132e46cb9d5c9828160b3477f821da6149 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 13 Sep 2022 14:13:38 -0400 Subject: [PATCH 06/50] document `model_builders` in `full.yaml` --- configs/full.yaml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/configs/full.yaml b/configs/full.yaml index 1b8a3a2c..2a44d981 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -17,7 +17,19 @@ default_dtype: float32 allow_tf32: false # whether to use TensorFloat32 if it is available # device: cuda # which device to use. 
Default: automatically detected cuda or "cpu" -# network +# == network == + +# `model_builders` defines a series of functions that will be called to construct the model +# each model builder has the opportunity to update the model, the config, or both +# model builders from other packages are allowed (see mir-group/allegro for an example); those from `nequip.model` don't require a prefix +# these are the default model builders: +model_builders: + - SimpleIrrepsConfig # update the config with all the irreps for the network if using the simplified `l_max` / `num_features` / `parity` syntax + - EnergyModel # build a full NequIP model + - PerSpeciesRescale # add per-atom / per-species scaling and shifting to the NequIP model before the total energy sum + - ForceOutput # wrap the energy model in a module that uses autodifferention to compute the forces + - RescaleEnergyEtc # wrap the entire model in the appropriate global rescaling of the energy, forces, etc. + r_max: 4.0 # cutoff radius in length units, here Angstrom, this is an important hyperparamter to scan num_layers: 4 # number of interaction blocks, we find 3-5 to work best From 086d72422b4e15959fea1da54f8a2765dd91d655 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Sep 2022 23:07:22 -0400 Subject: [PATCH 07/50] better message --- nequip/scripts/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index d67d750f..7c4e2abc 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -74,7 +74,7 @@ def main(args=None, running_as_script: bool = True): ) parser.add_argument( "--batch-size", - help="Batch size to use. Larger is usually faster on GPU. If you run out of memory, lower this.", + help="Batch size to use. Larger is usually faster on GPU. If you run out of memory, lower this. You can also try to raise this for faster evaluation. 
Default: 50.", type=int, default=50, ) From 7da5b61d8b68194666e2952450fddbbcf5ba783b Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 17 Oct 2022 13:34:57 -0400 Subject: [PATCH 08/50] print dataset size --- nequip/data/dataset.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index 847b3795..2b2279d9 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -295,7 +295,13 @@ def process(self): # type conversion _process_dict(fixed_fields, ignore_fields=["r_max"]) - logging.info(f"Loaded data: {data}") + total_MBs = sum(item.numel() * item.element_size() for _, item in data) / ( + 1024 * 1024 + ) + logging.info( + f"Loaded data: {data}\n processed data size: ~{total_MBs:.2f} MB" + ) + del total_MBs # use atomic writes to avoid race conditions between # different trainings that use the same dataset From 17cb6f201f3a4e821627d0f8365664a2d548935c Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 17 Oct 2022 13:54:28 -0400 Subject: [PATCH 09/50] docs note --- configs/full.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/configs/full.yaml b/configs/full.yaml index 2a44d981..3c3e8984 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -29,6 +29,7 @@ model_builders: - PerSpeciesRescale # add per-atom / per-species scaling and shifting to the NequIP model before the total energy sum - ForceOutput # wrap the energy model in a module that uses autodifferention to compute the forces - RescaleEnergyEtc # wrap the entire model in the appropriate global rescaling of the energy, forces, etc. +# ^ global rescaling blocks must always go last! 
r_max: 4.0 # cutoff radius in length units, here Angstrom, this is an important hyperparamter to scan num_layers: 4 # number of interaction blocks, we find 3-5 to work best From da3e4bdf7ce9a25a1427f0efd15504dee71964df Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 6 Nov 2022 11:54:38 -0500 Subject: [PATCH 10/50] better error message --- nequip/data/AtomicData.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/data/AtomicData.py b/nequip/data/AtomicData.py index 3f2e348b..f5b8339a 100644 --- a/nequip/data/AtomicData.py +++ b/nequip/data/AtomicData.py @@ -773,7 +773,7 @@ def neighbor_list_and_relative_vec( keep_edge = ~bad_edge if not np.any(keep_edge): raise ValueError( - "After eliminating self edges, no edges remain in this system." + f"Every single atom has no neighbors within the cutoff r_max={r_max} (after eliminating self edges, no edges remain in this system)" ) first_idex = first_idex[keep_edge] second_idex = second_idex[keep_edge] From bb2a1204c972cca9daed467a56236dc54aaf677b Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 7 Nov 2022 16:08:56 -0500 Subject: [PATCH 11/50] Fix error message typo --- nequip/train/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nequip/train/trainer.py b/nequip/train/trainer.py index 8a8f28ed..55efec32 100644 --- a/nequip/train/trainer.py +++ b/nequip/train/trainer.py @@ -1180,7 +1180,9 @@ def set_dataset( if self.n_train > len(dataset): raise ValueError("Not enough data in dataset for requested n_train") if self.n_val > len(validation_dataset): - raise ValueError("Not enough data in dataset for requested n_train") + raise ValueError( + "Not enough data in validation dataset for requested n_val" + ) if self.train_val_split == "random": self.train_idcs = torch.randperm( len(dataset), generator=self.dataset_rng From d090f3224b0150cd878dde62d6a03e8c01033f0c 
Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 7 Nov 2022 16:18:39 -0500 Subject: [PATCH 12/50] avoid running `git` on `.egg` --- CHANGELOG.md | 3 +++ nequip/utils/git.py | 9 ++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4d8bbe93..85e07593 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,9 @@ Most recent change on the bottom. ### Added - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights +### Fixed +- Git utilities when installed as ZIPed `.egg` (#264) + ## [0.5.5] - 2022-06-20 ### Added - BETA! Support for stress in training and inference diff --git a/nequip/utils/git.py b/nequip/utils/git.py index a78a87fc..a5fbe7f3 100644 --- a/nequip/utils/git.py +++ b/nequip/utils/git.py @@ -8,7 +8,14 @@ def get_commit(module: str) -> Optional[str]: module = import_module(module) - path = str(Path(module.__file__).parents[0] / "..") + package = Path(module.__file__).parents[0] + if package.is_file(): + # We're installed as a ZIP .egg file, + # which means there's no git information + # and looking for the parent would fail anyway + # https://github.com/mir-group/nequip/issues/264 + return None + path = str(package / "..") retcode = subprocess.run( "git show --oneline --abbrev=40 -s".split(), From 144139f5d527ee7eab6ddeddf8fcc91b18a65a8f Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 14 Nov 2022 23:42:33 -0500 Subject: [PATCH 13/50] always output something for stress --- nequip/nn/_grad_output.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/nn/_grad_output.py b/nequip/nn/_grad_output.py index ffc13140..673f8ff0 100644 --- a/nequip/nn/_grad_output.py +++ b/nequip/nn/_grad_output.py @@ -315,10 +315,10 @@ def forward(self, data: AtomicDataDict.Type) -> AtomicDataDict.Type: torch.cross(cell[:, 1, :], cell[:, 2, :], dim=1), 
).unsqueeze(-1) stress = virial / volume.view(-1, 1, 1) - data[AtomicDataDict.STRESS_KEY] = stress data[AtomicDataDict.CELL_KEY] = orig_cell else: stress = self._empty # torchscript + data[AtomicDataDict.STRESS_KEY] = stress # see discussion in https://github.com/libAtoms/QUIP/issues/227 about sign convention # they say the standard convention is virial = -stress x volume From e644c0d46cf7bb0b206ebda5de1fba4ac2a71494 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 14 Nov 2022 23:46:27 -0500 Subject: [PATCH 14/50] add EDGE_FEATURES --- nequip/data/AtomicData.py | 1 + nequip/data/_keys.py | 1 + 2 files changed, 2 insertions(+) diff --git a/nequip/data/AtomicData.py b/nequip/data/AtomicData.py index f5b8339a..728c260b 100644 --- a/nequip/data/AtomicData.py +++ b/nequip/data/AtomicData.py @@ -48,6 +48,7 @@ AtomicDataDict.EDGE_LENGTH_KEY, AtomicDataDict.EDGE_ATTRS_KEY, AtomicDataDict.EDGE_EMBEDDING_KEY, + AtomicDataDict.EDGE_FEATURES_KEY, } _DEFAULT_GRAPH_FIELDS: Set[str] = { AtomicDataDict.TOTAL_ENERGY_KEY, diff --git a/nequip/data/_keys.py b/nequip/data/_keys.py index c0535edd..54b66ce3 100644 --- a/nequip/data/_keys.py +++ b/nequip/data/_keys.py @@ -44,6 +44,7 @@ EDGE_ATTRS_KEY: Final[str] = "edge_attrs" # [n_edge, dim] invariant embedding of the edges EDGE_EMBEDDING_KEY: Final[str] = "edge_embedding" +EDGE_FEATURES_KEY: Final[str] = "edge_features" NODE_FEATURES_KEY: Final[str] = "node_features" NODE_ATTRS_KEY: Final[str] = "node_attrs" From 3d44dcc9fd9b5c9db5b82c06b3cdb3c75310d768 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 00:06:57 -0500 Subject: [PATCH 15/50] allow type mapper for GPU also --- nequip/data/transforms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/nequip/data/transforms.py b/nequip/data/transforms.py index f2c7ec32..5a37e2be 100644 --- a/nequip/data/transforms.py +++ 
b/nequip/data/transforms.py @@ -121,11 +121,13 @@ def transform(self, atomic_numbers): f"Data included atomic numbers {bad_set} that are not part of the atomic number -> type mapping!" ) - return self._Z_to_index[atomic_numbers - self._min_Z] + return self._Z_to_index[atomic_numbers - self._min_Z].to( + device=atomic_numbers.device + ) def untransform(self, atom_types): """Transform atom types back into atomic numbers""" - return self._index_to_Z[atom_types] + return self._index_to_Z[atom_types].to(device=atom_types.device) @property def has_chemical_symbols(self) -> bool: From c1096e842807e9330fb87a490326f0fa5a6bf1af Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 00:07:37 -0500 Subject: [PATCH 16/50] refactor unit tests --- nequip/utils/unittests/__init__.py | 3 + nequip/utils/unittests/conftest.py | 137 +++++ .../utils/unittests/model_tests.py | 536 ++++++++---------- tests/conftest.py | 140 +---- tests/unit/model/test_nequip_model.py | 118 ++++ 5 files changed, 486 insertions(+), 448 deletions(-) create mode 100644 nequip/utils/unittests/__init__.py create mode 100644 nequip/utils/unittests/conftest.py rename tests/unit/model/test_eng_force.py => nequip/utils/unittests/model_tests.py (58%) create mode 100644 tests/unit/model/test_nequip_model.py diff --git a/nequip/utils/unittests/__init__.py b/nequip/utils/unittests/__init__.py new file mode 100644 index 00000000..2309cb02 --- /dev/null +++ b/nequip/utils/unittests/__init__.py @@ -0,0 +1,3 @@ +import pathlib + +CONFTEST_PATH = pathlib.Path(__file__).parent / "conftest.py" diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py new file mode 100644 index 00000000..060e5e7b --- /dev/null +++ b/nequip/utils/unittests/conftest.py @@ -0,0 +1,137 @@ +from typing import List, Tuple +import numpy as np +import pathlib +import pytest +import tempfile +import os + +from ase.atoms import Atoms +from ase.build import 
molecule +from ase.calculators.singlepoint import SinglePointCalculator +from ase.io import write + +import torch + +from nequip.utils.test import set_irreps_debug +from nequip.data import AtomicData, ASEDataset +from nequip.data.transforms import TypeMapper +from nequip.utils.torch_geometric import Batch +from nequip.utils._global_options import _set_global_options +from nequip.utils.misc import dtype_from_name + +if "NEQUIP_NUM_TASKS" not in os.environ: + # Test parallelization, but don't waste time spawning tons of workers if lots of cores available + os.environ["NEQUIP_NUM_TASKS"] = "2" + +# The default float tolerance +FLOAT_TOLERANCE = { + t: torch.as_tensor(v, dtype=dtype_from_name(t)) + for t, v in {"float32": 1e-3, "float64": 1e-10}.items() +} + + +@pytest.fixture(scope="session", autouse=True, params=["float32", "float64"]) +def float_tolerance(request): + """Run all tests with various PyTorch default dtypes. + + This is a session-wide, autouse fixture — you only need to request it explicitly if a test needs to know the tolerance for the current default dtype. + + Returns + -------- + A precision threshold to use for closeness tests. + """ + old_dtype = torch.get_default_dtype() + dtype = request.param + _set_global_options({"default_dtype": dtype}) + yield FLOAT_TOLERANCE[dtype] + _set_global_options( + { + "default_dtype": {torch.float32: "float32", torch.float64: "float64"}[ + old_dtype + ] + } + ) + + +# - Ampere and TF32 - +# Many of the tests for NequIP involve numerically checking +# algebraic properties— normalization, equivariance, +# continuity, etc. +# With the added numerical noise of TF32, some of those tests fail +# with the current (and usually generous) thresholds. +# +# Thus we go on the assumption that PyTorch + NVIDIA got everything +# right, that this setting DOES NOT AFFECT the model outputs except +# for increased numerical noise, and only test without it. 
+# +# TODO: consider running tests with and without +# TODO: check how much thresholds have to be changed to accomidate TF32 +torch.backends.cuda.matmul.allow_tf32 = False +torch.backends.cudnn.allow_tf32 = False + + +@pytest.fixture(scope="session") +def BENCHMARK_ROOT(): + return pathlib.Path(__file__).parent / "../benchmark_data/" + + +@pytest.fixture(scope="session") +def temp_data(float_tolerance): + with tempfile.TemporaryDirectory() as tmpdirname: + yield tmpdirname + + +@pytest.fixture(scope="session") +def CH3CHO(CH3CHO_no_typemap) -> Tuple[Atoms, AtomicData]: + atoms, data = CH3CHO_no_typemap + tm = TypeMapper(chemical_symbol_to_type={"C": 0, "O": 1, "H": 2}) + data = tm(data) + return atoms, data + + +@pytest.fixture(scope="session") +def CH3CHO_no_typemap(float_tolerance) -> Tuple[Atoms, AtomicData]: + atoms = molecule("CH3CHO") + data = AtomicData.from_ase(atoms, r_max=2.0) + return atoms, data + + +@pytest.fixture(scope="session") +def molecules() -> List[Atoms]: + atoms_list = [] + for i in range(8): + atoms = molecule("CH3CHO" if i % 2 == 0 else "H2") + atoms.rattle() + atoms.calc = SinglePointCalculator( + energy=np.random.random(), + forces=np.random.random((len(atoms), 3)), + stress=None, + magmoms=None, + atoms=atoms, + ) + atoms_list.append(atoms) + return atoms_list + + +@pytest.fixture(scope="session") +def nequip_dataset(molecules, temp_data, float_tolerance): + with tempfile.NamedTemporaryFile(suffix=".xyz") as fp: + for atoms in molecules: + write(fp.name, atoms, format="extxyz", append=True) + a = ASEDataset( + file_name=fp.name, + root=temp_data, + extra_fixed_fields={"r_max": 3.0}, + ase_args=dict(format="extxyz"), + type_mapper=TypeMapper(chemical_symbol_to_type={"H": 0, "C": 1, "O": 2}), + ) + yield a + + +@pytest.fixture(scope="session") +def atomic_batch(nequip_dataset): + return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) + + +# Use debug mode +set_irreps_debug(True) diff --git a/tests/unit/model/test_eng_force.py 
b/nequip/utils/unittests/model_tests.py similarity index 58% rename from tests/unit/model/test_eng_force.py rename to nequip/utils/unittests/model_tests.py index 0adcd4c9..372571d4 100644 --- a/tests/unit/model/test_eng_force.py +++ b/nequip/utils/unittests/model_tests.py @@ -1,150 +1,81 @@ import pytest -import logging import tempfile import functools import torch import numpy as np -from e3nn import o3 from e3nn.util.jit import script -from nequip.data import AtomicDataDict, AtomicData, Collater +from nequip.data import ( + AtomicDataDict, + AtomicData, + Collater, + _GRAPH_FIELDS, + _NODE_FIELDS, +) from nequip.data.transforms import TypeMapper -from nequip.model import model_from_config, uniform_initialize_FCs -from nequip.nn import GraphModuleMixin, AtomwiseLinear +from nequip.model import model_from_config +from nequip.nn import GraphModuleMixin from nequip.utils.test import assert_AtomicData_equivariant -logging.basicConfig(level=logging.DEBUG) - -COMMON_CONFIG = { - "num_types": 3, - "types_names": ["H", "C", "O"], - "avg_num_neighbors": None, -} -r_max = 3 -minimal_config1 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - feature_irreps_hidden="4x0e + 4x1o", - num_layers=2, - num_basis=8, - PolynomialCutoff_p=6, - nonlinearity_type="norm", - **COMMON_CONFIG -) -minimal_config2 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - chemical_embedding_irreps_out="8x0e + 8x0o + 8x1e + 8x1o", - irreps_mid_output_block="2x0e", - feature_irreps_hidden="4x0e + 4x1o", - **COMMON_CONFIG -) -minimal_config3 = dict( - irreps_edge_sh="0e + 1o", - r_max=4, - feature_irreps_hidden="4x0e + 4x1o", - num_layers=2, - num_basis=8, - PolynomialCutoff_p=6, - nonlinearity_type="gate", - **COMMON_CONFIG -) -minimal_config4 = dict( - irreps_edge_sh="0e + 1o + 2e", - r_max=4, - feature_irreps_hidden="2x0e + 2x1o + 2x2e", - num_layers=2, - num_basis=3, - PolynomialCutoff_p=6, - nonlinearity_type="gate", - # test custom nonlinearities - nonlinearity_scalars={"e": "silu", "o": "tanh"}, - 
nonlinearity_gates={"e": "silu", "o": "abs"}, - **COMMON_CONFIG -) - - -@pytest.fixture( - scope="module", - params=[minimal_config1, minimal_config2, minimal_config3, minimal_config4], -) -def config(request): - return request.param +# see https://github.com/pytest-dev/pytest/issues/421#issuecomment-943386533 +# to allow external packages to import tests through subclassing +class BaseModelTests: + @pytest.fixture(scope="class") + def config(self): + """Implemented by subclasses. + Return a tuple of config, out_field + """ + raise NotImplementedError -@pytest.fixture( - params=[ - ( - ["EnergyModel", "ForceOutput"], - AtomicDataDict.FORCE_KEY, + @pytest.fixture( + scope="class", + params=( + [torch.device("cuda"), torch.device("cpu")] + if torch.cuda.is_available() + else [torch.device("cpu")] ), - ( - ["EnergyModel"], - AtomicDataDict.TOTAL_ENERGY_KEY, - ), - ( - ["EnergyModel", "StressForceOutput"], - AtomicDataDict.STRESS_KEY, - ), - ] -) -def model(request, config): - torch.manual_seed(0) - np.random.seed(0) - builder, out_field = request.param - config = config.copy() - config["model_builders"] = builder - return model_from_config(config), out_field - - -@pytest.fixture( - scope="module", - params=( - [torch.device("cuda"), torch.device("cpu")] - if torch.cuda.is_available() - else [torch.device("cpu")] - ), -) -def device(request): - return request.param - - -class TestWorkflow: - """ - test class methods - """ + ) + def device(self, request): + return request.param + + @pytest.fixture(scope="class") + def model(self, config, device): + config, out_fields = config + torch.manual_seed(0) + np.random.seed(0) + config = config.copy() + config.update( + { + "num_types": 3, + "types_names": ["H", "C", "O"], + } + ) + model = model_from_config(config) + model = model.to(device) + return model, out_fields + # == common tests for all models == def test_init(self, model): instance, _ = model assert isinstance(instance, GraphModuleMixin) - def test_weight_init(self, 
model, atomic_batch, device): - instance, out_field = model - data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) - instance = instance.to(device=device) - - out_orig = instance(data)[out_field] - - instance = uniform_initialize_FCs(instance, initialize=True) - - out_unif = instance(data)[out_field] - assert not torch.allclose(out_orig, out_unif) - def test_jit(self, model, atomic_batch, device): - instance, out_field = model + instance, out_fields = model data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) instance = instance.to(device=device) model_script = script(instance) - assert torch.allclose( - instance(data)[out_field], - model_script(data)[out_field], - atol=1e-6, - ) + for out_field in out_fields: + assert torch.allclose( + instance(data)[out_field], + model_script(data)[out_field], + atol=1e-6, + ) # - Try saving, loading in another process, and running - with tempfile.TemporaryDirectory() as tmpdir: @@ -163,39 +94,25 @@ def test_jit(self, model, atomic_batch, device): torch.float64: 1e-10, }[torch.get_default_dtype()] - assert torch.allclose( - model_script(data)[out_field], - load_model(load_dat)[out_field], - atol=atol, - ) - - def test_submods(self): - config = minimal_config2.copy() - config["model_builders"] = ["EnergyModel"] - model = model_from_config(config=config, initialize=True) - assert isinstance(model.chemical_embedding, AtomwiseLinear) - true_irreps = o3.Irreps(minimal_config2["chemical_embedding_irreps_out"]) - assert ( - model.chemical_embedding.irreps_out[model.chemical_embedding.out_field] - == true_irreps - ) - # Make sure it propagates - assert ( - model.layer0_convnet.irreps_in[model.chemical_embedding.out_field] - == true_irreps - ) + for out_field in out_fields: + assert torch.allclose( + model_script(data)[out_field], + load_model(load_dat)[out_field], + atol=atol, + ) def test_forward(self, model, atomic_batch, device): - instance, out_field = model + instance, out_fields = model 
instance.to(device) data = atomic_batch.to(device) output = instance(AtomicData.to_AtomicDataDict(data)) - assert out_field in output + for out_field in out_fields: + assert out_field in output def test_batch(self, model, atomic_batch, device, float_tolerance): """Confirm that the results for individual examples are the same regardless of whether they are batched.""" allclose = functools.partial(torch.allclose, atol=float_tolerance) - instance, out_field = model + instance, out_fields = model instance.to(device) data = atomic_batch.to(device) data1 = data.get_example(0) @@ -203,33 +120,164 @@ def test_batch(self, model, atomic_batch, device, float_tolerance): output1 = instance(AtomicData.to_AtomicDataDict(data1)) output2 = instance(AtomicData.to_AtomicDataDict(data2)) output = instance(AtomicData.to_AtomicDataDict(data)) - if out_field in (AtomicDataDict.TOTAL_ENERGY_KEY, AtomicDataDict.STRESS_KEY): - assert allclose( - output1[out_field], - output[out_field][0], - ) - assert allclose( - output2[out_field], - output[out_field][1], - ) - elif out_field in (AtomicDataDict.FORCE_KEY,): - assert allclose( - output1[out_field], - output[out_field][output[AtomicDataDict.BATCH_KEY] == 0], - ) - assert allclose( - output2[out_field], - output[out_field][output[AtomicDataDict.BATCH_KEY] == 1], - ) + for out_field in out_fields: + if out_field in _GRAPH_FIELDS: + assert allclose( + output1[out_field], + output[out_field][0], + ) + assert allclose( + output2[out_field], + output[out_field][1], + ) + elif out_field in _NODE_FIELDS: + assert allclose( + output1[out_field], + output[out_field][output[AtomicDataDict.BATCH_KEY] == 0], + ) + assert allclose( + output2[out_field], + output[out_field][output[AtomicDataDict.BATCH_KEY] == 1], + ) + else: + raise NotImplementedError + + def test_equivariance(self, model, atomic_batch, device): + instance, out_fields = model + instance = instance.to(device=device) + atomic_batch = atomic_batch.to(device=device) + 
assert_AtomicData_equivariant(func=instance, data_in=atomic_batch) + + def test_embedding_cutoff(self, model, config, device): + instance, out_fields = model + config, out_fields = config + r_max = config["r_max"] + + # make a synthetic three atom example + data = AtomicData( + atom_types=np.random.choice([0, 1, 2], size=3), + pos=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), + edge_index=np.array([[0, 1, 0, 2], [1, 0, 2, 0]]), + ) + data = data.to(device) + edge_embed = instance(AtomicData.to_AtomicDataDict(data)) + if AtomicDataDict.EDGE_FEATURES_KEY in edge_embed: + key = AtomicDataDict.EDGE_FEATURES_KEY else: - raise NotImplementedError + key = AtomicDataDict.EDGE_EMBEDDING_KEY + edge_embed = edge_embed[key] + data.pos[2, 1] = r_max # put it past the cutoff + edge_embed2 = instance(AtomicData.to_AtomicDataDict(data))[key] + assert torch.allclose(edge_embed[:2], edge_embed2[:2]) + assert edge_embed[2:].abs().sum() > 1e-6 # some nonzero terms + assert torch.allclose(edge_embed2[2:], torch.zeros(1, device=device)) -class TestGradient: - def test_numeric_gradient(self, config, atomic_batch, device, float_tolerance): - config = config.copy() - config["model_builders"] = ["EnergyModel", "ForceOutput"] - model = model_from_config(config=config, initialize=True) + # test gradients + in_dict = AtomicData.to_AtomicDataDict(data) + in_dict[AtomicDataDict.POSITIONS_KEY].requires_grad_(True) + + with torch.autograd.set_detect_anomaly(True): + out = instance(in_dict) + + # is the edge embedding of the cutoff length edge unchanged at the cutoff? + grads = torch.autograd.grad( + outputs=out[key][2:].sum(), + inputs=in_dict[AtomicDataDict.POSITIONS_KEY], + retain_graph=True, + )[0] + assert torch.allclose(grads, torch.zeros(1, device=device)) + + if AtomicDataDict.PER_ATOM_ENERGY_KEY in out: + # are the first two atom's energies unaffected by atom at the cutoff? 
+ grads = torch.autograd.grad( + outputs=out[AtomicDataDict.PER_ATOM_ENERGY_KEY][:2].sum(), + inputs=in_dict[AtomicDataDict.POSITIONS_KEY], + )[0] + print(grads) + # only care about gradient wrt moved atom + assert grads.shape == (3, 3) + assert torch.allclose(grads[2], torch.zeros(1, device=device)) + + +class BaseEnergyModelTests(BaseModelTests): + def test_large_separation(self, model, config, molecules, device): + atol = {torch.float32: 1e-4, torch.float64: 1e-10}[torch.get_default_dtype()] + instance, _ = model + instance.to(device) + config, out_fields = config + r_max = config["r_max"] + atoms1 = molecules[0].copy() + atoms2 = molecules[1].copy() + # translate atoms2 far away + atoms2.positions += 40.0 + np.random.randn(3) + atoms_both = atoms1.copy() + atoms_both.extend(atoms2) + tm = TypeMapper(chemical_symbols=["H", "C", "O"]) + data1 = tm(AtomicData.from_ase(atoms1, r_max=r_max).to(device=device)) + data2 = tm(AtomicData.from_ase(atoms2, r_max=r_max).to(device=device)) + data_both = tm(AtomicData.from_ase(atoms_both, r_max=r_max).to(device=device)) + assert ( + data_both[AtomicDataDict.EDGE_INDEX_KEY].shape[1] + == data1[AtomicDataDict.EDGE_INDEX_KEY].shape[1] + + data2[AtomicDataDict.EDGE_INDEX_KEY].shape[1] + ) + + out1 = instance(AtomicData.to_AtomicDataDict(data1)) + out2 = instance(AtomicData.to_AtomicDataDict(data2)) + out_both = instance(AtomicData.to_AtomicDataDict(data_both)) + + assert torch.allclose( + out1[AtomicDataDict.TOTAL_ENERGY_KEY] + + out2[AtomicDataDict.TOTAL_ENERGY_KEY], + out_both[AtomicDataDict.TOTAL_ENERGY_KEY], + atol=atol, + ) + + atoms_both2 = atoms1.copy() + atoms3 = atoms2.copy() + atoms3.positions += np.random.randn(3) + atoms_both2.extend(atoms3) + data_both2 = tm(AtomicData.from_ase(atoms_both2, r_max=r_max).to(device=device)) + out_both2 = instance(AtomicData.to_AtomicDataDict(data_both2)) + assert torch.allclose( + out_both2[AtomicDataDict.TOTAL_ENERGY_KEY], + out_both[AtomicDataDict.TOTAL_ENERGY_KEY], + atol=atol, + ) 
+ assert torch.allclose( + out_both2[AtomicDataDict.PER_ATOM_ENERGY_KEY], + out_both[AtomicDataDict.PER_ATOM_ENERGY_KEY], + atol=atol, + ) + + def test_cross_frame_grad(self, model, device, nequip_dataset): + c = Collater.for_dataset(nequip_dataset) + batch = c([nequip_dataset[i] for i in range(len(nequip_dataset))]) + energy_model, out_fields = model + energy_model.to(device) + data = AtomicData.to_AtomicDataDict(batch.to(device)) + data[AtomicDataDict.POSITIONS_KEY].requires_grad = True + + output = energy_model(data) + grads = torch.autograd.grad( + outputs=output[AtomicDataDict.TOTAL_ENERGY_KEY][-1], + inputs=data[AtomicDataDict.POSITIONS_KEY], + allow_unused=True, + )[0] + + last_frame_n_atom = batch.ptr[-1] - batch.ptr[-2] + + in_frame_grad = grads[-last_frame_n_atom:] + cross_frame_grad = grads[:-last_frame_n_atom] + + assert cross_frame_grad.abs().max().item() == 0 + assert in_frame_grad.abs().max().item() > 0 + + def test_numeric_gradient(self, model, atomic_batch, device): + model, out_fields = model + if AtomicDataDict.FORCE_KEY not in out_fields: + pytest.skip() model.to(device) data = atomic_batch.to(device) output = model(AtomicData.to_AtomicDataDict(data)) @@ -256,16 +304,15 @@ def test_numeric_gradient(self, config, atomic_batch, device, float_tolerance): numeric, analytical, rtol=5e-2 ) - def test_partial_forces(self, atomic_batch, device): - config = minimal_config1.copy() - config["model_builders"] = [ - "EnergyModel", - "ForceOutput", - ] + def test_partial_forces(self, config, atomic_batch, device): + config, out_fields = config + if "ForceOutput" not in config["model_builders"]: + pytest.skip() + config = config.copy() partial_config = config.copy() partial_config["model_builders"] = [ - "EnergyModel", - "PartialForceOutput", + "PartialForceOutput" if b == "ForceOutput" else b + for b in partial_config["model_builders"] ] model = model_from_config(config=config, initialize=True) partial_model = model_from_config(config=partial_config, 
initialize=True) @@ -284,7 +331,7 @@ def test_partial_forces(self, atomic_batch, device): assert torch.allclose( output[k], output_partial[k], - atol=1e-6 if k == AtomicDataDict.FORCE_KEY else 1e-8, + atol=1e-8 if k == AtomicDataDict.TOTAL_ENERGY_KEY else 1e-6, ) else: assert torch.equal(output[k], output_partial[k]) @@ -307,138 +354,3 @@ def test_partial_forces(self, atomic_batch, device): AtomicDataDict.BATCH_KEY ].view(1, -1) assert torch.equal(adjacency, torch.any(partial_forces != 0, dim=-1)) - - -class TestAutoGradient: - def test_cross_frame_grad(self, config, nequip_dataset): - c = Collater.for_dataset(nequip_dataset) - batch = c([nequip_dataset[i] for i in range(len(nequip_dataset))]) - device = "cpu" - config = config.copy() - config["model_builders"] = ["EnergyModel"] - energy_model = model_from_config(config=config, initialize=True) - energy_model.to(device) - data = AtomicData.to_AtomicDataDict(batch.to(device)) - data[AtomicDataDict.POSITIONS_KEY].requires_grad = True - - output = energy_model(data) - grads = torch.autograd.grad( - outputs=output[AtomicDataDict.TOTAL_ENERGY_KEY][-1], - inputs=data[AtomicDataDict.POSITIONS_KEY], - allow_unused=True, - )[0] - - last_frame_n_atom = batch.ptr[-1] - batch.ptr[-2] - - in_frame_grad = grads[-last_frame_n_atom:] - cross_frame_grad = grads[:-last_frame_n_atom] - - assert cross_frame_grad.abs().max().item() == 0 - assert in_frame_grad.abs().max().item() > 0 - - -class TestEquivariance: - def test_forward(self, model, atomic_batch, device): - instance, out_field = model - instance = instance.to(device=device) - atomic_batch = atomic_batch.to(device=device) - assert_AtomicData_equivariant(func=instance, data_in=atomic_batch) - - -class TestCutoff: - def test_large_separation(self, model, config, molecules): - atol = {torch.float32: 1e-4, torch.float64: 1e-10}[torch.get_default_dtype()] - instance, _ = model - r_max = config["r_max"] - atoms1 = molecules[0].copy() - atoms2 = molecules[1].copy() - # translate 
atoms2 far away - atoms2.positions += 40.0 + np.random.randn(3) - atoms_both = atoms1.copy() - atoms_both.extend(atoms2) - tm = TypeMapper(chemical_symbols=["H", "C", "O"]) - data1 = tm(AtomicData.from_ase(atoms1, r_max=r_max)) - data2 = tm(AtomicData.from_ase(atoms2, r_max=r_max)) - data_both = tm(AtomicData.from_ase(atoms_both, r_max=r_max)) - assert ( - data_both[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - == data1[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - + data2[AtomicDataDict.EDGE_INDEX_KEY].shape[1] - ) - - out1 = instance(AtomicData.to_AtomicDataDict(data1)) - out2 = instance(AtomicData.to_AtomicDataDict(data2)) - out_both = instance(AtomicData.to_AtomicDataDict(data_both)) - - assert torch.allclose( - out1[AtomicDataDict.TOTAL_ENERGY_KEY] - + out2[AtomicDataDict.TOTAL_ENERGY_KEY], - out_both[AtomicDataDict.TOTAL_ENERGY_KEY], - atol=atol, - ) - - atoms_both2 = atoms1.copy() - atoms3 = atoms2.copy() - atoms3.positions += np.random.randn(3) - atoms_both2.extend(atoms3) - data_both2 = tm(AtomicData.from_ase(atoms_both2, r_max=r_max)) - out_both2 = instance(AtomicData.to_AtomicDataDict(data_both2)) - assert torch.allclose( - out_both2[AtomicDataDict.TOTAL_ENERGY_KEY], - out_both[AtomicDataDict.TOTAL_ENERGY_KEY], - atol=atol, - ) - assert torch.allclose( - out_both2[AtomicDataDict.PER_ATOM_ENERGY_KEY], - out_both[AtomicDataDict.PER_ATOM_ENERGY_KEY], - atol=atol, - ) - - def test_embedding_cutoff(self, config): - config = config.copy() - config["model_builders"] = ["EnergyModel"] - instance = model_from_config(config=config, initialize=True) - r_max = config["r_max"] - - # make a synthetic three atom example - data = AtomicData( - atom_types=np.random.choice([0, 1, 2], size=3), - pos=np.array([[0.0, 0.0, 0.0], [1.0, 0.0, 0.0], [0.0, 1.0, 0.0]]), - edge_index=np.array([[0, 1, 0, 2], [1, 0, 2, 0]]), - ) - edge_embed = instance(AtomicData.to_AtomicDataDict(data))[ - AtomicDataDict.EDGE_EMBEDDING_KEY - ] - data.pos[2, 1] = r_max # put it past the cutoff - edge_embed2 = 
instance(AtomicData.to_AtomicDataDict(data))[ - AtomicDataDict.EDGE_EMBEDDING_KEY - ] - - assert torch.allclose(edge_embed[:2], edge_embed2[:2]) - assert edge_embed[2:].abs().sum() > 1e-6 # some nonzero terms - assert torch.allclose(edge_embed2[2:], torch.zeros(1)) - - # test gradients - in_dict = AtomicData.to_AtomicDataDict(data) - in_dict[AtomicDataDict.POSITIONS_KEY].requires_grad_(True) - - with torch.autograd.set_detect_anomaly(True): - out = instance(in_dict) - - # is the edge embedding of the cutoff length edge unchanged at the cutoff? - grads = torch.autograd.grad( - outputs=out[AtomicDataDict.EDGE_EMBEDDING_KEY][2:].sum(), - inputs=in_dict[AtomicDataDict.POSITIONS_KEY], - retain_graph=True, - )[0] - assert torch.allclose(grads, torch.zeros(1)) - - # are the first two atom's energies unaffected by atom at the cutoff? - grads = torch.autograd.grad( - outputs=out[AtomicDataDict.PER_ATOM_ENERGY_KEY][:2].sum(), - inputs=in_dict[AtomicDataDict.POSITIONS_KEY], - )[0] - print(grads) - # only care about gradient wrt moved atom - assert grads.shape == (3, 3) - assert torch.allclose(grads[2], torch.zeros(1)) diff --git a/tests/conftest.py b/tests/conftest.py index 060e5e7b..e9719bcd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,137 +1,5 @@ -from typing import List, Tuple -import numpy as np -import pathlib -import pytest -import tempfile -import os +from nequip.utils.unittests import CONFTEST_PATH -from ase.atoms import Atoms -from ase.build import molecule -from ase.calculators.singlepoint import SinglePointCalculator -from ase.io import write - -import torch - -from nequip.utils.test import set_irreps_debug -from nequip.data import AtomicData, ASEDataset -from nequip.data.transforms import TypeMapper -from nequip.utils.torch_geometric import Batch -from nequip.utils._global_options import _set_global_options -from nequip.utils.misc import dtype_from_name - -if "NEQUIP_NUM_TASKS" not in os.environ: - # Test parallelization, but don't waste time 
spawning tons of workers if lots of cores available - os.environ["NEQUIP_NUM_TASKS"] = "2" - -# The default float tolerance -FLOAT_TOLERANCE = { - t: torch.as_tensor(v, dtype=dtype_from_name(t)) - for t, v in {"float32": 1e-3, "float64": 1e-10}.items() -} - - -@pytest.fixture(scope="session", autouse=True, params=["float32", "float64"]) -def float_tolerance(request): - """Run all tests with various PyTorch default dtypes. - - This is a session-wide, autouse fixture — you only need to request it explicitly if a test needs to know the tolerance for the current default dtype. - - Returns - -------- - A precision threshold to use for closeness tests. - """ - old_dtype = torch.get_default_dtype() - dtype = request.param - _set_global_options({"default_dtype": dtype}) - yield FLOAT_TOLERANCE[dtype] - _set_global_options( - { - "default_dtype": {torch.float32: "float32", torch.float64: "float64"}[ - old_dtype - ] - } - ) - - -# - Ampere and TF32 - -# Many of the tests for NequIP involve numerically checking -# algebraic properties— normalization, equivariance, -# continuity, etc. -# With the added numerical noise of TF32, some of those tests fail -# with the current (and usually generous) thresholds. -# -# Thus we go on the assumption that PyTorch + NVIDIA got everything -# right, that this setting DOES NOT AFFECT the model outputs except -# for increased numerical noise, and only test without it. 
-# -# TODO: consider running tests with and without -# TODO: check how much thresholds have to be changed to accomidate TF32 -torch.backends.cuda.matmul.allow_tf32 = False -torch.backends.cudnn.allow_tf32 = False - - -@pytest.fixture(scope="session") -def BENCHMARK_ROOT(): - return pathlib.Path(__file__).parent / "../benchmark_data/" - - -@pytest.fixture(scope="session") -def temp_data(float_tolerance): - with tempfile.TemporaryDirectory() as tmpdirname: - yield tmpdirname - - -@pytest.fixture(scope="session") -def CH3CHO(CH3CHO_no_typemap) -> Tuple[Atoms, AtomicData]: - atoms, data = CH3CHO_no_typemap - tm = TypeMapper(chemical_symbol_to_type={"C": 0, "O": 1, "H": 2}) - data = tm(data) - return atoms, data - - -@pytest.fixture(scope="session") -def CH3CHO_no_typemap(float_tolerance) -> Tuple[Atoms, AtomicData]: - atoms = molecule("CH3CHO") - data = AtomicData.from_ase(atoms, r_max=2.0) - return atoms, data - - -@pytest.fixture(scope="session") -def molecules() -> List[Atoms]: - atoms_list = [] - for i in range(8): - atoms = molecule("CH3CHO" if i % 2 == 0 else "H2") - atoms.rattle() - atoms.calc = SinglePointCalculator( - energy=np.random.random(), - forces=np.random.random((len(atoms), 3)), - stress=None, - magmoms=None, - atoms=atoms, - ) - atoms_list.append(atoms) - return atoms_list - - -@pytest.fixture(scope="session") -def nequip_dataset(molecules, temp_data, float_tolerance): - with tempfile.NamedTemporaryFile(suffix=".xyz") as fp: - for atoms in molecules: - write(fp.name, atoms, format="extxyz", append=True) - a = ASEDataset( - file_name=fp.name, - root=temp_data, - extra_fixed_fields={"r_max": 3.0}, - ase_args=dict(format="extxyz"), - type_mapper=TypeMapper(chemical_symbol_to_type={"H": 0, "C": 1, "O": 2}), - ) - yield a - - -@pytest.fixture(scope="session") -def atomic_batch(nequip_dataset): - return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) - - -# Use debug mode -set_irreps_debug(True) +# like `source` in bash +with 
open(CONFTEST_PATH) as f: + exec(f.read()) diff --git a/tests/unit/model/test_nequip_model.py b/tests/unit/model/test_nequip_model.py new file mode 100644 index 00000000..20cf7064 --- /dev/null +++ b/tests/unit/model/test_nequip_model.py @@ -0,0 +1,118 @@ +import pytest + +from e3nn import o3 + +from nequip.data import AtomicDataDict +from nequip.model import model_from_config +from nequip.nn import AtomwiseLinear +from nequip.utils.unittests.model_tests import BaseEnergyModelTests + +COMMON_CONFIG = { + "avg_num_neighbors": None, + "num_types": 3, + "types_names": ["H", "C", "O"], +} +r_max = 3 +minimal_config1 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + feature_irreps_hidden="4x0e + 4x1o", + num_layers=2, + num_basis=8, + PolynomialCutoff_p=6, + nonlinearity_type="norm", + **COMMON_CONFIG +) +minimal_config2 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + chemical_embedding_irreps_out="8x0e + 8x0o + 8x1e + 8x1o", + irreps_mid_output_block="2x0e", + feature_irreps_hidden="4x0e + 4x1o", + **COMMON_CONFIG +) +minimal_config3 = dict( + irreps_edge_sh="0e + 1o", + r_max=4, + feature_irreps_hidden="4x0e + 4x1o", + num_layers=2, + num_basis=8, + PolynomialCutoff_p=6, + nonlinearity_type="gate", + **COMMON_CONFIG +) +minimal_config4 = dict( + irreps_edge_sh="0e + 1o + 2e", + r_max=4, + feature_irreps_hidden="2x0e + 2x1o + 2x2e", + num_layers=2, + num_basis=3, + PolynomialCutoff_p=6, + nonlinearity_type="gate", + # test custom nonlinearities + nonlinearity_scalars={"e": "silu", "o": "tanh"}, + nonlinearity_gates={"e": "silu", "o": "abs"}, + **COMMON_CONFIG +) + + +class TestNequIPModel(BaseEnergyModelTests): + @pytest.fixture( + params=[minimal_config1, minimal_config2, minimal_config3, minimal_config4], + scope="class", + ) + def base_config(self, request): + return request.param + + @pytest.fixture( + params=[ + ( + ["EnergyModel", "ForceOutput"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + ], + ), + ( 
+ ["EnergyModel"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + ], + ), + ( + ["EnergyModel", "StressForceOutput"], + [ + AtomicDataDict.TOTAL_ENERGY_KEY, + AtomicDataDict.PER_ATOM_ENERGY_KEY, + AtomicDataDict.FORCE_KEY, + AtomicDataDict.STRESS_KEY, + AtomicDataDict.VIRIAL_KEY, + ], + ), + ], + scope="class", + ) + def config(self, request, base_config): + config = base_config.copy() + builder, out_fields = request.param + config = config.copy() + config["model_builders"] = builder + return config, out_fields + + def test_submods(self): + config = minimal_config2.copy() + config["model_builders"] = ["EnergyModel"] + model = model_from_config(config=config, initialize=True) + assert isinstance(model.chemical_embedding, AtomwiseLinear) + true_irreps = o3.Irreps(minimal_config2["chemical_embedding_irreps_out"]) + assert ( + model.chemical_embedding.irreps_out[model.chemical_embedding.out_field] + == true_irreps + ) + # Make sure it propagates + assert ( + model.layer0_convnet.irreps_in[model.chemical_embedding.out_field] + == true_irreps + ) From ad58349db4d6787a9802f2c52c9f2a34bd0135c6 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 00:20:11 -0500 Subject: [PATCH 17/50] narrower test collection --- .github/workflows/tests.yml | 2 +- .github/workflows/tests_develop.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index b93d1ccd..a6e9a8f0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -44,4 +44,4 @@ jobs: - name: Test with pytest run: | # See https://github.com/pytest-dev/pytest/issues/1075 - PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ . 
+ PYTHONHASHSEED=0 pytest -n auto tests/ diff --git a/.github/workflows/tests_develop.yml b/.github/workflows/tests_develop.yml index bae5795e..e301f5e3 100644 --- a/.github/workflows/tests_develop.yml +++ b/.github/workflows/tests_develop.yml @@ -44,4 +44,4 @@ jobs: - name: Test with pytest run: | # See https://github.com/pytest-dev/pytest/issues/1075 - PYTHONHASHSEED=0 pytest -n auto --ignore=docs/ . + PYTHONHASHSEED=0 pytest -n auto tests/ From e8868eb378945b6ea8d0dba9ec3df3de28306571 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 00:42:17 -0500 Subject: [PATCH 18/50] test edge fields also for batching --- nequip/utils/unittests/model_tests.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index 372571d4..00f3c5bb 100644 --- a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -14,6 +14,7 @@ Collater, _GRAPH_FIELDS, _NODE_FIELDS, + _EDGE_FIELDS, ) from nequip.data.transforms import TypeMapper from nequip.model import model_from_config @@ -139,6 +140,25 @@ def test_batch(self, model, atomic_batch, device, float_tolerance): output2[out_field], output[out_field][output[AtomicDataDict.BATCH_KEY] == 1], ) + elif out_field in _EDGE_FIELDS: + assert allclose( + output1[out_field], + output[out_field][ + output[AtomicDataDict.BATCH_KEY][ + output[AtomicDataDict.EDGE_INDEX_KEY][0] + ] + == 0 + ], + ) + assert allclose( + output2[out_field], + output[out_field][ + output[AtomicDataDict.BATCH_KEY][ + output[AtomicDataDict.EDGE_INDEX_KEY][0] + ] + == 1 + ], + ) else: raise NotImplementedError From 0a92126ef8bb5cfac5ec1f3b3dcf7b8b5e477f06 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 00:55:21 -0500 Subject: [PATCH 19/50] fix tests for edge features --- nequip/utils/unittests/model_tests.py | 5 
++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index 00f3c5bb..2fe951d9 100644 --- a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -189,7 +189,10 @@ def test_embedding_cutoff(self, model, config, device): data.pos[2, 1] = r_max # put it past the cutoff edge_embed2 = instance(AtomicData.to_AtomicDataDict(data))[key] - assert torch.allclose(edge_embed[:2], edge_embed2[:2]) + if key == AtomicDataDict.EDGE_EMBEDDING_KEY: + # we can only check that other edges are unaffected if we know it's an embedding + # For example, an Allegro edge feature is many body so will be affected + assert torch.allclose(edge_embed[:2], edge_embed2[:2]) assert edge_embed[2:].abs().sum() > 1e-6 # some nonzero terms assert torch.allclose(edge_embed2[2:], torch.zeros(1, device=device)) From d26f0139caf52bb9ab5637acfbdf0a4cf2dc30c2 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 01:04:46 -0500 Subject: [PATCH 20/50] fix partial forces test --- nequip/utils/unittests/model_tests.py | 12 ++++++------ tests/unit/model/test_nequip_model.py | 4 ++++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index 2fe951d9..e0d807bd 100644 --- a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -327,7 +327,7 @@ def test_numeric_gradient(self, model, atomic_batch, device): numeric, analytical, rtol=5e-2 ) - def test_partial_forces(self, config, atomic_batch, device): + def test_partial_forces(self, config, atomic_batch, device, strict_locality): config, out_fields = config if "ForceOutput" not in config["model_builders"]: pytest.skip() @@ -363,14 +363,14 @@ def test_partial_forces(self, config, atomic_batch, device): assert partial_forces.shape == (n_at, n_at, 3) # confirm that 
sparsity matches graph topology: edge_index = data[AtomicDataDict.EDGE_INDEX_KEY] - adjacency = torch.zeros(n_at, n_at, dtype=torch.bool) - strict_locality = False + adjacency = torch.zeros( + n_at, n_at, dtype=torch.bool, device=partial_forces.device + ) if strict_locality: # only adjacent for nonzero deriv to neighbors adjacency[edge_index[0], edge_index[1]] = True - adjacency[ - torch.arange(n_at), torch.arange(n_at) - ] = True # diagonal is ofc True + arange = torch.arange(n_at, device=partial_forces.device) + adjacency[arange, arange] = True # diagonal is ofc True else: # technically only adjacent to n-th degree neighbor, but in this tiny test system that is same as all-to-all and easier to program adjacency = data[AtomicDataDict.BATCH_KEY].view(-1, 1) == data[ diff --git a/tests/unit/model/test_nequip_model.py b/tests/unit/model/test_nequip_model.py index 20cf7064..2aa82e15 100644 --- a/tests/unit/model/test_nequip_model.py +++ b/tests/unit/model/test_nequip_model.py @@ -57,6 +57,10 @@ class TestNequIPModel(BaseEnergyModelTests): + @pytest.fixture + def strict_locality(self): + return False + @pytest.fixture( params=[minimal_config1, minimal_config2, minimal_config3, minimal_config4], scope="class", From 14c129e947bd677240f223cc3f1a366a8810d3b5 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 17:29:58 -0500 Subject: [PATCH 21/50] add --no-compile --- CHANGELOG.md | 1 + nequip/scripts/benchmark.py | 43 ++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 85e07593..4e79bf8c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Most recent change on the bottom. 
## [Unreleased] - 0.5.6 ### Added - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights +- `nequip-benchmark --no-compile` ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index e1ac63a9..43480bac 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -55,6 +55,11 @@ def main(args=None): type=float, default=1, ) + parser.add_argument( + "--no-compile", + help="Don't compile the model to TorchScript", + action="store_true", + ) # TODO: option to show memory use @@ -130,24 +135,28 @@ def main(args=None): print( f" model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" ) - print("Compile...") - # "Deploy" it + model.eval() - compile_time = time.time() - model = script(model) - model = _compile_for_deploy(model) - compile_time = time.time() - compile_time - print(f" compilation took {compile_time:.4f}s") - - # save and reload to avoid bugs - with tempfile.NamedTemporaryFile() as f: - torch.jit.save(model, f.name) - model = torch.jit.load(f.name, map_location=device) - # freeze like in the LAMMPS plugin - model = torch.jit.freeze(model) - # and reload again just to avoid bugs - torch.jit.save(model, f.name) - model = torch.jit.load(f.name, map_location=device) + if args.no_compile: + model = model.to(device) + else: + print("Compile...") + # "Deploy" it + compile_time = time.time() + model = script(model) + model = _compile_for_deploy(model) + compile_time = time.time() - compile_time + print(f" compilation took {compile_time:.4f}s") + + # save and reload to avoid bugs + with tempfile.NamedTemporaryFile() as f: + torch.jit.save(model, f.name) + model = torch.jit.load(f.name, map_location=device) + # freeze like in the LAMMPS plugin + model = torch.jit.freeze(model) + # and reload again just to avoid bugs + torch.jit.save(model, f.name) + model = torch.jit.load(f.name, 
map_location=device) # Make sure we're warm past compilation warmup = config["_jit_bailout_depth"] + 4 # just to be safe... From 03142e1618ad5e09372df26a01890443039b5266 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 19:02:39 -0500 Subject: [PATCH 22/50] verbose benchmark --- CHANGELOG.md | 2 +- nequip/scripts/benchmark.py | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4e79bf8c..afe5f2bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Most recent change on the bottom. ## [Unreleased] - 0.5.6 ### Added - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights -- `nequip-benchmark --no-compile` +- `nequip-benchmark --no-compile` and `--verbose` ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 43480bac..d856c4fa 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -3,6 +3,8 @@ import tempfile import itertools import time +import logging +import sys import torch from torch.utils.benchmark import Timer, Measurement @@ -60,12 +62,19 @@ def main(args=None): help="Don't compile the model to TorchScript", action="store_true", ) + parser.add_argument( + "--verbose", help="Logging verbosity level", type=str, default="error" + ) # TODO: option to show memory use # Parse the args args = parser.parse_args(args=args) + root_logger = logging.getLogger() + root_logger.setLevel(getattr(logging, args.verbose.upper())) + root_logger.handlers = [logging.StreamHandler(sys.stderr)] + if args.device is None: device = torch.device("cuda" if torch.cuda.is_available() else "cpu") else: From 036b488e978122b1d4ecfd99416810b58ceb8381 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 15 Nov 2022 19:11:02 -0500 Subject: [PATCH 23/50] memory 
summary --- CHANGELOG.md | 2 +- nequip/scripts/benchmark.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index afe5f2bc..be6b1fa6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Most recent change on the bottom. ## [Unreleased] - 0.5.6 ### Added - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights -- `nequip-benchmark --no-compile` and `--verbose` +- `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index d856c4fa..ac3a5b94 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -62,12 +62,15 @@ def main(args=None): help="Don't compile the model to TorchScript", action="store_true", ) + parser.add_argument( + "--memory-summary", + help="Print torch.cuda.memory_summary() after running the model", + action="store_true", + ) parser.add_argument( "--verbose", help="Logging verbosity level", type=str, default="error" ) - # TODO: option to show memory use - # Parse the args args = parser.parse_args(args=args) @@ -144,6 +147,9 @@ def main(args=None): print( f" model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" ) + print( + f" model weights and buffers take {sum(p.numel() * p.element_size() for p in itertools.chain(model.parameters(), model.buffers())) / (1024 * 1024):.2f} MB" + ) model.eval() if args.no_compile: @@ -205,6 +211,10 @@ def trace_handler(p): ) perloop: Measurement = t.timeit(args.n) + if args.memory_summary and torch.cuda.is_available(): + print("Memory usage summary:") + print(torch.cuda.memory_summary()) + print(" -- Results --") print( f"PLEASE NOTE: these are speeds for the MODEL, evaluated on --n-data={args.n_data} configurations kept in memory." 
From 17c62bbc6104e8f17bd345b5001e54110edeebb0 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 16 Nov 2022 23:24:15 -0500 Subject: [PATCH 24/50] allow `deploy` model builder argument --- nequip/model/_build.py | 11 +++++++++-- nequip/scripts/benchmark.py | 2 +- nequip/scripts/deploy.py | 2 +- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/nequip/model/_build.py b/nequip/model/_build.py index 0fe4e21d..7e1a63fd 100644 --- a/nequip/model/_build.py +++ b/nequip/model/_build.py @@ -8,7 +8,10 @@ def model_from_config( - config, initialize: bool = False, dataset: Optional[AtomicDataset] = None + config, + initialize: bool = False, + dataset: Optional[AtomicDataset] = None, + deploy: bool = False, ) -> GraphModuleMixin: """Build a model based on `config`. @@ -17,11 +20,13 @@ def model_from_config( - ``model``: the model produced by the previous builder. Cannot be requested by the first builder, must be requested by subsequent ones. - ``initialize``: whether to initialize the model - ``dataset``: if ``initialize`` is True, the dataset + - ``deploy``: whether the model object is for deployment / inference Args: config - initialize (bool): if True (default False), ``model_initializers`` will also be run. + initialize (bool): whether ``model_builders`` should be instructed to initialize the model dataset: dataset for initializers if ``initialize`` is True. + deploy (bool): whether ``model_builders`` should be told the model is for deployment / inference Returns: The build model. 
@@ -61,6 +66,8 @@ def model_from_config( params = {} if "initialize" in pnames: params["initialize"] = initialize + if "deploy" in pnames: + params["deploy"] = deploy if "config" in pnames: params["config"] = config if "dataset" in pnames: diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index ac3a5b94..82c61d37 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -140,7 +140,7 @@ def main(args=None): # Load model: print("Building model... ") model_time = time.time() - model = model_from_config(config, initialize=True, dataset=dataset) + model = model_from_config(config, initialize=True, dataset=dataset, deploy=True) model_time = time.time() - model_time print(f" building model took {model_time:.4f}s") print(f" model has {sum(p.numel() for p in model.parameters())} weights") diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 8185ab75..7aafd5fc 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -198,7 +198,7 @@ def main(args=None): args.train_dir, model_name="best_model.pth", device="cpu" ) elif args.model is not None: - model = model_from_config(config) + model = model_from_config(config, deploy=True) else: raise AssertionError From b2aa837a7ba53d32d64a6ff2ff5d69014881b423 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 27 Nov 2022 23:08:55 -0500 Subject: [PATCH 25/50] fix model deploy status in test --- nequip/utils/unittests/model_tests.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index e0d807bd..2b6a8b63 100644 --- a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -44,11 +44,10 @@ def config(self): def device(self, request): return request.param - @pytest.fixture(scope="class") - def model(self, config, device): - config, out_fields = config - 
torch.manual_seed(0) - np.random.seed(0) + @staticmethod + def make_model(config, device, initialize: bool = True, deploy: bool = False): + torch.manual_seed(127) + np.random.seed(193) config = config.copy() config.update( { @@ -56,8 +55,14 @@ def model(self, config, device): "types_names": ["H", "C", "O"], } ) - model = model_from_config(config) + model = model_from_config(config, initialize=initialize, deploy=deploy) model = model.to(device) + return model + + @pytest.fixture(scope="class") + def model(self, config, device): + config, out_fields = config + model = self.make_model(config, device=device) return model, out_fields # == common tests for all models == From 225473933536522fff25f4a5e3b961b6e24df645 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 29 Nov 2022 17:32:11 -0500 Subject: [PATCH 26/50] nequip-benchmark --pdb --- CHANGELOG.md | 1 + nequip/scripts/benchmark.py | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be6b1fa6..50338467 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ Most recent change on the bottom. 
### Added - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights - `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` +- `nequip-benchmark --pdb` for debugging model (builder) errors ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 82c61d37..5814a838 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -5,6 +5,7 @@ import time import logging import sys +import pdb import torch from torch.utils.benchmark import Timer, Measurement @@ -70,9 +71,16 @@ def main(args=None): parser.add_argument( "--verbose", help="Logging verbosity level", type=str, default="error" ) + parser.add_argument( + "--pdb", + help="Run model builders and model under debugger to easily drop to debugger to investigate errors.", + action="store_true", + ) # Parse the args args = parser.parse_args(args=args) + if args.pdb: + assert args.profile is None root_logger = logging.getLogger() root_logger.setLevel(getattr(logging, args.verbose.upper())) @@ -140,7 +148,13 @@ def main(args=None): # Load model: print("Building model... 
") model_time = time.time() - model = model_from_config(config, initialize=True, dataset=dataset, deploy=True) + try: + model = model_from_config(config, initialize=True, dataset=dataset, deploy=True) + except: # noqa: E722 + if args.pdb: + pdb.post_mortem() + else: + raise model_time = time.time() - model_time print(f" building model took {model_time:.4f}s") print(f" model has {sum(p.numel() for p in model.parameters())} weights") @@ -196,6 +210,14 @@ def trace_handler(p): for _ in range(1 + warmup + args.n): model(next(datas).copy()) p.step() + elif args.pdb: + print("Running model under debugger...") + try: + for _ in range(args.n): + model(next(datas).copy()) + except: # noqa: E722) + pdb.post_mortem() + print("Done.") else: print("Warmup...") warmup_time = time.time() From b6331c013c11b0e3fa697808e2cdd8e457f5b1b4 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 2 Dec 2022 14:49:38 -0500 Subject: [PATCH 27/50] fix device bug --- nequip/data/transforms.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nequip/data/transforms.py b/nequip/data/transforms.py index 5a37e2be..4f6331b7 100644 --- a/nequip/data/transforms.py +++ b/nequip/data/transforms.py @@ -121,9 +121,9 @@ def transform(self, atomic_numbers): f"Data included atomic numbers {bad_set} that are not part of the atomic number -> type mapping!" 
) - return self._Z_to_index[atomic_numbers - self._min_Z].to( - device=atomic_numbers.device - ) + return self._Z_to_index.to(device=atomic_numbers.device)[ + atomic_numbers - self._min_Z + ] def untransform(self, atom_types): """Transform atom types back into atomic numbers""" From 1fb33a29d4d128a304a365351a762fe92c2dff2a Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 2 Dec 2022 16:19:06 -0500 Subject: [PATCH 28/50] remove broken test --- tests/unit/trainer/test_trainer.py | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/tests/unit/trainer/test_trainer.py b/tests/unit/trainer/test_trainer.py index c8169fda..860be357 100644 --- a/tests/unit/trainer/test_trainer.py +++ b/tests/unit/trainer/test_trainer.py @@ -106,26 +106,6 @@ def test_save(self, trainer, format, suffix): assert isfile(file_name), "fail to save to file" assert suffix in file_name - @pytest.mark.parametrize("append", [True]) # , False]) - def test_from_dict(self, trainer, append): - - # torch.save(trainer.model, trainer.best_model_path) - - dictionary = trainer.as_dict(state_dict=True, training_progress=True) - trainer1 = Trainer.from_dict(dictionary, append=append) - - for key in [ - "best_model_path", - "last_model_path", - "logfile", - "epoch_log", - "batch_log", - "workdir", - ]: - v1 = getattr(trainer, key, None) - v2 = getattr(trainer1, key, None) - assert append == (v1 == v2) - @pytest.mark.parametrize("append", [True]) # , False]) def test_from_file(self, trainer, append): From 288706abecdf1e860ba391a66e0531b50820ec86 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Tue, 6 Dec 2022 21:50:13 -0500 Subject: [PATCH 29/50] fix test --- nequip/utils/unittests/model_tests.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index 2b6a8b63..f5f7700b 100644 --- 
a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -74,7 +74,9 @@ def test_jit(self, model, atomic_batch, device): instance, out_fields = model data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) instance = instance.to(device=device) - model_script = script(instance) + model_script = script( + instance, in_place=False + ) # don't modify the model used by other tests later for out_field in out_fields: assert torch.allclose( From 0f3909161d387e4a35b26969694b1aaf3c298e2a Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 7 Dec 2022 09:52:58 -0500 Subject: [PATCH 30/50] Revert " fix test" This reverts commit 288706abecdf1e860ba391a66e0531b50820ec86. --- nequip/utils/unittests/model_tests.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nequip/utils/unittests/model_tests.py b/nequip/utils/unittests/model_tests.py index f5f7700b..2b6a8b63 100644 --- a/nequip/utils/unittests/model_tests.py +++ b/nequip/utils/unittests/model_tests.py @@ -74,9 +74,7 @@ def test_jit(self, model, atomic_batch, device): instance, out_fields = model data = AtomicData.to_AtomicDataDict(atomic_batch.to(device=device)) instance = instance.to(device=device) - model_script = script( - instance, in_place=False - ) # don't modify the model used by other tests later + model_script = script(instance) for out_field in out_fields: assert torch.allclose( From 64df5e52e269fadc1480ae78c4d1682b59bb8486 Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Wed, 7 Dec 2022 16:31:54 +0000 Subject: [PATCH 31/50] remove sklearn dependence (#248) * change solver * remove dependency on dataset * add ridge tests * swap to ridge * add down sampling * change to coef * change to torch.solve * black * fix correlated columns * fix sqrt error * black * black * black * add normalization * black * flake8 * change explaination * add debug log * Update tests/conftest.py Co-authored-by: Alby M. 
<1473644+Linux-cpp-lisp@users.noreply.github.com> * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> * Fix typo in README (#270) * update change log * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> Co-authored-by: Lixin Sun Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> Co-authored-by: Simon Batzner --- CHANGELOG.md | 1 + README.md | 2 +- configs/full.yaml | 4 +- nequip/utils/regressor.py | 235 ++++++++--------------------- nequip/utils/unittests/conftest.py | 17 +++ setup.py | 1 - tests/unit/data/test_dataset.py | 32 ++-- tests/unit/utils/test_gp.py | 37 ----- tests/unit/utils/test_solver.py | 38 +++++ 9 files changed, 132 insertions(+), 235 deletions(-) delete mode 100644 tests/unit/utils/test_gp.py create mode 100644 tests/unit/utils/test_solver.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 50338467..67bd0c81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Most recent change on the bottom. ## [Unreleased] - 0.5.6 ### Added +- sklearn dependency removed - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights - `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` - `nequip-benchmark --pdb` for debugging model (builder) errors diff --git a/README.md b/README.md index f70840b8..da741c09 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,6 @@ under the guidance of [Boris Kozinsky at Harvard](https://bkoz.seas.harvard.edu/ If you have questions, please don't hesitate to reach out at batzner[at]g[dot]harvard[dot]edu. If you find a bug or have a proposal for a feature, please post it in the [Issues](https://github.com/mir-group/nequip/issues). 
-If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Disucssions](https://github.com/mir-group/nequip/discussions). +If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Discussions](https://github.com/mir-group/nequip/discussions). If you want to contribute to the code, please read [`CONTRIBUTING.md`](CONTRIBUTING.md). diff --git a/configs/full.yaml b/configs/full.yaml index 3c3e8984..d310c70a 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -315,10 +315,10 @@ per_species_rescale_scales: dataset_forces_rms # If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained. # per_species_rescale_kwargs: # total_energy: -# alpha: 0.1 +# alpha: 0.001 # max_iteration: 20 # stride: 100 -# keywords for GP decomposition of per specie energy. Optional. Defaults to 0.1 +# keywords for ridge regression decomposition of per specie energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2 # per_species_rescale_arguments_in_dataset_units: True # if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values. 
diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 3d23cf84..30c8f9ab 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -1,181 +1,72 @@ import logging import torch -import numpy as np -from typing import Optional -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import DotProduct, Kernel, Hyperparameter +from torch import matmul +from torch.linalg import solve, inv +from typing import Optional, Sequence +from opt_einsum import contract -def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwargs): - if regressor == "GaussianProcess": - return gp(X, y, **kwargs) - elif regressor == "NormalizedGaussianProcess": - return normalized_gp(X, y, **kwargs) - else: - raise NotImplementedError(f"{regressor} is not implemented") +def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kwargs): + + dtype = torch.get_default_dtype() + X = X[::stride].to(dtype) + y = y[::stride].to(dtype) + + X, y = down_sampling_by_composition(X, y) + + X_norm = torch.sum(X) + + X = X / X_norm + y = y / X_norm -def normalized_gp(X, y, **kwargs): - feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) - feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) - mean, std = base_gp( - X, - y - (torch.sum(X, axis=1) * y_mean).reshape(y.shape), - NormalizedDotProduct, - {"diagonal_elements": feature_rms}, - **kwargs, - ) - return mean + y_mean, std - - -def gp(X, y, **kwargs): - return base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, **kwargs - ) - - -def base_gp( - X, - y, - kernel, - kernel_kwargs, - alpha: Optional[float] = 0.1, - max_iteration: int = 20, - stride: Optional[int] = None, + + feature_rms = torch.sqrt(torch.mean(X**2, axis=0)) + + alpha_mat = torch.diag(feature_rms) * alpha * alpha + + A = matmul(X.T, X) + alpha_mat + dy = y - (torch.sum(X, axis=1, keepdim=True) * y_mean).reshape(y.shape) + Xy = 
matmul(X.T, dy) + + mean = solve(A, Xy) + + sigma2 = torch.var(matmul(X, mean) - dy) + Ainv = inv(A) + cov = torch.sqrt(sigma2 * contract("ij,kj,kl,li->i", Ainv, X, X, Ainv)) + + mean = mean + y_mean.reshape([-1]) + + logging.debug(f"Ridge Regression, residue {sigma2}") + + return mean, cov + + +def down_sampling_by_composition( + X: torch.Tensor, y: torch.Tensor, percentage: Sequence = [0.25, 0.5, 0.75] ): - if len(y.shape) == 1: - y = y.reshape([-1, 1]) - - if stride is not None: - X = X[::stride] - y = y[::stride] - - not_fit = True - iteration = 0 - mean = None - std = None - while not_fit: - logging.debug(f"GP fitting iteration {iteration} {alpha}") - try: - _kernel = kernel(**kernel_kwargs) - gpr = GaussianProcessRegressor(kernel=_kernel, random_state=0, alpha=alpha) - gpr = gpr.fit(X, y) - - vec = torch.diag(torch.ones(X.shape[1])) - mean, std = gpr.predict(vec, return_std=True) - - mean = torch.as_tensor(mean, dtype=torch.get_default_dtype()).reshape([-1]) - # ignore all the off-diagonal terms - std = torch.as_tensor(std, dtype=torch.get_default_dtype()).reshape([-1]) - likelihood = gpr.log_marginal_likelihood() - - res = torch.sqrt( - torch.square(torch.matmul(X, mean.reshape([-1, 1])) - y).mean() - ) - - logging.debug( - f"GP fitting: alpha {alpha}:\n" - f" residue {res}\n" - f" mean {mean} std {std}\n" - f" log marginal likelihood {likelihood}" - ) - not_fit = False - - except Exception as e: - logging.info(f"GP fitting failed for alpha={alpha} and {e.args}") - if alpha == 0 or alpha is None: - logging.info("try a non-zero alpha") - not_fit = False - raise ValueError( - f"Please set the {alpha} to non-zero value. \n" - "The dataset energy is rank deficient to be solved with GP" - ) - else: - alpha = alpha * 2 - iteration += 1 - logging.debug(f" increase alpha to {alpha}") - - if iteration >= max_iteration or not_fit is False: - raise ValueError( - "Please set the per species shift and scale to zeros and ones. 
\n" - "The dataset energy is to diverge to be solved with GP" - ) - - return mean, std - - -class NormalizedDotProduct(Kernel): - r"""Dot-Product kernel. - .. math:: - k(x_i, x_j) = x_i \cdot A \cdot x_j - """ - - def __init__(self, diagonal_elements): - # TO DO: check shape - self.diagonal_elements = diagonal_elements - self.A = np.diag(diagonal_elements) - - def __call__(self, X, Y=None, eval_gradient=False): - """Return the kernel k(X, Y) and optionally its gradient. - Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) - Y : ndarray of shape (n_samples_Y, n_features), default=None - Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. - eval_gradient : bool, default=False - Determines whether the gradient with respect to the log of - the kernel hyperparameter is computed. - Only supported when Y is None. - Returns - ------- - K : ndarray of shape (n_samples_X, n_samples_Y) - Kernel k(X, Y) - K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ - optional - The gradient of the kernel k(X, X) with respect to the log of the - hyperparameter of the kernel. Only returned when `eval_gradient` - is True. - """ - X = np.atleast_2d(X) - if Y is None: - K = (X.dot(self.A)).dot(X.T) - else: - if eval_gradient: - raise ValueError("Gradient can only be evaluated when Y is None.") - K = (X.dot(self.A)).dot(Y.T) - - if eval_gradient: - return K, np.empty((X.shape[0], X.shape[0], 0)) - else: - return K - - def diag(self, X): - """Returns the diagonal of the kernel k(X, X). - The result of this method is identical to np.diag(self(X)); however, - it can be evaluated more efficiently since only the diagonal is - evaluated. - Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y). - Returns - ------- - K_diag : ndarray of shape (n_samples_X,) - Diagonal of kernel k(X, X). 
- """ - return np.einsum("ij,ij,jj->i", X, X, self.A) - - def __repr__(self): - return "" - - def is_stationary(self): - """Returns whether the kernel is stationary.""" - return False - - @property - def hyperparameter_diagonal_elements(self): - return Hyperparameter("diagonal_elements", "numeric", "fixed") + unique_comps, comp_ids = torch.unique(X, dim=0, return_inverse=True) + + n_types = torch.max(comp_ids) + 1 + + sort_by = torch.argsort(comp_ids) + + # find out the block for each composition + d_icomp = comp_ids[sort_by] + d_icomp = d_icomp[:-1] - d_icomp[1:] + node_icomp = torch.where(d_icomp != 0)[0] + id_start = torch.cat((torch.as_tensor([0]), node_icomp + 1)) + id_end = torch.cat((node_icomp + 1, torch.as_tensor([len(sort_by)]))) + + n_points = len(percentage) + new_X = torch.zeros((n_types * n_points, X.shape[1])) + new_y = torch.zeros((n_types * n_points)) + for i in range(n_types): + ids = sort_by[id_start[i] : id_end[i]] + for j, p in enumerate(percentage): + new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") + new_X[i * n_points + j] = unique_comps[i] + + return new_X, new_y diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py index 060e5e7b..77a91930 100644 --- a/nequip/utils/unittests/conftest.py +++ b/nequip/utils/unittests/conftest.py @@ -133,5 +133,22 @@ def atomic_batch(nequip_dataset): return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) +@pytest.fixture(scope="function") +def per_species_set(): + dtype = torch.get_default_dtype() + torch.manual_seed(0) + mean_min = 1 + mean_max = 100 + std = 20 + n_sample = 1000 + n_species = 9 + ref_mean = torch.rand((n_species)) * (mean_max - mean_min) + mean_min + t_mean = torch.ones((n_sample, 1)) * ref_mean.reshape([1, -1]) + ref_std = torch.rand((n_species)) * std + t_std = torch.ones((n_sample, 1)) * ref_std.reshape([1, -1]) + E = torch.normal(t_mean, t_std) + return ref_mean.to(dtype), ref_std.to(dtype), E.to(dtype), n_sample, 
n_species + + # Use debug mode set_irreps_debug(True) diff --git a/setup.py b/setup.py index 8c977e0a..cba6b51f 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ "typing_extensions;python_version<'3.8'", # backport of Final "torch-runstats>=0.2.0", "torch-ema>=0.3.0", - "scikit_learn<=1.0.1", # for GaussianProcess for per-species statistics; 1.0.2 has a bug! ], zip_safe=True, ) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index f45e0ca8..bad796c3 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -31,7 +31,7 @@ def ase_file(molecules): MAX_ATOMIC_NUMBER: int = 5 -NATOMS = 3 +NATOMS = 10 @pytest.fixture(scope="function") @@ -277,16 +277,11 @@ def test_per_node_field(self, npz_dataset, fixed_field, mode, subset): ) print(result) - @pytest.mark.parametrize("alpha", [1e-5, 1e-3, 0.1, 0.5]) + @pytest.mark.parametrize("alpha", [0, 1e-3, 0.01]) @pytest.mark.parametrize("fixed_field", [True, False]) @pytest.mark.parametrize("full_rank", [True, False]) @pytest.mark.parametrize("subset", [True, False]) - @pytest.mark.parametrize( - "regressor", ["NormalizedGaussianProcess", "GaussianProcess"] - ) - def test_per_graph_field( - self, npz_dataset, alpha, fixed_field, full_rank, regressor, subset - ): + def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subset): if alpha <= 1e-4 and not full_rank: return @@ -308,10 +303,7 @@ def test_per_graph_field( del n_spec del Ns - if alpha == 1e-5: - ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.0) - else: - ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 10) if subset: E_orig_order = torch.zeros_like( @@ -333,7 +325,6 @@ def test_per_graph_field( AtomicDataDict.TOTAL_ENERGY_KEY + "per_species_mean_std": { "alpha": alpha, - "regressor": regressor, "stride": 1, } }, @@ -341,21 +332,18 @@ def test_per_graph_field( res = torch.matmul(N, mean.reshape([-1, 1])) - E.reshape([-1, 1]) res2 = 
torch.sum(torch.square(res)) - print("residue", alpha, res2 - ref_res2) + print("alpha, residue, actual residue", alpha, res2, ref_res2) print("mean", mean, ref_mean) print("diff in mean", mean - ref_mean) print("std", std, ref_std) + tolerance = torch.max(ref_std) * 4 if full_rank: - if alpha == 1e-5: - assert torch.allclose(mean, ref_mean, rtol=1e-1) - else: - assert torch.allclose(mean, ref_mean, rtol=1) - assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) - elif regressor == "NormalizedGaussianProcess": - assert torch.std(mean).numpy() == 0 + assert torch.allclose(mean, ref_mean, atol=tolerance) + # assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) else: - assert mean[0] == mean[1] * 2 + assert torch.allclose(mean, mean[0], atol=tolerance) + # assert torch.std(mean).numpy() == 0 class TestReload: diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py deleted file mode 100644 index 4792b9d2..00000000 --- a/tests/unit/utils/test_gp.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -import pytest - -from nequip.utils.regressor import base_gp -from sklearn.gaussian_process.kernels import DotProduct - - -# @pytest.mark.parametrize("full_rank", [True, False]) -@pytest.mark.parametrize("full_rank", [False]) -@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) -def test_random(full_rank, alpha): - - if alpha == 0 and not full_rank: - return - - torch.manual_seed(0) - n_samples = 10 - n_dim = 3 - - if full_rank: - X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) - else: - X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( - (n_samples, n_dim) - ) - - ref_mean = torch.rand((n_dim, 1)) - y = torch.matmul(X, ref_mean) - - mean, std = base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 - ) - - if full_rank: - assert torch.allclose(ref_mean, mean, rtol=0.5) - else: - assert torch.allclose(mean, mean[0], rtol=1e-3) diff --git a/tests/unit/utils/test_solver.py 
b/tests/unit/utils/test_solver.py new file mode 100644 index 00000000..049c897d --- /dev/null +++ b/tests/unit/utils/test_solver.py @@ -0,0 +1,38 @@ +import torch +import pytest + +from nequip.utils.regressor import solver + + +@pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 1e-2]) +def test_random(full_rank, alpha, per_species_set): + + if alpha == 0 and not full_rank: + return + + torch.manual_seed(0) + + ref_mean, ref_std, E, n_samples, n_dim = per_species_set + + dtype = torch.get_default_dtype() + + X = torch.randint(low=1, high=10, size=(n_samples, n_dim)).to(dtype) + if not full_rank: + X[:, n_dim - 2] = X[:, n_dim - 1] * 2 + y = (X * E).sum(axis=-1) + + mean, std = solver(X, y, alpha=alpha) + + tolerance = torch.max(ref_std) + + print("tolerance", tolerance) + print("solution", mean, std) + print("diff", mean - ref_mean) + + if full_rank: + assert torch.allclose(ref_mean, mean, atol=tolerance) + else: + assert torch.allclose(mean[n_dim - 1], mean[n_dim - 2], atol=tolerance) + + assert torch.max(std) < tolerance From 6ae6710aa7ae145f2d0b6917671f7a53c23975ec Mon Sep 17 00:00:00 2001 From: Lixin Sun Date: Wed, 7 Dec 2022 16:32:15 +0000 Subject: [PATCH 32/50] Better documentation and guidance (#102) * add rst for how to setup a dataset * Update dataset.rst * upadte command line instruction * update link * update dataset keys in all_options * RST cleanup * command line reference * update the per_species doc * lint * test * another test * update all colders * fix misspell * add trainer to api * add install instructions to docs * simpler install Co-authored-by: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Co-authored-by: Lixin Sun --- configs/full.yaml | 9 +- docs/api/nequip.rst | 3 +- docs/api/trainer.rst | 10 ++ docs/cite.rst | 3 + docs/commandline/commands.rst | 132 ++++++++++++++++++ docs/{guide/FAQ.rst => errors/errors.rst} | 13 +- docs/faq/FAQ.rst | 14 ++ docs/guide/guide.rst | 9 -- 
docs/guide/intro.rst | 4 - docs/guide/irreps.rst | 9 -- docs/{guide => howto}/conventions.rst | 0 docs/howto/dataset.rst | 156 ++++++++++++++++++++++ docs/howto/howto.rst | 7 + docs/{guide => howto}/migrate.rst | 0 docs/index.rst | 12 +- docs/installation/install.rst | 39 ++++++ docs/introduction/intro.rst | 4 + docs/lammps/all.rst | 7 + docs/lammps/ase.rst | 2 + docs/lammps/lammps.rst | 2 + docs/options/dataset.rst | 10 +- docs/yaml/yaml.rst | 4 + nequip/data/dataset.py | 2 +- nequip/scripts/deploy.py | 2 +- nequip/scripts/evaluate.py | 10 +- nequip/scripts/train.py | 8 +- 26 files changed, 421 insertions(+), 50 deletions(-) create mode 100644 docs/api/trainer.rst create mode 100644 docs/cite.rst create mode 100644 docs/commandline/commands.rst rename docs/{guide/FAQ.rst => errors/errors.rst} (56%) create mode 100644 docs/faq/FAQ.rst delete mode 100644 docs/guide/guide.rst delete mode 100644 docs/guide/intro.rst delete mode 100644 docs/guide/irreps.rst rename docs/{guide => howto}/conventions.rst (100%) create mode 100644 docs/howto/dataset.rst create mode 100644 docs/howto/howto.rst rename docs/{guide => howto}/migrate.rst (100%) create mode 100644 docs/installation/install.rst create mode 100644 docs/introduction/intro.rst create mode 100644 docs/lammps/all.rst create mode 100644 docs/lammps/ase.rst create mode 100644 docs/lammps/lammps.rst create mode 100644 docs/yaml/yaml.rst diff --git a/configs/full.yaml b/configs/full.yaml index d310c70a..2f98164e 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -211,6 +211,8 @@ loss_coeffs: total_energy: - 1 - PerAtomMSELoss +# note that the ratio between force and energy loss matters for the training process. One may consider using 1:1 with the PerAtomMSELoss. If the energy loss still significantly dominate the loss function at the initial epochs, tune the energy loss weight lower helps the training a lot. + # # default loss function is MSELoss, the name has to be exactly the same as those in torch.nn. 
# the only supprted targets are forces and total_energy @@ -342,9 +344,10 @@ global_rescale_scale_trainable: false # global_rescale_shift_trainable: false # global_rescale_scale: dataset_forces_rms # global_rescale_scale_trainable: false -# per_species_rescale_trainable: true -# per_species_rescale_shifts: dataset_per_atom_total_energy_mean -# per_species_rescale_scales: dataset_per_atom_total_energy_std +# per_species_rescale_shifts_trainable: false +# per_species_rescale_scales_trainable: true +# per_species_rescale_shifts: dataset_per_species_total_energy_mean +# per_species_rescale_scales: dataset_per_species_forces_rms # # full block needed for global rescale # global_rescale_shift: dataset_total_energy_mean diff --git a/docs/api/nequip.rst b/docs/api/nequip.rst index 13bc37ca..6f6250cf 100644 --- a/docs/api/nequip.rst +++ b/docs/api/nequip.rst @@ -3,4 +3,5 @@ Python API .. toctree:: - data \ No newline at end of file + data + trainer diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst new file mode 100644 index 00000000..983e6f6b --- /dev/null +++ b/docs/api/trainer.rst @@ -0,0 +1,10 @@ +nequip.trainer +============== + + .. automodule:: nequip.train.trainer + :members: + :imported-members: + + .. automodule:: nequip.train.trainer_wandb + :members: + :imported-members: diff --git a/docs/cite.rst b/docs/cite.rst new file mode 100644 index 00000000..9f8296cc --- /dev/null +++ b/docs/cite.rst @@ -0,0 +1,3 @@ +Citing Nequip +============= + diff --git a/docs/commandline/commands.rst b/docs/commandline/commands.rst new file mode 100644 index 00000000..b58c87ab --- /dev/null +++ b/docs/commandline/commands.rst @@ -0,0 +1,132 @@ +Command-line Executables +======================== + +``nequip-train`` +---------------- + + .. code :: + + usage: nequip-train [-h] [--equivariance-test] [--model-debug-mode] [--grad-anomaly-mode] [--log LOG] config + +Train (or restart training of) a NequIP model. 
+ +positional arguments: + config YAML file configuring the model, dataset, and other options + +optional arguments: + -h, --help show this help message and exit + --equivariance-test test the model's equivariance before training + --model-debug-mode enable model debug mode, which can sometimes give much more useful error messages at the + cost of some speed. Do not use for production training! + --grad-anomaly-mode enable PyTorch autograd anomaly mode to debug NaN gradients. Do not use for production + training! + --log LOG log file to store all the screen logging + +``nequip-evaluate`` +------------------- + + .. code :: + + usage: nequip-evaluate [-h] [--train-dir TRAIN_DIR] [--model MODEL] [--dataset-config DATASET_CONFIG] + [--metrics-config METRICS_CONFIG] [--test-indexes TEST_INDEXES] [--batch-size BATCH_SIZE] + [--device DEVICE] [--output OUTPUT] [--log LOG] + +Compute the error of a model on a test set using various metrics. The model, metrics, dataset, etc. can be specified +in individual YAML config files, or a training session can be indicated with ``--train-dir``. In order of priority, +the global settings (dtype, TensorFloat32, etc.) are taken from: (1) the model config (for a training session), (2) +the dataset config (for a deployed model), or (3) the defaults. Prints only the final result in ``name = num`` format +to stdout; all other information is ``logging.debug``ed to stderr. WARNING: Please note that results of CUDA models +are rarely exactly reproducible, and that even CPU models can be nondeterministic. + +optional arguments: + -h, --help show this help message and exit + --train-dir TRAIN_DIR + Path to a working directory from a training session. + --model MODEL A deployed or pickled NequIP model to load. If omitted, defaults to `best_model.pth` in + `train_dir`. + --dataset-config DATASET_CONFIG + A YAML config file specifying the dataset to load test data from.
If omitted, `config.yaml` + in `train_dir` will be used + --metrics-config METRICS_CONFIG + A YAML config file specifying the metrics to compute. If omitted, `config.yaml` in + `train_dir` will be used. If the config does not specify `metrics_components`, the default + is to logging.debug MAEs and RMSEs for all fields given in the loss function. If the + literal string `None`, no metrics will be computed. + --test-indexes TEST_INDEXES + Path to a file containing the indexes in the dataset that make up the test set. If omitted, + all data frames *not* used as training or validation data in the training session + `train_dir` will be used. + --batch-size BATCH_SIZE + Batch size to use. Larger is usually faster on GPU. + --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU + otherwise. + --output OUTPUT XYZ file to write out the test set and model predicted forces, energies, etc. to. + --log LOG log file to store all the metrics and screen logging.debug + +``nequip-deploy`` +----------------- + + .. code :: + + usage: nequip-deploy [-h] {info,build} ... + +Deploy and view information about previously deployed NequIP models. + +optional arguments: + -h, --help show this help message and exit + +commands: + {info,build} + info Get information from a deployed model file + build Build a deployment model + +``nequip-deploy info`` +~~~~~~~~~~~~~~~~~~~~~~ + + .. code :: + + usage: nequip-deploy info [-h] model_path + +positional arguments: + model_path Path to a deployed model file. + +optional arguments: + -h, --help show this help message and exit + + +``nequip-deploy build`` +~~~~~~~~~~~~~~~~~~~~~~~ + + .. code :: + + usage: nequip-deploy build [-h] train_dir out_file + +positional arguments: + train_dir Path to a working directory from a training session. + out_file Output file for deployed model. + +optional arguments: + -h, --help show this help message and exit + + +``nequip-benchmark`` +-------------------- + + .. 
code :: + + usage: nequip-benchmark [-h] [--profile PROFILE] [--device DEVICE] [-n N] [--n-data N_DATA] [--timestep TIMESTEP] + config + +Benchmark the approximate MD performance of a given model configuration / dataset pair. + +positional arguments: + config configuration file + +optional arguments: + -h, --help show this help message and exit + --profile PROFILE Profile instead of timing, creating and outputing a Chrome trace JSON to the given path. + --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU + otherwise. + -n N Number of trials. + --n-data N_DATA Number of frames to use. + --timestep TIMESTEP MD timestep for ns/day esimation, in fs. Defauts to 1fs. diff --git a/docs/guide/FAQ.rst b/docs/errors/errors.rst similarity index 56% rename from docs/guide/FAQ.rst rename to docs/errors/errors.rst index 92ac758e..576e553d 100644 --- a/docs/guide/FAQ.rst +++ b/docs/errors/errors.rst @@ -1,14 +1,5 @@ -FAQ -=== - -How do I... ------------ - -... continue to train a model that reached a stopping condition? - There will be an answer here. - -1. Reload the model trained with version 0.3.3 to the code in 0.4. - check out the migration note at :ref:`migration_note`. +Errors +====== Common errors ------------- diff --git a/docs/faq/FAQ.rst b/docs/faq/FAQ.rst new file mode 100644 index 00000000..411e77c1 --- /dev/null +++ b/docs/faq/FAQ.rst @@ -0,0 +1,14 @@ +FAQ +=== + +How do I... +----------- + +... continue to train a model that reached a stopping condition? + There will be an answer here. + +1. Reload the model trained with version 0.3.3 to the code in 0.4. + check out the migration note at :ref:`migration_note`. + +2. Specify my dataset for `nequip-train` and `nequip-eval`, see :ref:`_dataset_note`. + diff --git a/docs/guide/guide.rst b/docs/guide/guide.rst deleted file mode 100644 index 6def3859..00000000 --- a/docs/guide/guide.rst +++ /dev/null @@ -1,9 +0,0 @@ -NequIP User Guide -================= - - .. 
toctree:: - - intro - irreps - conventions - FAQ \ No newline at end of file diff --git a/docs/guide/intro.rst b/docs/guide/intro.rst deleted file mode 100644 index 7afa4132..00000000 --- a/docs/guide/intro.rst +++ /dev/null @@ -1,4 +0,0 @@ -Tutorial: Introduction to NequIP -================================ - -TODO \ No newline at end of file diff --git a/docs/guide/irreps.rst b/docs/guide/irreps.rst deleted file mode 100644 index 5f9b2735..00000000 --- a/docs/guide/irreps.rst +++ /dev/null @@ -1,9 +0,0 @@ -Irreps -====== - -.. _Irreps: - -Syntax to specify irreps ------------------------- - -TODO: descripe irreps syntax here \ No newline at end of file diff --git a/docs/guide/conventions.rst b/docs/howto/conventions.rst similarity index 100% rename from docs/guide/conventions.rst rename to docs/howto/conventions.rst diff --git a/docs/howto/dataset.rst b/docs/howto/dataset.rst new file mode 100644 index 00000000..2b5267e7 --- /dev/null +++ b/docs/howto/dataset.rst @@ -0,0 +1,156 @@ +.. _dataset_note: + +How to prepare training dataset +=============================== + +What does NequIP behind the scene +--------------------------------- + +NequIP uses AtomicDataset class to store the atomic configurations. +During the initialization of an AtomicDataset object, +NequIP reads the atomic structures from the dataset, +computes the neighbor list and other data structures needed for the GNN +by converting raw data to a list of ``AtomicData`` objects. + +The computed results are then cached on harddisk ``root/processed_hashkey`` folder. +The hashing is based on all the metadata provided for the dataset, +which includes the file name, the cutoff radius, float number precision and etc. +In the case where multiple training/evaluation runs use the same dataset, +the neighbor list will only be computed in the first NequIP run. +The later runs will directly load the AtomicDataset object from the cache file to save computation time. + +Note: be careful to the cached file. 
If you update your raw data file but keep using the same filename, +NequIP will not automatically update the cached data. + +Key concepts +------------ + +fixed_fields +~~~~~~~~~~~~ +Fixed fields are the quantities that are shared among all the configurations in the dataset. +For example, if the dataset is a trajectory of an NVT MD simulation, the super cell size and the atomic species +are indeed a constant matrix/vector through out the whole dataset. +In this case, in stead of repeating the same values for many times, +we specify the cell and species as fixed fields and only provide them once. + +yaml interface +~~~~~~~~~~~~~~ +``nequip-train`` and ``nequip-evaluate`` automatically construct the AtomicDataset based on the yaml arguments. +Later sections offer a couple different examples. + +If the training and validation datasets are from different raw files, the arguments for each set +can be defined with ``dataset`` prefix and ``validation_dataset`` prefix, respectively. + +For example, ``dataset_file_name`` is used for training data and ``validation_dataset_file_name`` is for validation data. + +Python interface +~~~~~~~~~~~~~~~~ +See ``nequip.data.dataset.AtomicInMemoryDataset``. + +Prepare dataset and specify in yaml config +------------------------------------------ + +ASE format +~~~~~~~~~~ + +NequIP accept all format that can be parsed by `ase.io.read` function. +We recommend `extxyz`. + +Example: Given an atomic data stored in "H2.extxyz" that looks like below: + +.. code:: extxyz + + 2 + Properties=species:S:1:pos:R:3 energy=-10 user_label=2.0 pbc="F F F" + H 0.00000000 0.00000000 0.00000000 + H 0.00000000 0.00000000 1.02000000 + +The yaml input should be + +.. 
code:: yaml + + dataset: ase + dataset_file_name: H2.extxyz + ase_args: + format: extxyz + include_keys: + - user_label + key_mapping: + user_label: label0 + chemical_symbol_to_type: + H: 0 + +For other formats than `extxyz`, be careful to the ase parsers; they may have different behavior from the extxyz parser. +For example, the ase vasp parser store potential energy to `free_energy` instead of `energy`. +Because we optimize our code to the `extxyz` parser, NequIP will not be able to load any `total_energy` labels. +We need some additional keys to help NequIP to understand the situtaion +Here's an example for vasp outcar. + +.. code:: yaml + + dataset: ase + dataset_file_name: OUTCAR + ase_args: + format: vasp-out + key_mapping: + free_energy: total_energy + chemical_symbol_to_type: + H: 0 + +The way around is to use key mapping, please see more note below. + +NPZ format +~~~~~~~~~~ + +If your dataset constitute configurations that always have the same number of atoms, npz data format can be an option. + +In the npz file, all the values should have the same row as the number of the configurations. +For example, the force array of 36 atomic configurations of an N-atom system should have the shape of (36, N, 3); +their total_energy array should have the shape of (36). + +Below is an example of the yaml specification. + +.. code:: yaml + + dataset: npz + dataset_file_name: example.npz + include_keys: + - user_label1 + - user_label2 + npz_fixed_field_keys: + - cell + - atomic_numbers + key_mapping: + position: pos + force: forces + energy: total_energy + Z: atomic_numbers + + +Note on key mapping +~~~~~~~~~~~~~~~~~~~ + +NequIP has default key names for energy, force, cell (defined at nequip.data._keys) +Unlike in the ASE format where these information is automatically parsed, +in the npz data format, the correct key names have to be provided. +The common key names are: `total_energy`, `forces`, `atomic_numbers`, `pos`, `cell`, `pbc`. 
+the key_mapping can help to convert the user defined name (key) to NequIP default name (value). + + +Advanced options +---------------- + +skip frames during data processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The `include_frame` argument can be specified in yaml to skip certain frames in the raw datafile. +The item has to be a list or a python iteratable object. + +register user-defined graph, node, edge fields +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Graph, node, edge fields are quantities that belong to +the whole graph, each atom, each edge, respectively. +Example graph fields include cell, pbc, and total_energy. +Example node fields include pos, forces + +To help NequIP to properly assemble the batch data, graph quantity other than +cell, pbc, total_energy should be registered. diff --git a/docs/howto/howto.rst b/docs/howto/howto.rst new file mode 100644 index 00000000..07e84e84 --- /dev/null +++ b/docs/howto/howto.rst @@ -0,0 +1,7 @@ +How-to Tutorials +================ + + .. toctree:: + + dataset + migrate diff --git a/docs/guide/migrate.rst b/docs/howto/migrate.rst similarity index 100% rename from docs/guide/migrate.rst rename to docs/howto/migrate.rst diff --git a/docs/index.rst b/docs/index.rst index dc6ecd43..d2edd1a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,12 +9,20 @@ NequIP NequIP is an open-source package for creating, training, and using E(3)-equivariant machine learning interatomic potentials. .. toctree:: - :maxdepth: 3 + :maxdepth: 2 :caption: Contents: - guide/guide + introduction/intro + cite + installation/install + yaml/yaml + howto/howto + faq/FAQ + commandline/commands + lammps/all options/options api/nequip + errors/errors diff --git a/docs/installation/install.rst b/docs/installation/install.rst new file mode 100644 index 00000000..3e946815 --- /dev/null +++ b/docs/installation/install.rst @@ -0,0 +1,39 @@ +Installation +============ + +NequIP requires: + + * Python >= 3.6 + * PyTorch >= 1.8, <=1.11.*. 
PyTorch can be installed following the `instructions from their documentation `_. Note that neither ``torchvision`` nor ``torchaudio``, included in the default install command, are needed for NequIP. + +To install: + + * We use `Weights&Biases `_ to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account `here `_ and install the Python package:: + + pip install wandb + + * Install the latest stable NequIP:: + + pip install https://github.com/mir-group/nequip/archive/main.zip + +To install previous versions of NequIP, please clone the repository from GitHub and check out the appropriate tag (for example ``v0.3.3`` for version 0.3.3). + +To install the current **unstable** development version of NequIP, please clone our repository and check out the ``develop`` branch. + +Installation Issues +------------------- + +The easiest way to check if your installation is working is to train a _toy_ model:: + + nequip-train configs/minimal.yaml + +If you suspect something is wrong, encounter errors, or just want to confirm that everything is in working order, you can also run the unit tests:: + + pip install pytest + pytest tests/unit/ + +To run the full tests, including a set of longer/more intensive integration tests, run:: + + pytest tests/ + +If a GPU is present, the unit tests will use it. \ No newline at end of file diff --git a/docs/introduction/intro.rst b/docs/introduction/intro.rst new file mode 100644 index 00000000..e0dcc32c --- /dev/null +++ b/docs/introduction/intro.rst @@ -0,0 +1,4 @@ +Overview +======== + +TODO diff --git a/docs/lammps/all.rst b/docs/lammps/all.rst new file mode 100644 index 00000000..9faac07e --- /dev/null +++ b/docs/lammps/all.rst @@ -0,0 +1,7 @@ +Integration to LAMMPS, ASE +========================== + + .. 
toctree:: + + lammps + ase diff --git a/docs/lammps/ase.rst b/docs/lammps/ase.rst new file mode 100644 index 00000000..3729cde3 --- /dev/null +++ b/docs/lammps/ase.rst @@ -0,0 +1,2 @@ +ASE +=== diff --git a/docs/lammps/lammps.rst b/docs/lammps/lammps.rst new file mode 100644 index 00000000..f9d0ba9f --- /dev/null +++ b/docs/lammps/lammps.rst @@ -0,0 +1,2 @@ +LAMMPS +====== diff --git a/docs/options/dataset.rst b/docs/options/dataset.rst index 54b39fc9..f3ca194c 100644 --- a/docs/options/dataset.rst +++ b/docs/options/dataset.rst @@ -33,7 +33,7 @@ key_mapping | Type: dict | Default: ``{'positions': 'pos', 'energy': 'total_energy', 'force': 'forces', 'forces': 'forces', 'Z': 'atomic_numbers', 'atomic_number': 'atomic_numbers'}`` -npz_keys +include_keys ^^^^^^^^ | Type: list | Default: ``[]`` @@ -68,5 +68,11 @@ include_frames | Type: NoneType | Default: ``None`` +ase_args +^^^^^^^^ + | Type: dict + | Default: ``{}`` + Advanced --------- \ No newline at end of file +-------- +See tutorial on :ref:`../guide/_dataset_note`. diff --git a/docs/yaml/yaml.rst b/docs/yaml/yaml.rst new file mode 100644 index 00000000..fd804436 --- /dev/null +++ b/docs/yaml/yaml.rst @@ -0,0 +1,4 @@ +YAML input +========== + +TODO diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index 2b2279d9..c38b8eae 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -641,7 +641,7 @@ class NpzDataset(AtomicInMemoryDataset): """Load data from an npz file. To avoid loading unneeded data, keys are ignored by default unless they are in ``key_mapping``, ``include_keys``, - ``npz_fixed_fields`` or ``extra_fixed_fields``. + ``npz_fixed_fields_keys`` or ``extra_fixed_fields``. Args: key_mapping (Dict[str, str]): mapping of npz keys to ``AtomicData`` keys. 
Optional diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 7aafd5fc..4d7686e1 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -129,7 +129,7 @@ def load_deployed_model( def main(args=None): parser = argparse.ArgumentParser( - description="Create and view information about deployed NequIP potentials." + description="Deploy and view information about previously deployed NequIP models." ) # backward compat for 3.6 if sys.version_info[1] > 6: diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index 7c4e2abc..f7dfa12b 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -30,13 +30,13 @@ def main(args=None, running_as_script: bool = True): description=textwrap.dedent( """Compute the error of a model on a test set using various metrics. - The model, metrics, dataset, etc. can specified individually, or a training session can be indicated with `--train-dir`. + The model, metrics, dataset, etc. can specified in individual YAML config files, or a training session can be indicated with `--train-dir`. In order of priority, the global settings (dtype, TensorFloat32, etc.) are taken from: - 1. The model config (for a training session) - 2. The dataset config (for a deployed model) - 3. The defaults + (1) the model config (for a training session), + (2) the dataset config (for a deployed model), + or (3) the defaults. - Prints only the final result in `name = num` format to stdout; all other information is logging.debuged to stderr. + Prints only the final result in `name = num` format to stdout; all other information is `logging.debug`ed to stderr. WARNING: Please note that results of CUDA models are rarely exactly reproducible, and that even CPU models can be nondeterministic. 
""" diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index c6aa7785..88b55f7e 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -81,8 +81,12 @@ def main(args=None, running_as_script: bool = True): def parse_command_line(args=None): - parser = argparse.ArgumentParser(description="Train a NequIP model.") - parser.add_argument("config", help="configuration file") + parser = argparse.ArgumentParser( + description="Train (or restart training of) a NequIP model." + ) + parser.add_argument( + "config", help="YAML file configuring the model, dataset, and other options" + ) parser.add_argument( "--equivariance-test", help="test the model's equivariance before training on n (default 1) random frames from the dataset", From 89977eab75dbc594dc3f7751b77f7f833a0df66f Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 7 Dec 2022 14:22:57 -0500 Subject: [PATCH 33/50] Revert "Better documentation and guidance (#102)" This reverts commit 6ae6710aa7ae145f2d0b6917671f7a53c23975ec. 
--- configs/full.yaml | 9 +- docs/api/nequip.rst | 3 +- docs/api/trainer.rst | 10 -- docs/cite.rst | 3 - docs/commandline/commands.rst | 132 ------------------ docs/faq/FAQ.rst | 14 -- docs/{errors/errors.rst => guide/FAQ.rst} | 13 +- docs/{howto => guide}/conventions.rst | 0 docs/guide/guide.rst | 9 ++ docs/guide/intro.rst | 4 + docs/guide/irreps.rst | 9 ++ docs/{howto => guide}/migrate.rst | 0 docs/howto/dataset.rst | 156 ---------------------- docs/howto/howto.rst | 7 - docs/index.rst | 12 +- docs/installation/install.rst | 39 ------ docs/introduction/intro.rst | 4 - docs/lammps/all.rst | 7 - docs/lammps/ase.rst | 2 - docs/lammps/lammps.rst | 2 - docs/options/dataset.rst | 10 +- docs/yaml/yaml.rst | 4 - nequip/data/dataset.py | 2 +- nequip/scripts/deploy.py | 2 +- nequip/scripts/evaluate.py | 10 +- nequip/scripts/train.py | 8 +- 26 files changed, 50 insertions(+), 421 deletions(-) delete mode 100644 docs/api/trainer.rst delete mode 100644 docs/cite.rst delete mode 100644 docs/commandline/commands.rst delete mode 100644 docs/faq/FAQ.rst rename docs/{errors/errors.rst => guide/FAQ.rst} (56%) rename docs/{howto => guide}/conventions.rst (100%) create mode 100644 docs/guide/guide.rst create mode 100644 docs/guide/intro.rst create mode 100644 docs/guide/irreps.rst rename docs/{howto => guide}/migrate.rst (100%) delete mode 100644 docs/howto/dataset.rst delete mode 100644 docs/howto/howto.rst delete mode 100644 docs/installation/install.rst delete mode 100644 docs/introduction/intro.rst delete mode 100644 docs/lammps/all.rst delete mode 100644 docs/lammps/ase.rst delete mode 100644 docs/lammps/lammps.rst delete mode 100644 docs/yaml/yaml.rst diff --git a/configs/full.yaml b/configs/full.yaml index 2f98164e..d310c70a 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -211,8 +211,6 @@ loss_coeffs: total_energy: - 1 - PerAtomMSELoss -# note that the ratio between force and energy loss matters for the training process. 
One may consider using 1:1 with the PerAtomMSELoss. If the energy loss still significantly dominate the loss function at the initial epochs, tune the energy loss weight lower helps the training a lot. - # # default loss function is MSELoss, the name has to be exactly the same as those in torch.nn. # the only supprted targets are forces and total_energy @@ -344,10 +342,9 @@ global_rescale_scale_trainable: false # global_rescale_shift_trainable: false # global_rescale_scale: dataset_forces_rms # global_rescale_scale_trainable: false -# per_species_rescale_shifts_trainable: false -# per_species_rescale_scales_trainable: true -# per_species_rescale_shifts: dataset_per_species_total_energy_mean -# per_species_rescale_scales: dataset_per_species_forces_rms +# per_species_rescale_trainable: true +# per_species_rescale_shifts: dataset_per_atom_total_energy_mean +# per_species_rescale_scales: dataset_per_atom_total_energy_std # # full block needed for global rescale # global_rescale_shift: dataset_total_energy_mean diff --git a/docs/api/nequip.rst b/docs/api/nequip.rst index 6f6250cf..13bc37ca 100644 --- a/docs/api/nequip.rst +++ b/docs/api/nequip.rst @@ -3,5 +3,4 @@ Python API .. toctree:: - data - trainer + data \ No newline at end of file diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst deleted file mode 100644 index 983e6f6b..00000000 --- a/docs/api/trainer.rst +++ /dev/null @@ -1,10 +0,0 @@ -nequip.trainer -============== - - .. automodule:: nequip.train.trainer - :members: - :imported-members: - - .. 
automodule:: nequip.train.trainer_wandb - :members: - :imported-members: diff --git a/docs/cite.rst b/docs/cite.rst deleted file mode 100644 index 9f8296cc..00000000 --- a/docs/cite.rst +++ /dev/null @@ -1,3 +0,0 @@ -Citing Nequip -============= - diff --git a/docs/commandline/commands.rst b/docs/commandline/commands.rst deleted file mode 100644 index b58c87ab..00000000 --- a/docs/commandline/commands.rst +++ /dev/null @@ -1,132 +0,0 @@ -Command-line Executables -======================== - -``nequip-train`` ----------------- - - .. code :: - - usage: nequip-train [-h] [--equivariance-test] [--model-debug-mode] [--grad-anomaly-mode] [--log LOG] config - -Train (or restart training of) a NequIP model. - -positional arguments: - config YAML file configuring the model, dataset, and other options - -optional arguments: - -h, --help show this help message and exit - --equivariance-test test the model's equivariance before training - --model-debug-mode enable model debug mode, which can sometimes give much more useful error messages at the - cost of some speed. Do not use for production training! - --grad-anomaly-mode enable PyTorch autograd anomaly mode to debug NaN gradients. Do not use for production - training! - --log LOG log file to store all the screen logging - -``nequip-evaluate`` -------------------- - - .. code :: - - usage: nequip-evaluate [-h] [--train-dir TRAIN_DIR] [--model MODEL] [--dataset-config DATASET_CONFIG] - [--metrics-config METRICS_CONFIG] [--test-indexes TEST_INDEXES] [--batch-size BATCH_SIZE] - [--device DEVICE] [--output OUTPUT] [--log LOG] - -Compute the error of a model on a test set using various metrics. The model, metrics, dataset, etc. can specified -in individual YAML config files, or a training session can be indicated with ``--train-dir``. In order of priority, -the global settings (dtype, TensorFloat32, etc.) 
are taken from: (1) the model config (for a training session), (2) -the dataset config (for a deployed model), or (3) the defaults. Prints only the final result in ``name = num`` format -to stdout; all other information is ``logging.debug``ed to stderr. WARNING: Please note that results of CUDA models -are rarely exactly reproducible, and that even CPU models can be nondeterministic. - -optional arguments: - -h, --help show this help message and exit - --train-dir TRAIN_DIR - Path to a working directory from a training session. - --model MODEL A deployed or pickled NequIP model to load. If omitted, defaults to `best_model.pth` in - `train_dir`. - --dataset-config DATASET_CONFIG - A YAML config file specifying the dataset to load test data from. If omitted, `config.yaml` - in `train_dir` will be used - --metrics-config METRICS_CONFIG - A YAML config file specifying the metrics to compute. If omitted, `config.yaml` in - `train_dir` will be used. If the config does not specify `metrics_components`, the default - is to logging.debug MAEs and RMSEs for all fields given in the loss function. If the - literal string `None`, no metrics will be computed. - --test-indexes TEST_INDEXES - Path to a file containing the indexes in the dataset that make up the test set. If omitted, - all data frames *not* used as training or validation data in the training session - `train_dir` will be used. - --batch-size BATCH_SIZE - Batch size to use. Larger is usually faster on GPU. - --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU - otherwise. - --output OUTPUT XYZ file to write out the test set and model predicted forces, energies, etc. to. - --log LOG log file to store all the metrics and screen logging.debug - -``nequip-deploy`` ------------------ - - .. code :: - - usage: nequip-deploy [-h] {info,build} ... - -Deploy and view information about previously deployed NequIP models. 
- -optional arguments: - -h, --help show this help message and exit - -commands: - {info,build} - info Get information from a deployed model file - build Build a deployment model - -``nequip-deploy info`` -~~~~~~~~~~~~~~~~~~~~~~ - - .. code :: - - usage: nequip-deploy info [-h] model_path - -positional arguments: - model_path Path to a deployed model file. - -optional arguments: - -h, --help show this help message and exit - - -``nequip-deploy build`` -~~~~~~~~~~~~~~~~~~~~~~~ - - .. code :: - - usage: nequip-deploy build [-h] train_dir out_file - -positional arguments: - train_dir Path to a working directory from a training session. - out_file Output file for deployed model. - -optional arguments: - -h, --help show this help message and exit - - -``nequip-benchmark`` --------------------- - - .. code :: - - usage: nequip-benchmark [-h] [--profile PROFILE] [--device DEVICE] [-n N] [--n-data N_DATA] [--timestep TIMESTEP] - config - -Benchmark the approximate MD performance of a given model configuration / dataset pair. - -positional arguments: - config configuration file - -optional arguments: - -h, --help show this help message and exit - --profile PROFILE Profile instead of timing, creating and outputing a Chrome trace JSON to the given path. - --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU - otherwise. - -n N Number of trials. - --n-data N_DATA Number of frames to use. - --timestep TIMESTEP MD timestep for ns/day esimation, in fs. Defauts to 1fs. diff --git a/docs/faq/FAQ.rst b/docs/faq/FAQ.rst deleted file mode 100644 index 411e77c1..00000000 --- a/docs/faq/FAQ.rst +++ /dev/null @@ -1,14 +0,0 @@ -FAQ -=== - -How do I... ------------ - -... continue to train a model that reached a stopping condition? - There will be an answer here. - -1. Reload the model trained with version 0.3.3 to the code in 0.4. - check out the migration note at :ref:`migration_note`. - -2. 
Specify my dataset for `nequip-train` and `nequip-eval`, see :ref:`_dataset_note`. - diff --git a/docs/errors/errors.rst b/docs/guide/FAQ.rst similarity index 56% rename from docs/errors/errors.rst rename to docs/guide/FAQ.rst index 576e553d..92ac758e 100644 --- a/docs/errors/errors.rst +++ b/docs/guide/FAQ.rst @@ -1,5 +1,14 @@ -Errors -====== +FAQ +=== + +How do I... +----------- + +... continue to train a model that reached a stopping condition? + There will be an answer here. + +1. Reload the model trained with version 0.3.3 to the code in 0.4. + check out the migration note at :ref:`migration_note`. Common errors ------------- diff --git a/docs/howto/conventions.rst b/docs/guide/conventions.rst similarity index 100% rename from docs/howto/conventions.rst rename to docs/guide/conventions.rst diff --git a/docs/guide/guide.rst b/docs/guide/guide.rst new file mode 100644 index 00000000..6def3859 --- /dev/null +++ b/docs/guide/guide.rst @@ -0,0 +1,9 @@ +NequIP User Guide +================= + + .. toctree:: + + intro + irreps + conventions + FAQ \ No newline at end of file diff --git a/docs/guide/intro.rst b/docs/guide/intro.rst new file mode 100644 index 00000000..7afa4132 --- /dev/null +++ b/docs/guide/intro.rst @@ -0,0 +1,4 @@ +Tutorial: Introduction to NequIP +================================ + +TODO \ No newline at end of file diff --git a/docs/guide/irreps.rst b/docs/guide/irreps.rst new file mode 100644 index 00000000..5f9b2735 --- /dev/null +++ b/docs/guide/irreps.rst @@ -0,0 +1,9 @@ +Irreps +====== + +.. _Irreps: + +Syntax to specify irreps +------------------------ + +TODO: descripe irreps syntax here \ No newline at end of file diff --git a/docs/howto/migrate.rst b/docs/guide/migrate.rst similarity index 100% rename from docs/howto/migrate.rst rename to docs/guide/migrate.rst diff --git a/docs/howto/dataset.rst b/docs/howto/dataset.rst deleted file mode 100644 index 2b5267e7..00000000 --- a/docs/howto/dataset.rst +++ /dev/null @@ -1,156 +0,0 @@ -.. 
_dataset_note: - -How to prepare training dataset -=============================== - -What does NequIP behind the scene ---------------------------------- - -NequIP uses AtomicDataset class to store the atomic configurations. -During the initialization of an AtomicDataset object, -NequIP reads the atomic structures from the dataset, -computes the neighbor list and other data structures needed for the GNN -by converting raw data to a list of ``AtomicData`` objects. - -The computed results are then cached on harddisk ``root/processed_hashkey`` folder. -The hashing is based on all the metadata provided for the dataset, -which includes the file name, the cutoff radius, float number precision and etc. -In the case where multiple training/evaluation runs use the same dataset, -the neighbor list will only be computed in the first NequIP run. -The later runs will directly load the AtomicDataset object from the cache file to save computation time. - -Note: be careful to the cached file. If you update your raw data file but keep using the same filename, -NequIP will not automatically update the cached data. - -Key concepts ------------- - -fixed_fields -~~~~~~~~~~~~ -Fixed fields are the quantities that are shared among all the configurations in the dataset. -For example, if the dataset is a trajectory of an NVT MD simulation, the super cell size and the atomic species -are indeed a constant matrix/vector through out the whole dataset. -In this case, in stead of repeating the same values for many times, -we specify the cell and species as fixed fields and only provide them once. - -yaml interface -~~~~~~~~~~~~~~ -``nequip-train`` and ``nequip-evaluate`` automatically construct the AtomicDataset based on the yaml arguments. -Later sections offer a couple different examples. - -If the training and validation datasets are from different raw files, the arguments for each set -can be defined with ``dataset`` prefix and ``validation_dataset`` prefix, respectively. 
- -For example, ``dataset_file_name`` is used for training data and ``validation_dataset_file_name`` is for validation data. - -Python interface -~~~~~~~~~~~~~~~~ -See ``nequip.data.dataset.AtomicInMemoryDataset``. - -Prepare dataset and specify in yaml config ------------------------------------------- - -ASE format -~~~~~~~~~~ - -NequIP accept all format that can be parsed by `ase.io.read` function. -We recommend `extxyz`. - -Example: Given an atomic data stored in "H2.extxyz" that looks like below: - -.. code:: extxyz - - 2 - Properties=species:S:1:pos:R:3 energy=-10 user_label=2.0 pbc="F F F" - H 0.00000000 0.00000000 0.00000000 - H 0.00000000 0.00000000 1.02000000 - -The yaml input should be - -.. code:: yaml - - dataset: ase - dataset_file_name: H2.extxyz - ase_args: - format: extxyz - include_keys: - - user_label - key_mapping: - user_label: label0 - chemical_symbol_to_type: - H: 0 - -For other formats than `extxyz`, be careful to the ase parsers; they may have different behavior from the extxyz parser. -For example, the ase vasp parser store potential energy to `free_energy` instead of `energy`. -Because we optimize our code to the `extxyz` parser, NequIP will not be able to load any `total_energy` labels. -We need some additional keys to help NequIP to understand the situtaion -Here's an example for vasp outcar. - -.. code:: yaml - - dataset: ase - dataset_file_name: OUTCAR - ase_args: - format: vasp-out - key_mapping: - free_energy: total_energy - chemical_symbol_to_type: - H: 0 - -The way around is to use key mapping, please see more note below. - -NPZ format -~~~~~~~~~~ - -If your dataset constitute configurations that always have the same number of atoms, npz data format can be an option. - -In the npz file, all the values should have the same row as the number of the configurations. -For example, the force array of 36 atomic configurations of an N-atom system should have the shape of (36, N, 3); -their total_energy array should have the shape of (36). 
- -Below is an example of the yaml specification. - -.. code:: yaml - - dataset: npz - dataset_file_name: example.npz - include_keys: - - user_label1 - - user_label2 - npz_fixed_field_keys: - - cell - - atomic_numbers - key_mapping: - position: pos - force: forces - energy: total_energy - Z: atomic_numbers - - -Note on key mapping -~~~~~~~~~~~~~~~~~~~ - -NequIP has default key names for energy, force, cell (defined at nequip.data._keys) -Unlike in the ASE format where these information is automatically parsed, -in the npz data format, the correct key names have to be provided. -The common key names are: `total_energy`, `forces`, `atomic_numbers`, `pos`, `cell`, `pbc`. -the key_mapping can help to convert the user defined name (key) to NequIP default name (value). - - -Advanced options ----------------- - -skip frames during data processing -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The `include_frame` argument can be specified in yaml to skip certain frames in the raw datafile. -The item has to be a list or a python iteratable object. - -register user-defined graph, node, edge fields -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Graph, node, edge fields are quantities that belong to -the whole graph, each atom, each edge, respectively. -Example graph fields include cell, pbc, and total_energy. -Example node fields include pos, forces - -To help NequIP to properly assemble the batch data, graph quantity other than -cell, pbc, total_energy should be registered. diff --git a/docs/howto/howto.rst b/docs/howto/howto.rst deleted file mode 100644 index 07e84e84..00000000 --- a/docs/howto/howto.rst +++ /dev/null @@ -1,7 +0,0 @@ -How-to Tutorials -================ - - .. toctree:: - - dataset - migrate diff --git a/docs/index.rst b/docs/index.rst index d2edd1a6..dc6ecd43 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,20 +9,12 @@ NequIP NequIP is an open-source package for creating, training, and using E(3)-equivariant machine learning interatomic potentials. .. 
toctree:: - :maxdepth: 2 + :maxdepth: 3 :caption: Contents: - introduction/intro - cite - installation/install - yaml/yaml - howto/howto - faq/FAQ - commandline/commands - lammps/all + guide/guide options/options api/nequip - errors/errors diff --git a/docs/installation/install.rst b/docs/installation/install.rst deleted file mode 100644 index 3e946815..00000000 --- a/docs/installation/install.rst +++ /dev/null @@ -1,39 +0,0 @@ -Installation -============ - -NequIP requires: - - * Python >= 3.6 - * PyTorch >= 1.8, <=1.11.*. PyTorch can be installed following the `instructions from their documentation `_. Note that neither ``torchvision`` nor ``torchaudio``, included in the default install command, are needed for NequIP. - -To install: - - * We use `Weights&Biases `_ to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account `here `_ and install the Python package:: - - pip install wandb - - * Install the latest stable NequIP:: - - pip install https://github.com/mir-group/nequip/archive/main.zip - -To install previous versions of NequIP, please clone the repository from GitHub and check out the appropriate tag (for example ``v0.3.3`` for version 0.3.3). - -To install the current **unstable** development version of NequIP, please clone our repository and check out the ``develop`` branch. - -Installation Issues -------------------- - -The easiest way to check if your installation is working is to train a _toy_ model:: - - nequip-train configs/minimal.yaml - -If you suspect something is wrong, encounter errors, or just want to confirm that everything is in working order, you can also run the unit tests:: - - pip install pytest - pytest tests/unit/ - -To run the full tests, including a set of longer/more intensive integration tests, run:: - - pytest tests/ - -If a GPU is present, the unit tests will use it. 
\ No newline at end of file diff --git a/docs/introduction/intro.rst b/docs/introduction/intro.rst deleted file mode 100644 index e0dcc32c..00000000 --- a/docs/introduction/intro.rst +++ /dev/null @@ -1,4 +0,0 @@ -Overview -======== - -TODO diff --git a/docs/lammps/all.rst b/docs/lammps/all.rst deleted file mode 100644 index 9faac07e..00000000 --- a/docs/lammps/all.rst +++ /dev/null @@ -1,7 +0,0 @@ -Integration to LAMMPS, ASE -========================== - - .. toctree:: - - lammps - ase diff --git a/docs/lammps/ase.rst b/docs/lammps/ase.rst deleted file mode 100644 index 3729cde3..00000000 --- a/docs/lammps/ase.rst +++ /dev/null @@ -1,2 +0,0 @@ -ASE -=== diff --git a/docs/lammps/lammps.rst b/docs/lammps/lammps.rst deleted file mode 100644 index f9d0ba9f..00000000 --- a/docs/lammps/lammps.rst +++ /dev/null @@ -1,2 +0,0 @@ -LAMMPS -====== diff --git a/docs/options/dataset.rst b/docs/options/dataset.rst index f3ca194c..54b39fc9 100644 --- a/docs/options/dataset.rst +++ b/docs/options/dataset.rst @@ -33,7 +33,7 @@ key_mapping | Type: dict | Default: ``{'positions': 'pos', 'energy': 'total_energy', 'force': 'forces', 'forces': 'forces', 'Z': 'atomic_numbers', 'atomic_number': 'atomic_numbers'}`` -include_keys +npz_keys ^^^^^^^^ | Type: list | Default: ``[]`` @@ -68,11 +68,5 @@ include_frames | Type: NoneType | Default: ``None`` -ase_args -^^^^^^^^ - | Type: dict - | Default: ``{}`` - Advanced --------- -See tutorial on :ref:`../guide/_dataset_note`. +-------- \ No newline at end of file diff --git a/docs/yaml/yaml.rst b/docs/yaml/yaml.rst deleted file mode 100644 index fd804436..00000000 --- a/docs/yaml/yaml.rst +++ /dev/null @@ -1,4 +0,0 @@ -YAML input -========== - -TODO diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index c38b8eae..2b2279d9 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -641,7 +641,7 @@ class NpzDataset(AtomicInMemoryDataset): """Load data from an npz file. 
To avoid loading unneeded data, keys are ignored by default unless they are in ``key_mapping``, ``include_keys``, - ``npz_fixed_fields_keys`` or ``extra_fixed_fields``. + ``npz_fixed_fields`` or ``extra_fixed_fields``. Args: key_mapping (Dict[str, str]): mapping of npz keys to ``AtomicData`` keys. Optional diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 4d7686e1..7aafd5fc 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -129,7 +129,7 @@ def load_deployed_model( def main(args=None): parser = argparse.ArgumentParser( - description="Deploy and view information about previously deployed NequIP models." + description="Create and view information about deployed NequIP potentials." ) # backward compat for 3.6 if sys.version_info[1] > 6: diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index f7dfa12b..7c4e2abc 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -30,13 +30,13 @@ def main(args=None, running_as_script: bool = True): description=textwrap.dedent( """Compute the error of a model on a test set using various metrics. - The model, metrics, dataset, etc. can specified in individual YAML config files, or a training session can be indicated with `--train-dir`. + The model, metrics, dataset, etc. can specified individually, or a training session can be indicated with `--train-dir`. In order of priority, the global settings (dtype, TensorFloat32, etc.) are taken from: - (1) the model config (for a training session), - (2) the dataset config (for a deployed model), - or (3) the defaults. + 1. The model config (for a training session) + 2. The dataset config (for a deployed model) + 3. The defaults - Prints only the final result in `name = num` format to stdout; all other information is `logging.debug`ed to stderr. + Prints only the final result in `name = num` format to stdout; all other information is logging.debuged to stderr. 
WARNING: Please note that results of CUDA models are rarely exactly reproducible, and that even CPU models can be nondeterministic. """ diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index 88b55f7e..c6aa7785 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -81,12 +81,8 @@ def main(args=None, running_as_script: bool = True): def parse_command_line(args=None): - parser = argparse.ArgumentParser( - description="Train (or restart training of) a NequIP model." - ) - parser.add_argument( - "config", help="YAML file configuring the model, dataset, and other options" - ) + parser = argparse.ArgumentParser(description="Train a NequIP model.") + parser.add_argument("config", help="configuration file") parser.add_argument( "--equivariance-test", help="test the model's equivariance before training on n (default 1) random frames from the dataset", From 2b5eefb1891a8580ea7ec646bf71828e9ae188f2 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 7 Dec 2022 14:24:05 -0500 Subject: [PATCH 34/50] Revert "Revert "Better documentation and guidance (#102)"" This reverts commit 89977eab75dbc594dc3f7751b77f7f833a0df66f. 
--- configs/full.yaml | 9 +- docs/api/nequip.rst | 3 +- docs/api/trainer.rst | 10 ++ docs/cite.rst | 3 + docs/commandline/commands.rst | 132 ++++++++++++++++++ docs/{guide/FAQ.rst => errors/errors.rst} | 13 +- docs/faq/FAQ.rst | 14 ++ docs/guide/guide.rst | 9 -- docs/guide/intro.rst | 4 - docs/guide/irreps.rst | 9 -- docs/{guide => howto}/conventions.rst | 0 docs/howto/dataset.rst | 156 ++++++++++++++++++++++ docs/howto/howto.rst | 7 + docs/{guide => howto}/migrate.rst | 0 docs/index.rst | 12 +- docs/installation/install.rst | 39 ++++++ docs/introduction/intro.rst | 4 + docs/lammps/all.rst | 7 + docs/lammps/ase.rst | 2 + docs/lammps/lammps.rst | 2 + docs/options/dataset.rst | 10 +- docs/yaml/yaml.rst | 4 + nequip/data/dataset.py | 2 +- nequip/scripts/deploy.py | 2 +- nequip/scripts/evaluate.py | 10 +- nequip/scripts/train.py | 8 +- 26 files changed, 421 insertions(+), 50 deletions(-) create mode 100644 docs/api/trainer.rst create mode 100644 docs/cite.rst create mode 100644 docs/commandline/commands.rst rename docs/{guide/FAQ.rst => errors/errors.rst} (56%) create mode 100644 docs/faq/FAQ.rst delete mode 100644 docs/guide/guide.rst delete mode 100644 docs/guide/intro.rst delete mode 100644 docs/guide/irreps.rst rename docs/{guide => howto}/conventions.rst (100%) create mode 100644 docs/howto/dataset.rst create mode 100644 docs/howto/howto.rst rename docs/{guide => howto}/migrate.rst (100%) create mode 100644 docs/installation/install.rst create mode 100644 docs/introduction/intro.rst create mode 100644 docs/lammps/all.rst create mode 100644 docs/lammps/ase.rst create mode 100644 docs/lammps/lammps.rst create mode 100644 docs/yaml/yaml.rst diff --git a/configs/full.yaml b/configs/full.yaml index d310c70a..2f98164e 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -211,6 +211,8 @@ loss_coeffs: total_energy: - 1 - PerAtomMSELoss +# note that the ratio between force and energy loss matters for the training process. 
One may consider using 1:1 with the PerAtomMSELoss. If the energy loss still significantly dominate the loss function at the initial epochs, tune the energy loss weight lower helps the training a lot. + # # default loss function is MSELoss, the name has to be exactly the same as those in torch.nn. # the only supprted targets are forces and total_energy @@ -342,9 +344,10 @@ global_rescale_scale_trainable: false # global_rescale_shift_trainable: false # global_rescale_scale: dataset_forces_rms # global_rescale_scale_trainable: false -# per_species_rescale_trainable: true -# per_species_rescale_shifts: dataset_per_atom_total_energy_mean -# per_species_rescale_scales: dataset_per_atom_total_energy_std +# per_species_rescale_shifts_trainable: false +# per_species_rescale_scales_trainable: true +# per_species_rescale_shifts: dataset_per_species_total_energy_mean +# per_species_rescale_scales: dataset_per_species_forces_rms # # full block needed for global rescale # global_rescale_shift: dataset_total_energy_mean diff --git a/docs/api/nequip.rst b/docs/api/nequip.rst index 13bc37ca..6f6250cf 100644 --- a/docs/api/nequip.rst +++ b/docs/api/nequip.rst @@ -3,4 +3,5 @@ Python API .. toctree:: - data \ No newline at end of file + data + trainer diff --git a/docs/api/trainer.rst b/docs/api/trainer.rst new file mode 100644 index 00000000..983e6f6b --- /dev/null +++ b/docs/api/trainer.rst @@ -0,0 +1,10 @@ +nequip.trainer +============== + + .. automodule:: nequip.train.trainer + :members: + :imported-members: + + .. 
automodule:: nequip.train.trainer_wandb + :members: + :imported-members: diff --git a/docs/cite.rst b/docs/cite.rst new file mode 100644 index 00000000..9f8296cc --- /dev/null +++ b/docs/cite.rst @@ -0,0 +1,3 @@ +Citing Nequip +============= + diff --git a/docs/commandline/commands.rst b/docs/commandline/commands.rst new file mode 100644 index 00000000..b58c87ab --- /dev/null +++ b/docs/commandline/commands.rst @@ -0,0 +1,132 @@ +Command-line Executables +======================== + +``nequip-train`` +---------------- + + .. code :: + + usage: nequip-train [-h] [--equivariance-test] [--model-debug-mode] [--grad-anomaly-mode] [--log LOG] config + +Train (or restart training of) a NequIP model. + +positional arguments: + config YAML file configuring the model, dataset, and other options + +optional arguments: + -h, --help show this help message and exit + --equivariance-test test the model's equivariance before training + --model-debug-mode enable model debug mode, which can sometimes give much more useful error messages at the + cost of some speed. Do not use for production training! + --grad-anomaly-mode enable PyTorch autograd anomaly mode to debug NaN gradients. Do not use for production + training! + --log LOG log file to store all the screen logging + +``nequip-evaluate`` +------------------- + + .. code :: + + usage: nequip-evaluate [-h] [--train-dir TRAIN_DIR] [--model MODEL] [--dataset-config DATASET_CONFIG] + [--metrics-config METRICS_CONFIG] [--test-indexes TEST_INDEXES] [--batch-size BATCH_SIZE] + [--device DEVICE] [--output OUTPUT] [--log LOG] + +Compute the error of a model on a test set using various metrics. The model, metrics, dataset, etc. can specified +in individual YAML config files, or a training session can be indicated with ``--train-dir``. In order of priority, +the global settings (dtype, TensorFloat32, etc.) are taken from: (1) the model config (for a training session), (2) +the dataset config (for a deployed model), or (3) the defaults. 
Prints only the final result in ``name = num`` format +to stdout; all other information is ``logging.debug``ed to stderr. WARNING: Please note that results of CUDA models +are rarely exactly reproducible, and that even CPU models can be nondeterministic. + +optional arguments: + -h, --help show this help message and exit + --train-dir TRAIN_DIR + Path to a working directory from a training session. + --model MODEL A deployed or pickled NequIP model to load. If omitted, defaults to `best_model.pth` in + `train_dir`. + --dataset-config DATASET_CONFIG + A YAML config file specifying the dataset to load test data from. If omitted, `config.yaml` + in `train_dir` will be used + --metrics-config METRICS_CONFIG + A YAML config file specifying the metrics to compute. If omitted, `config.yaml` in + `train_dir` will be used. If the config does not specify `metrics_components`, the default + is to logging.debug MAEs and RMSEs for all fields given in the loss function. If the + literal string `None`, no metrics will be computed. + --test-indexes TEST_INDEXES + Path to a file containing the indexes in the dataset that make up the test set. If omitted, + all data frames *not* used as training or validation data in the training session + `train_dir` will be used. + --batch-size BATCH_SIZE + Batch size to use. Larger is usually faster on GPU. + --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU + otherwise. + --output OUTPUT XYZ file to write out the test set and model predicted forces, energies, etc. to. + --log LOG log file to store all the metrics and screen logging.debug + +``nequip-deploy`` +----------------- + + .. code :: + + usage: nequip-deploy [-h] {info,build} ... + +Deploy and view information about previously deployed NequIP models. 
+ +optional arguments: + -h, --help show this help message and exit + +commands: + {info,build} + info Get information from a deployed model file + build Build a deployment model + +``nequip-deploy info`` +~~~~~~~~~~~~~~~~~~~~~~ + + .. code :: + + usage: nequip-deploy info [-h] model_path + +positional arguments: + model_path Path to a deployed model file. + +optional arguments: + -h, --help show this help message and exit + + +``nequip-deploy build`` +~~~~~~~~~~~~~~~~~~~~~~~ + + .. code :: + + usage: nequip-deploy build [-h] train_dir out_file + +positional arguments: + train_dir Path to a working directory from a training session. + out_file Output file for deployed model. + +optional arguments: + -h, --help show this help message and exit + + +``nequip-benchmark`` +-------------------- + + .. code :: + + usage: nequip-benchmark [-h] [--profile PROFILE] [--device DEVICE] [-n N] [--n-data N_DATA] [--timestep TIMESTEP] + config + +Benchmark the approximate MD performance of a given model configuration / dataset pair. + +positional arguments: + config configuration file + +optional arguments: + -h, --help show this help message and exit + --profile PROFILE Profile instead of timing, creating and outputing a Chrome trace JSON to the given path. + --device DEVICE Device to run the model on. If not provided, defaults to CUDA if available and CPU + otherwise. + -n N Number of trials. + --n-data N_DATA Number of frames to use. + --timestep TIMESTEP MD timestep for ns/day esimation, in fs. Defauts to 1fs. diff --git a/docs/guide/FAQ.rst b/docs/errors/errors.rst similarity index 56% rename from docs/guide/FAQ.rst rename to docs/errors/errors.rst index 92ac758e..576e553d 100644 --- a/docs/guide/FAQ.rst +++ b/docs/errors/errors.rst @@ -1,14 +1,5 @@ -FAQ -=== - -How do I... ------------ - -... continue to train a model that reached a stopping condition? - There will be an answer here. - -1. Reload the model trained with version 0.3.3 to the code in 0.4. 
- check out the migration note at :ref:`migration_note`. +Errors +====== Common errors ------------- diff --git a/docs/faq/FAQ.rst b/docs/faq/FAQ.rst new file mode 100644 index 00000000..411e77c1 --- /dev/null +++ b/docs/faq/FAQ.rst @@ -0,0 +1,14 @@ +FAQ +=== + +How do I... +----------- + +... continue to train a model that reached a stopping condition? + There will be an answer here. + +1. Reload the model trained with version 0.3.3 to the code in 0.4. + check out the migration note at :ref:`migration_note`. + +2. Specify my dataset for `nequip-train` and `nequip-eval`, see :ref:`_dataset_note`. + diff --git a/docs/guide/guide.rst b/docs/guide/guide.rst deleted file mode 100644 index 6def3859..00000000 --- a/docs/guide/guide.rst +++ /dev/null @@ -1,9 +0,0 @@ -NequIP User Guide -================= - - .. toctree:: - - intro - irreps - conventions - FAQ \ No newline at end of file diff --git a/docs/guide/intro.rst b/docs/guide/intro.rst deleted file mode 100644 index 7afa4132..00000000 --- a/docs/guide/intro.rst +++ /dev/null @@ -1,4 +0,0 @@ -Tutorial: Introduction to NequIP -================================ - -TODO \ No newline at end of file diff --git a/docs/guide/irreps.rst b/docs/guide/irreps.rst deleted file mode 100644 index 5f9b2735..00000000 --- a/docs/guide/irreps.rst +++ /dev/null @@ -1,9 +0,0 @@ -Irreps -====== - -.. _Irreps: - -Syntax to specify irreps ------------------------- - -TODO: descripe irreps syntax here \ No newline at end of file diff --git a/docs/guide/conventions.rst b/docs/howto/conventions.rst similarity index 100% rename from docs/guide/conventions.rst rename to docs/howto/conventions.rst diff --git a/docs/howto/dataset.rst b/docs/howto/dataset.rst new file mode 100644 index 00000000..2b5267e7 --- /dev/null +++ b/docs/howto/dataset.rst @@ -0,0 +1,156 @@ +.. 
_dataset_note: + +How to prepare training dataset +=============================== + +What does NequIP behind the scene +--------------------------------- + +NequIP uses AtomicDataset class to store the atomic configurations. +During the initialization of an AtomicDataset object, +NequIP reads the atomic structures from the dataset, +computes the neighbor list and other data structures needed for the GNN +by converting raw data to a list of ``AtomicData`` objects. + +The computed results are then cached on harddisk ``root/processed_hashkey`` folder. +The hashing is based on all the metadata provided for the dataset, +which includes the file name, the cutoff radius, float number precision and etc. +In the case where multiple training/evaluation runs use the same dataset, +the neighbor list will only be computed in the first NequIP run. +The later runs will directly load the AtomicDataset object from the cache file to save computation time. + +Note: be careful to the cached file. If you update your raw data file but keep using the same filename, +NequIP will not automatically update the cached data. + +Key concepts +------------ + +fixed_fields +~~~~~~~~~~~~ +Fixed fields are the quantities that are shared among all the configurations in the dataset. +For example, if the dataset is a trajectory of an NVT MD simulation, the super cell size and the atomic species +are indeed a constant matrix/vector through out the whole dataset. +In this case, in stead of repeating the same values for many times, +we specify the cell and species as fixed fields and only provide them once. + +yaml interface +~~~~~~~~~~~~~~ +``nequip-train`` and ``nequip-evaluate`` automatically construct the AtomicDataset based on the yaml arguments. +Later sections offer a couple different examples. + +If the training and validation datasets are from different raw files, the arguments for each set +can be defined with ``dataset`` prefix and ``validation_dataset`` prefix, respectively. 
+ +For example, ``dataset_file_name`` is used for training data and ``validation_dataset_file_name`` is for validation data. + +Python interface +~~~~~~~~~~~~~~~~ +See ``nequip.data.dataset.AtomicInMemoryDataset``. + +Prepare dataset and specify in yaml config +------------------------------------------ + +ASE format +~~~~~~~~~~ + +NequIP accept all format that can be parsed by `ase.io.read` function. +We recommend `extxyz`. + +Example: Given an atomic data stored in "H2.extxyz" that looks like below: + +.. code:: extxyz + + 2 + Properties=species:S:1:pos:R:3 energy=-10 user_label=2.0 pbc="F F F" + H 0.00000000 0.00000000 0.00000000 + H 0.00000000 0.00000000 1.02000000 + +The yaml input should be + +.. code:: yaml + + dataset: ase + dataset_file_name: H2.extxyz + ase_args: + format: extxyz + include_keys: + - user_label + key_mapping: + user_label: label0 + chemical_symbol_to_type: + H: 0 + +For other formats than `extxyz`, be careful to the ase parsers; they may have different behavior from the extxyz parser. +For example, the ase vasp parser store potential energy to `free_energy` instead of `energy`. +Because we optimize our code to the `extxyz` parser, NequIP will not be able to load any `total_energy` labels. +We need some additional keys to help NequIP to understand the situtaion +Here's an example for vasp outcar. + +.. code:: yaml + + dataset: ase + dataset_file_name: OUTCAR + ase_args: + format: vasp-out + key_mapping: + free_energy: total_energy + chemical_symbol_to_type: + H: 0 + +The way around is to use key mapping, please see more note below. + +NPZ format +~~~~~~~~~~ + +If your dataset constitute configurations that always have the same number of atoms, npz data format can be an option. + +In the npz file, all the values should have the same row as the number of the configurations. +For example, the force array of 36 atomic configurations of an N-atom system should have the shape of (36, N, 3); +their total_energy array should have the shape of (36). 
+ +Below is an example of the yaml specification. + +.. code:: yaml + + dataset: npz + dataset_file_name: example.npz + include_keys: + - user_label1 + - user_label2 + npz_fixed_field_keys: + - cell + - atomic_numbers + key_mapping: + position: pos + force: forces + energy: total_energy + Z: atomic_numbers + + +Note on key mapping +~~~~~~~~~~~~~~~~~~~ + +NequIP has default key names for energy, force, cell (defined at nequip.data._keys) +Unlike in the ASE format where these information is automatically parsed, +in the npz data format, the correct key names have to be provided. +The common key names are: `total_energy`, `forces`, `atomic_numbers`, `pos`, `cell`, `pbc`. +the key_mapping can help to convert the user defined name (key) to NequIP default name (value). + + +Advanced options +---------------- + +skip frames during data processing +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +The `include_frame` argument can be specified in yaml to skip certain frames in the raw datafile. +The item has to be a list or a python iteratable object. + +register user-defined graph, node, edge fields +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Graph, node, edge fields are quantities that belong to +the whole graph, each atom, each edge, respectively. +Example graph fields include cell, pbc, and total_energy. +Example node fields include pos, forces + +To help NequIP to properly assemble the batch data, graph quantity other than +cell, pbc, total_energy should be registered. diff --git a/docs/howto/howto.rst b/docs/howto/howto.rst new file mode 100644 index 00000000..07e84e84 --- /dev/null +++ b/docs/howto/howto.rst @@ -0,0 +1,7 @@ +How-to Tutorials +================ + + .. 
toctree:: + + dataset + migrate diff --git a/docs/guide/migrate.rst b/docs/howto/migrate.rst similarity index 100% rename from docs/guide/migrate.rst rename to docs/howto/migrate.rst diff --git a/docs/index.rst b/docs/index.rst index dc6ecd43..d2edd1a6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -9,12 +9,20 @@ NequIP NequIP is an open-source package for creating, training, and using E(3)-equivariant machine learning interatomic potentials. .. toctree:: - :maxdepth: 3 + :maxdepth: 2 :caption: Contents: - guide/guide + introduction/intro + cite + installation/install + yaml/yaml + howto/howto + faq/FAQ + commandline/commands + lammps/all options/options api/nequip + errors/errors diff --git a/docs/installation/install.rst b/docs/installation/install.rst new file mode 100644 index 00000000..3e946815 --- /dev/null +++ b/docs/installation/install.rst @@ -0,0 +1,39 @@ +Installation +============ + +NequIP requires: + + * Python >= 3.6 + * PyTorch >= 1.8, <=1.11.*. PyTorch can be installed following the `instructions from their documentation `_. Note that neither ``torchvision`` nor ``torchaudio``, included in the default install command, are needed for NequIP. + +To install: + + * We use `Weights&Biases `_ to keep track of experiments. This is not a strict requirement — you can use our package without it — but it may make your life easier. If you want to use it, create an account `here `_ and install the Python package:: + + pip install wandb + + * Install the latest stable NequIP:: + + pip install https://github.com/mir-group/nequip/archive/main.zip + +To install previous versions of NequIP, please clone the repository from GitHub and check out the appropriate tag (for example ``v0.3.3`` for version 0.3.3). + +To install the current **unstable** development version of NequIP, please clone our repository and check out the ``develop`` branch. 
+ +Installation Issues +------------------- + +The easiest way to check if your installation is working is to train a _toy_ model:: + + nequip-train configs/minimal.yaml + +If you suspect something is wrong, encounter errors, or just want to confirm that everything is in working order, you can also run the unit tests:: + + pip install pytest + pytest tests/unit/ + +To run the full tests, including a set of longer/more intensive integration tests, run:: + + pytest tests/ + +If a GPU is present, the unit tests will use it. \ No newline at end of file diff --git a/docs/introduction/intro.rst b/docs/introduction/intro.rst new file mode 100644 index 00000000..e0dcc32c --- /dev/null +++ b/docs/introduction/intro.rst @@ -0,0 +1,4 @@ +Overview +======== + +TODO diff --git a/docs/lammps/all.rst b/docs/lammps/all.rst new file mode 100644 index 00000000..9faac07e --- /dev/null +++ b/docs/lammps/all.rst @@ -0,0 +1,7 @@ +Integration to LAMMPS, ASE +========================== + + .. toctree:: + + lammps + ase diff --git a/docs/lammps/ase.rst b/docs/lammps/ase.rst new file mode 100644 index 00000000..3729cde3 --- /dev/null +++ b/docs/lammps/ase.rst @@ -0,0 +1,2 @@ +ASE +=== diff --git a/docs/lammps/lammps.rst b/docs/lammps/lammps.rst new file mode 100644 index 00000000..f9d0ba9f --- /dev/null +++ b/docs/lammps/lammps.rst @@ -0,0 +1,2 @@ +LAMMPS +====== diff --git a/docs/options/dataset.rst b/docs/options/dataset.rst index 54b39fc9..f3ca194c 100644 --- a/docs/options/dataset.rst +++ b/docs/options/dataset.rst @@ -33,7 +33,7 @@ key_mapping | Type: dict | Default: ``{'positions': 'pos', 'energy': 'total_energy', 'force': 'forces', 'forces': 'forces', 'Z': 'atomic_numbers', 'atomic_number': 'atomic_numbers'}`` -npz_keys +include_keys ^^^^^^^^ | Type: list | Default: ``[]`` @@ -68,5 +68,11 @@ include_frames | Type: NoneType | Default: ``None`` +ase_args +^^^^^^^^ + | Type: dict + | Default: ``{}`` + Advanced --------- \ No newline at end of file +-------- +See tutorial on 
:ref:`../guide/_dataset_note`. diff --git a/docs/yaml/yaml.rst b/docs/yaml/yaml.rst new file mode 100644 index 00000000..fd804436 --- /dev/null +++ b/docs/yaml/yaml.rst @@ -0,0 +1,4 @@ +YAML input +========== + +TODO diff --git a/nequip/data/dataset.py b/nequip/data/dataset.py index 2b2279d9..c38b8eae 100644 --- a/nequip/data/dataset.py +++ b/nequip/data/dataset.py @@ -641,7 +641,7 @@ class NpzDataset(AtomicInMemoryDataset): """Load data from an npz file. To avoid loading unneeded data, keys are ignored by default unless they are in ``key_mapping``, ``include_keys``, - ``npz_fixed_fields`` or ``extra_fixed_fields``. + ``npz_fixed_fields_keys`` or ``extra_fixed_fields``. Args: key_mapping (Dict[str, str]): mapping of npz keys to ``AtomicData`` keys. Optional diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 7aafd5fc..4d7686e1 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -129,7 +129,7 @@ def load_deployed_model( def main(args=None): parser = argparse.ArgumentParser( - description="Create and view information about deployed NequIP potentials." + description="Deploy and view information about previously deployed NequIP models." ) # backward compat for 3.6 if sys.version_info[1] > 6: diff --git a/nequip/scripts/evaluate.py b/nequip/scripts/evaluate.py index 7c4e2abc..f7dfa12b 100644 --- a/nequip/scripts/evaluate.py +++ b/nequip/scripts/evaluate.py @@ -30,13 +30,13 @@ def main(args=None, running_as_script: bool = True): description=textwrap.dedent( """Compute the error of a model on a test set using various metrics. - The model, metrics, dataset, etc. can specified individually, or a training session can be indicated with `--train-dir`. + The model, metrics, dataset, etc. can specified in individual YAML config files, or a training session can be indicated with `--train-dir`. In order of priority, the global settings (dtype, TensorFloat32, etc.) are taken from: - 1. The model config (for a training session) - 2. 
The dataset config (for a deployed model) - 3. The defaults + (1) the model config (for a training session), + (2) the dataset config (for a deployed model), + or (3) the defaults. - Prints only the final result in `name = num` format to stdout; all other information is logging.debuged to stderr. + Prints only the final result in `name = num` format to stdout; all other information is `logging.debug`ed to stderr. WARNING: Please note that results of CUDA models are rarely exactly reproducible, and that even CPU models can be nondeterministic. """ diff --git a/nequip/scripts/train.py b/nequip/scripts/train.py index c6aa7785..88b55f7e 100644 --- a/nequip/scripts/train.py +++ b/nequip/scripts/train.py @@ -81,8 +81,12 @@ def main(args=None, running_as_script: bool = True): def parse_command_line(args=None): - parser = argparse.ArgumentParser(description="Train a NequIP model.") - parser.add_argument("config", help="configuration file") + parser = argparse.ArgumentParser( + description="Train (or restart training of) a NequIP model." + ) + parser.add_argument( + "config", help="YAML file configuring the model, dataset, and other options" + ) parser.add_argument( "--equivariance-test", help="test the model's equivariance before training on n (default 1) random frames from the dataset", From 4a7fb109e16b04633dd16234b4cc3eb137a2c75a Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 7 Dec 2022 14:24:44 -0500 Subject: [PATCH 35/50] Revert "remove sklearn dependence (#248)" This reverts commit 64df5e52e269fadc1480ae78c4d1682b59bb8486. 
--- CHANGELOG.md | 1 - README.md | 2 +- configs/full.yaml | 4 +- nequip/utils/regressor.py | 235 +++++++++++++++++++++-------- nequip/utils/unittests/conftest.py | 17 --- setup.py | 1 + tests/unit/data/test_dataset.py | 32 ++-- tests/unit/utils/test_gp.py | 37 +++++ tests/unit/utils/test_solver.py | 38 ----- 9 files changed, 235 insertions(+), 132 deletions(-) create mode 100644 tests/unit/utils/test_gp.py delete mode 100644 tests/unit/utils/test_solver.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 67bd0c81..50338467 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,6 @@ Most recent change on the bottom. ## [Unreleased] - 0.5.6 ### Added -- sklearn dependency removed - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights - `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` - `nequip-benchmark --pdb` for debugging model (builder) errors diff --git a/README.md b/README.md index da741c09..f70840b8 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,6 @@ under the guidance of [Boris Kozinsky at Harvard](https://bkoz.seas.harvard.edu/ If you have questions, please don't hesitate to reach out at batzner[at]g[dot]harvard[dot]edu. If you find a bug or have a proposal for a feature, please post it in the [Issues](https://github.com/mir-group/nequip/issues). -If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Discussions](https://github.com/mir-group/nequip/discussions). +If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Disucssions](https://github.com/mir-group/nequip/discussions). If you want to contribute to the code, please read [`CONTRIBUTING.md`](CONTRIBUTING.md). 
diff --git a/configs/full.yaml b/configs/full.yaml index 2f98164e..daefc143 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -317,10 +317,10 @@ per_species_rescale_scales: dataset_forces_rms # If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained. # per_species_rescale_kwargs: # total_energy: -# alpha: 0.001 +# alpha: 0.1 # max_iteration: 20 # stride: 100 -# keywords for ridge regression decomposition of per specie energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2 +# keywords for GP decomposition of per specie energy. Optional. Defaults to 0.1 # per_species_rescale_arguments_in_dataset_units: True # if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values. diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 30c8f9ab..3d23cf84 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -1,72 +1,181 @@ import logging import torch +import numpy as np +from typing import Optional +from sklearn.gaussian_process import GaussianProcessRegressor +from sklearn.gaussian_process.kernels import DotProduct, Kernel, Hyperparameter -from torch import matmul -from torch.linalg import solve, inv -from typing import Optional, Sequence -from opt_einsum import contract +def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwargs): + if regressor == "GaussianProcess": + return gp(X, y, **kwargs) + elif regressor == "NormalizedGaussianProcess": + return normalized_gp(X, y, **kwargs) + else: + raise NotImplementedError(f"{regressor} is not implemented") -def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kwargs): - - dtype = torch.get_default_dtype() - X = X[::stride].to(dtype) 
- y = y[::stride].to(dtype) - - X, y = down_sampling_by_composition(X, y) - - X_norm = torch.sum(X) - - X = X / X_norm - y = y / X_norm +def normalized_gp(X, y, **kwargs): + feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) + feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) - - feature_rms = torch.sqrt(torch.mean(X**2, axis=0)) - - alpha_mat = torch.diag(feature_rms) * alpha * alpha - - A = matmul(X.T, X) + alpha_mat - dy = y - (torch.sum(X, axis=1, keepdim=True) * y_mean).reshape(y.shape) - Xy = matmul(X.T, dy) - - mean = solve(A, Xy) - - sigma2 = torch.var(matmul(X, mean) - dy) - Ainv = inv(A) - cov = torch.sqrt(sigma2 * contract("ij,kj,kl,li->i", Ainv, X, X, Ainv)) - - mean = mean + y_mean.reshape([-1]) - - logging.debug(f"Ridge Regression, residue {sigma2}") - - return mean, cov - - -def down_sampling_by_composition( - X: torch.Tensor, y: torch.Tensor, percentage: Sequence = [0.25, 0.5, 0.75] + mean, std = base_gp( + X, + y - (torch.sum(X, axis=1) * y_mean).reshape(y.shape), + NormalizedDotProduct, + {"diagonal_elements": feature_rms}, + **kwargs, + ) + return mean + y_mean, std + + +def gp(X, y, **kwargs): + return base_gp( + X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, **kwargs + ) + + +def base_gp( + X, + y, + kernel, + kernel_kwargs, + alpha: Optional[float] = 0.1, + max_iteration: int = 20, + stride: Optional[int] = None, ): - unique_comps, comp_ids = torch.unique(X, dim=0, return_inverse=True) - - n_types = torch.max(comp_ids) + 1 - - sort_by = torch.argsort(comp_ids) - - # find out the block for each composition - d_icomp = comp_ids[sort_by] - d_icomp = d_icomp[:-1] - d_icomp[1:] - node_icomp = torch.where(d_icomp != 0)[0] - id_start = torch.cat((torch.as_tensor([0]), node_icomp + 1)) - id_end = torch.cat((node_icomp + 1, torch.as_tensor([len(sort_by)]))) - - n_points = len(percentage) - new_X = torch.zeros((n_types * n_points, X.shape[1])) - new_y = torch.zeros((n_types * n_points)) - for i in 
range(n_types): - ids = sort_by[id_start[i] : id_end[i]] - for j, p in enumerate(percentage): - new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") - new_X[i * n_points + j] = unique_comps[i] - - return new_X, new_y + if len(y.shape) == 1: + y = y.reshape([-1, 1]) + + if stride is not None: + X = X[::stride] + y = y[::stride] + + not_fit = True + iteration = 0 + mean = None + std = None + while not_fit: + logging.debug(f"GP fitting iteration {iteration} {alpha}") + try: + _kernel = kernel(**kernel_kwargs) + gpr = GaussianProcessRegressor(kernel=_kernel, random_state=0, alpha=alpha) + gpr = gpr.fit(X, y) + + vec = torch.diag(torch.ones(X.shape[1])) + mean, std = gpr.predict(vec, return_std=True) + + mean = torch.as_tensor(mean, dtype=torch.get_default_dtype()).reshape([-1]) + # ignore all the off-diagonal terms + std = torch.as_tensor(std, dtype=torch.get_default_dtype()).reshape([-1]) + likelihood = gpr.log_marginal_likelihood() + + res = torch.sqrt( + torch.square(torch.matmul(X, mean.reshape([-1, 1])) - y).mean() + ) + + logging.debug( + f"GP fitting: alpha {alpha}:\n" + f" residue {res}\n" + f" mean {mean} std {std}\n" + f" log marginal likelihood {likelihood}" + ) + not_fit = False + + except Exception as e: + logging.info(f"GP fitting failed for alpha={alpha} and {e.args}") + if alpha == 0 or alpha is None: + logging.info("try a non-zero alpha") + not_fit = False + raise ValueError( + f"Please set the {alpha} to non-zero value. \n" + "The dataset energy is rank deficient to be solved with GP" + ) + else: + alpha = alpha * 2 + iteration += 1 + logging.debug(f" increase alpha to {alpha}") + + if iteration >= max_iteration or not_fit is False: + raise ValueError( + "Please set the per species shift and scale to zeros and ones. \n" + "The dataset energy is to diverge to be solved with GP" + ) + + return mean, std + + +class NormalizedDotProduct(Kernel): + r"""Dot-Product kernel. + .. 
math:: + k(x_i, x_j) = x_i \cdot A \cdot x_j + """ + + def __init__(self, diagonal_elements): + # TO DO: check shape + self.diagonal_elements = diagonal_elements + self.A = np.diag(diagonal_elements) + + def __call__(self, X, Y=None, eval_gradient=False): + """Return the kernel k(X, Y) and optionally its gradient. + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y) + Y : ndarray of shape (n_samples_Y, n_features), default=None + Right argument of the returned kernel k(X, Y). If None, k(X, X) + if evaluated instead. + eval_gradient : bool, default=False + Determines whether the gradient with respect to the log of + the kernel hyperparameter is computed. + Only supported when Y is None. + Returns + ------- + K : ndarray of shape (n_samples_X, n_samples_Y) + Kernel k(X, Y) + K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ + optional + The gradient of the kernel k(X, X) with respect to the log of the + hyperparameter of the kernel. Only returned when `eval_gradient` + is True. + """ + X = np.atleast_2d(X) + if Y is None: + K = (X.dot(self.A)).dot(X.T) + else: + if eval_gradient: + raise ValueError("Gradient can only be evaluated when Y is None.") + K = (X.dot(self.A)).dot(Y.T) + + if eval_gradient: + return K, np.empty((X.shape[0], X.shape[0], 0)) + else: + return K + + def diag(self, X): + """Returns the diagonal of the kernel k(X, X). + The result of this method is identical to np.diag(self(X)); however, + it can be evaluated more efficiently since only the diagonal is + evaluated. + Parameters + ---------- + X : ndarray of shape (n_samples_X, n_features) + Left argument of the returned kernel k(X, Y). + Returns + ------- + K_diag : ndarray of shape (n_samples_X,) + Diagonal of kernel k(X, X). 
+ """ + return np.einsum("ij,ij,jj->i", X, X, self.A) + + def __repr__(self): + return "" + + def is_stationary(self): + """Returns whether the kernel is stationary.""" + return False + + @property + def hyperparameter_diagonal_elements(self): + return Hyperparameter("diagonal_elements", "numeric", "fixed") diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py index 77a91930..060e5e7b 100644 --- a/nequip/utils/unittests/conftest.py +++ b/nequip/utils/unittests/conftest.py @@ -133,22 +133,5 @@ def atomic_batch(nequip_dataset): return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) -@pytest.fixture(scope="function") -def per_species_set(): - dtype = torch.get_default_dtype() - torch.manual_seed(0) - mean_min = 1 - mean_max = 100 - std = 20 - n_sample = 1000 - n_species = 9 - ref_mean = torch.rand((n_species)) * (mean_max - mean_min) + mean_min - t_mean = torch.ones((n_sample, 1)) * ref_mean.reshape([1, -1]) - ref_std = torch.rand((n_species)) * std - t_std = torch.ones((n_sample, 1)) * ref_std.reshape([1, -1]) - E = torch.normal(t_mean, t_std) - return ref_mean.to(dtype), ref_std.to(dtype), E.to(dtype), n_sample, n_species - - # Use debug mode set_irreps_debug(True) diff --git a/setup.py b/setup.py index cba6b51f..8c977e0a 100644 --- a/setup.py +++ b/setup.py @@ -37,6 +37,7 @@ "typing_extensions;python_version<'3.8'", # backport of Final "torch-runstats>=0.2.0", "torch-ema>=0.3.0", + "scikit_learn<=1.0.1", # for GaussianProcess for per-species statistics; 1.0.2 has a bug! 
], zip_safe=True, ) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index bad796c3..f45e0ca8 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -31,7 +31,7 @@ def ase_file(molecules): MAX_ATOMIC_NUMBER: int = 5 -NATOMS = 10 +NATOMS = 3 @pytest.fixture(scope="function") @@ -277,11 +277,16 @@ def test_per_node_field(self, npz_dataset, fixed_field, mode, subset): ) print(result) - @pytest.mark.parametrize("alpha", [0, 1e-3, 0.01]) + @pytest.mark.parametrize("alpha", [1e-5, 1e-3, 0.1, 0.5]) @pytest.mark.parametrize("fixed_field", [True, False]) @pytest.mark.parametrize("full_rank", [True, False]) @pytest.mark.parametrize("subset", [True, False]) - def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subset): + @pytest.mark.parametrize( + "regressor", ["NormalizedGaussianProcess", "GaussianProcess"] + ) + def test_per_graph_field( + self, npz_dataset, alpha, fixed_field, full_rank, regressor, subset + ): if alpha <= 1e-4 and not full_rank: return @@ -303,7 +308,10 @@ def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subse del n_spec del Ns - ref_mean, ref_std, E = generate_E(N, 100, 1000, 10) + if alpha == 1e-5: + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.0) + else: + ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) if subset: E_orig_order = torch.zeros_like( @@ -325,6 +333,7 @@ def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subse AtomicDataDict.TOTAL_ENERGY_KEY + "per_species_mean_std": { "alpha": alpha, + "regressor": regressor, "stride": 1, } }, @@ -332,18 +341,21 @@ def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subse res = torch.matmul(N, mean.reshape([-1, 1])) - E.reshape([-1, 1]) res2 = torch.sum(torch.square(res)) - print("alpha, residue, actual residue", alpha, res2, ref_res2) + print("residue", alpha, res2 - ref_res2) print("mean", mean, ref_mean) print("diff in mean", mean - 
ref_mean) print("std", std, ref_std) - tolerance = torch.max(ref_std) * 4 if full_rank: - assert torch.allclose(mean, ref_mean, atol=tolerance) - # assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) + if alpha == 1e-5: + assert torch.allclose(mean, ref_mean, rtol=1e-1) + else: + assert torch.allclose(mean, ref_mean, rtol=1) + assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) + elif regressor == "NormalizedGaussianProcess": + assert torch.std(mean).numpy() == 0 else: - assert torch.allclose(mean, mean[0], atol=tolerance) - # assert torch.std(mean).numpy() == 0 + assert mean[0] == mean[1] * 2 class TestReload: diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py new file mode 100644 index 00000000..4792b9d2 --- /dev/null +++ b/tests/unit/utils/test_gp.py @@ -0,0 +1,37 @@ +import torch +import pytest + +from nequip.utils.regressor import base_gp +from sklearn.gaussian_process.kernels import DotProduct + + +# @pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("full_rank", [False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) +def test_random(full_rank, alpha): + + if alpha == 0 and not full_rank: + return + + torch.manual_seed(0) + n_samples = 10 + n_dim = 3 + + if full_rank: + X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) + else: + X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( + (n_samples, n_dim) + ) + + ref_mean = torch.rand((n_dim, 1)) + y = torch.matmul(X, ref_mean) + + mean, std = base_gp( + X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 + ) + + if full_rank: + assert torch.allclose(ref_mean, mean, rtol=0.5) + else: + assert torch.allclose(mean, mean[0], rtol=1e-3) diff --git a/tests/unit/utils/test_solver.py b/tests/unit/utils/test_solver.py deleted file mode 100644 index 049c897d..00000000 --- a/tests/unit/utils/test_solver.py +++ /dev/null @@ -1,38 +0,0 @@ -import torch -import pytest - -from 
nequip.utils.regressor import solver - - -@pytest.mark.parametrize("full_rank", [True, False]) -@pytest.mark.parametrize("alpha", [0, 1e-3, 1e-2]) -def test_random(full_rank, alpha, per_species_set): - - if alpha == 0 and not full_rank: - return - - torch.manual_seed(0) - - ref_mean, ref_std, E, n_samples, n_dim = per_species_set - - dtype = torch.get_default_dtype() - - X = torch.randint(low=1, high=10, size=(n_samples, n_dim)).to(dtype) - if not full_rank: - X[:, n_dim - 2] = X[:, n_dim - 1] * 2 - y = (X * E).sum(axis=-1) - - mean, std = solver(X, y, alpha=alpha) - - tolerance = torch.max(ref_std) - - print("tolerance", tolerance) - print("solution", mean, std) - print("diff", mean - ref_mean) - - if full_rank: - assert torch.allclose(ref_mean, mean, atol=tolerance) - else: - assert torch.allclose(mean[n_dim - 1], mean[n_dim - 2], atol=tolerance) - - assert torch.max(std) < tolerance From 332947f49a3f0b8d5e905e10d00def719dbfe19e Mon Sep 17 00:00:00 2001 From: Alby M <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Thu, 8 Dec 2022 05:19:02 -0500 Subject: [PATCH 36/50] Remove sklearn (recreated PR) (#277) * change solver * remove dependency on dataset * add ridge tests * swap to ridge * add down sampling * change to coef * change to torch.solve * black * fix correlated columns * fix sqrt error * black * black * black * add normalization * black * flake8 * change explaination * add debug log * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> * Fix typo in README (#270) * update change log * Update tests/conftest.py Co-authored-by: Alby M. <1473644+Linux-cpp-lisp@users.noreply.github.com> * Update tests/conftest.py Co-authored-by: Alby M. 
<1473644+Linux-cpp-lisp@users.noreply.github.com> Co-authored-by: Lixin Sun Co-authored-by: Lixin Sun Co-authored-by: Simon Batzner --- CHANGELOG.md | 1 + README.md | 2 +- configs/full.yaml | 4 +- nequip/utils/regressor.py | 235 ++++++++--------------------- nequip/utils/unittests/conftest.py | 17 +++ setup.py | 1 - tests/unit/data/test_dataset.py | 32 ++-- tests/unit/utils/test_gp.py | 37 ----- tests/unit/utils/test_solver.py | 38 +++++ 9 files changed, 132 insertions(+), 235 deletions(-) delete mode 100644 tests/unit/utils/test_gp.py create mode 100644 tests/unit/utils/test_solver.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 50338467..67bd0c81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Most recent change on the bottom. ## [Unreleased] - 0.5.6 ### Added +- sklearn dependency removed - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights - `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` - `nequip-benchmark --pdb` for debugging model (builder) errors diff --git a/README.md b/README.md index f70840b8..da741c09 100644 --- a/README.md +++ b/README.md @@ -157,6 +157,6 @@ under the guidance of [Boris Kozinsky at Harvard](https://bkoz.seas.harvard.edu/ If you have questions, please don't hesitate to reach out at batzner[at]g[dot]harvard[dot]edu. If you find a bug or have a proposal for a feature, please post it in the [Issues](https://github.com/mir-group/nequip/issues). -If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Disucssions](https://github.com/mir-group/nequip/discussions). +If you have a question, topic, or issue that isn't obviously one of those, try our [GitHub Discussions](https://github.com/mir-group/nequip/discussions). If you want to contribute to the code, please read [`CONTRIBUTING.md`](CONTRIBUTING.md). 
diff --git a/configs/full.yaml b/configs/full.yaml index daefc143..2f98164e 100644 --- a/configs/full.yaml +++ b/configs/full.yaml @@ -317,10 +317,10 @@ per_species_rescale_scales: dataset_forces_rms # If not provided, defaults to dataset_per_species_force_rms or dataset_per_atom_total_energy_std, depending on whether forces are being trained. # per_species_rescale_kwargs: # total_energy: -# alpha: 0.1 +# alpha: 0.001 # max_iteration: 20 # stride: 100 -# keywords for GP decomposition of per specie energy. Optional. Defaults to 0.1 +# keywords for ridge regression decomposition of per specie energy. Optional. Defaults to 0.001. The value should be in the range of 1e-3 to 1e-2 # per_species_rescale_arguments_in_dataset_units: True # if explicit numbers are given for the shifts/scales, this parameter must specify whether the given numbers are unitless shifts/scales or are in the units of the dataset. If ``True``, any global rescalings will correctly be applied to the per-species values. diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 3d23cf84..30c8f9ab 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -1,181 +1,72 @@ import logging import torch -import numpy as np -from typing import Optional -from sklearn.gaussian_process import GaussianProcessRegressor -from sklearn.gaussian_process.kernels import DotProduct, Kernel, Hyperparameter +from torch import matmul +from torch.linalg import solve, inv +from typing import Optional, Sequence +from opt_einsum import contract -def solver(X, y, regressor: Optional[str] = "NormalizedGaussianProcess", **kwargs): - if regressor == "GaussianProcess": - return gp(X, y, **kwargs) - elif regressor == "NormalizedGaussianProcess": - return normalized_gp(X, y, **kwargs) - else: - raise NotImplementedError(f"{regressor} is not implemented") +def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kwargs): + + dtype = torch.get_default_dtype() + X = X[::stride].to(dtype) 
+ y = y[::stride].to(dtype) + + X, y = down_sampling_by_composition(X, y) + + X_norm = torch.sum(X) + + X = X / X_norm + y = y / X_norm -def normalized_gp(X, y, **kwargs): - feature_rms = 1.0 / np.sqrt(np.average(X**2, axis=0)) - feature_rms = np.nan_to_num(feature_rms, 1) y_mean = torch.sum(y) / torch.sum(X) - mean, std = base_gp( - X, - y - (torch.sum(X, axis=1) * y_mean).reshape(y.shape), - NormalizedDotProduct, - {"diagonal_elements": feature_rms}, - **kwargs, - ) - return mean + y_mean, std - - -def gp(X, y, **kwargs): - return base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, **kwargs - ) - - -def base_gp( - X, - y, - kernel, - kernel_kwargs, - alpha: Optional[float] = 0.1, - max_iteration: int = 20, - stride: Optional[int] = None, + + feature_rms = torch.sqrt(torch.mean(X**2, axis=0)) + + alpha_mat = torch.diag(feature_rms) * alpha * alpha + + A = matmul(X.T, X) + alpha_mat + dy = y - (torch.sum(X, axis=1, keepdim=True) * y_mean).reshape(y.shape) + Xy = matmul(X.T, dy) + + mean = solve(A, Xy) + + sigma2 = torch.var(matmul(X, mean) - dy) + Ainv = inv(A) + cov = torch.sqrt(sigma2 * contract("ij,kj,kl,li->i", Ainv, X, X, Ainv)) + + mean = mean + y_mean.reshape([-1]) + + logging.debug(f"Ridge Regression, residue {sigma2}") + + return mean, cov + + +def down_sampling_by_composition( + X: torch.Tensor, y: torch.Tensor, percentage: Sequence = [0.25, 0.5, 0.75] ): - if len(y.shape) == 1: - y = y.reshape([-1, 1]) - - if stride is not None: - X = X[::stride] - y = y[::stride] - - not_fit = True - iteration = 0 - mean = None - std = None - while not_fit: - logging.debug(f"GP fitting iteration {iteration} {alpha}") - try: - _kernel = kernel(**kernel_kwargs) - gpr = GaussianProcessRegressor(kernel=_kernel, random_state=0, alpha=alpha) - gpr = gpr.fit(X, y) - - vec = torch.diag(torch.ones(X.shape[1])) - mean, std = gpr.predict(vec, return_std=True) - - mean = torch.as_tensor(mean, dtype=torch.get_default_dtype()).reshape([-1]) - # ignore all the 
off-diagonal terms - std = torch.as_tensor(std, dtype=torch.get_default_dtype()).reshape([-1]) - likelihood = gpr.log_marginal_likelihood() - - res = torch.sqrt( - torch.square(torch.matmul(X, mean.reshape([-1, 1])) - y).mean() - ) - - logging.debug( - f"GP fitting: alpha {alpha}:\n" - f" residue {res}\n" - f" mean {mean} std {std}\n" - f" log marginal likelihood {likelihood}" - ) - not_fit = False - - except Exception as e: - logging.info(f"GP fitting failed for alpha={alpha} and {e.args}") - if alpha == 0 or alpha is None: - logging.info("try a non-zero alpha") - not_fit = False - raise ValueError( - f"Please set the {alpha} to non-zero value. \n" - "The dataset energy is rank deficient to be solved with GP" - ) - else: - alpha = alpha * 2 - iteration += 1 - logging.debug(f" increase alpha to {alpha}") - - if iteration >= max_iteration or not_fit is False: - raise ValueError( - "Please set the per species shift and scale to zeros and ones. \n" - "The dataset energy is to diverge to be solved with GP" - ) - - return mean, std - - -class NormalizedDotProduct(Kernel): - r"""Dot-Product kernel. - .. math:: - k(x_i, x_j) = x_i \cdot A \cdot x_j - """ - - def __init__(self, diagonal_elements): - # TO DO: check shape - self.diagonal_elements = diagonal_elements - self.A = np.diag(diagonal_elements) - - def __call__(self, X, Y=None, eval_gradient=False): - """Return the kernel k(X, Y) and optionally its gradient. - Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y) - Y : ndarray of shape (n_samples_Y, n_features), default=None - Right argument of the returned kernel k(X, Y). If None, k(X, X) - if evaluated instead. - eval_gradient : bool, default=False - Determines whether the gradient with respect to the log of - the kernel hyperparameter is computed. - Only supported when Y is None. 
- Returns - ------- - K : ndarray of shape (n_samples_X, n_samples_Y) - Kernel k(X, Y) - K_gradient : ndarray of shape (n_samples_X, n_samples_X, n_dims),\ - optional - The gradient of the kernel k(X, X) with respect to the log of the - hyperparameter of the kernel. Only returned when `eval_gradient` - is True. - """ - X = np.atleast_2d(X) - if Y is None: - K = (X.dot(self.A)).dot(X.T) - else: - if eval_gradient: - raise ValueError("Gradient can only be evaluated when Y is None.") - K = (X.dot(self.A)).dot(Y.T) - - if eval_gradient: - return K, np.empty((X.shape[0], X.shape[0], 0)) - else: - return K - - def diag(self, X): - """Returns the diagonal of the kernel k(X, X). - The result of this method is identical to np.diag(self(X)); however, - it can be evaluated more efficiently since only the diagonal is - evaluated. - Parameters - ---------- - X : ndarray of shape (n_samples_X, n_features) - Left argument of the returned kernel k(X, Y). - Returns - ------- - K_diag : ndarray of shape (n_samples_X,) - Diagonal of kernel k(X, X). 
- """ - return np.einsum("ij,ij,jj->i", X, X, self.A) - - def __repr__(self): - return "" - - def is_stationary(self): - """Returns whether the kernel is stationary.""" - return False - - @property - def hyperparameter_diagonal_elements(self): - return Hyperparameter("diagonal_elements", "numeric", "fixed") + unique_comps, comp_ids = torch.unique(X, dim=0, return_inverse=True) + + n_types = torch.max(comp_ids) + 1 + + sort_by = torch.argsort(comp_ids) + + # find out the block for each composition + d_icomp = comp_ids[sort_by] + d_icomp = d_icomp[:-1] - d_icomp[1:] + node_icomp = torch.where(d_icomp != 0)[0] + id_start = torch.cat((torch.as_tensor([0]), node_icomp + 1)) + id_end = torch.cat((node_icomp + 1, torch.as_tensor([len(sort_by)]))) + + n_points = len(percentage) + new_X = torch.zeros((n_types * n_points, X.shape[1])) + new_y = torch.zeros((n_types * n_points)) + for i in range(n_types): + ids = sort_by[id_start[i] : id_end[i]] + for j, p in enumerate(percentage): + new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") + new_X[i * n_points + j] = unique_comps[i] + + return new_X, new_y diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py index 060e5e7b..77a91930 100644 --- a/nequip/utils/unittests/conftest.py +++ b/nequip/utils/unittests/conftest.py @@ -133,5 +133,22 @@ def atomic_batch(nequip_dataset): return Batch.from_data_list([nequip_dataset[0], nequip_dataset[1]]) +@pytest.fixture(scope="function") +def per_species_set(): + dtype = torch.get_default_dtype() + torch.manual_seed(0) + mean_min = 1 + mean_max = 100 + std = 20 + n_sample = 1000 + n_species = 9 + ref_mean = torch.rand((n_species)) * (mean_max - mean_min) + mean_min + t_mean = torch.ones((n_sample, 1)) * ref_mean.reshape([1, -1]) + ref_std = torch.rand((n_species)) * std + t_std = torch.ones((n_sample, 1)) * ref_std.reshape([1, -1]) + E = torch.normal(t_mean, t_std) + return ref_mean.to(dtype), ref_std.to(dtype), E.to(dtype), n_sample, 
n_species + + # Use debug mode set_irreps_debug(True) diff --git a/setup.py b/setup.py index 8c977e0a..cba6b51f 100644 --- a/setup.py +++ b/setup.py @@ -37,7 +37,6 @@ "typing_extensions;python_version<'3.8'", # backport of Final "torch-runstats>=0.2.0", "torch-ema>=0.3.0", - "scikit_learn<=1.0.1", # for GaussianProcess for per-species statistics; 1.0.2 has a bug! ], zip_safe=True, ) diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index f45e0ca8..bad796c3 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -31,7 +31,7 @@ def ase_file(molecules): MAX_ATOMIC_NUMBER: int = 5 -NATOMS = 3 +NATOMS = 10 @pytest.fixture(scope="function") @@ -277,16 +277,11 @@ def test_per_node_field(self, npz_dataset, fixed_field, mode, subset): ) print(result) - @pytest.mark.parametrize("alpha", [1e-5, 1e-3, 0.1, 0.5]) + @pytest.mark.parametrize("alpha", [0, 1e-3, 0.01]) @pytest.mark.parametrize("fixed_field", [True, False]) @pytest.mark.parametrize("full_rank", [True, False]) @pytest.mark.parametrize("subset", [True, False]) - @pytest.mark.parametrize( - "regressor", ["NormalizedGaussianProcess", "GaussianProcess"] - ) - def test_per_graph_field( - self, npz_dataset, alpha, fixed_field, full_rank, regressor, subset - ): + def test_per_graph_field(self, npz_dataset, alpha, fixed_field, full_rank, subset): if alpha <= 1e-4 and not full_rank: return @@ -308,10 +303,7 @@ def test_per_graph_field( del n_spec del Ns - if alpha == 1e-5: - ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.0) - else: - ref_mean, ref_std, E = generate_E(N, 100, 1000, 0.5) + ref_mean, ref_std, E = generate_E(N, 100, 1000, 10) if subset: E_orig_order = torch.zeros_like( @@ -333,7 +325,6 @@ def test_per_graph_field( AtomicDataDict.TOTAL_ENERGY_KEY + "per_species_mean_std": { "alpha": alpha, - "regressor": regressor, "stride": 1, } }, @@ -341,21 +332,18 @@ def test_per_graph_field( res = torch.matmul(N, mean.reshape([-1, 1])) - E.reshape([-1, 1]) res2 = 
torch.sum(torch.square(res)) - print("residue", alpha, res2 - ref_res2) + print("alpha, residue, actual residue", alpha, res2, ref_res2) print("mean", mean, ref_mean) print("diff in mean", mean - ref_mean) print("std", std, ref_std) + tolerance = torch.max(ref_std) * 4 if full_rank: - if alpha == 1e-5: - assert torch.allclose(mean, ref_mean, rtol=1e-1) - else: - assert torch.allclose(mean, ref_mean, rtol=1) - assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) - elif regressor == "NormalizedGaussianProcess": - assert torch.std(mean).numpy() == 0 + assert torch.allclose(mean, ref_mean, atol=tolerance) + # assert torch.allclose(std, torch.zeros_like(ref_mean), atol=alpha * 100) else: - assert mean[0] == mean[1] * 2 + assert torch.allclose(mean, mean[0], atol=tolerance) + # assert torch.std(mean).numpy() == 0 class TestReload: diff --git a/tests/unit/utils/test_gp.py b/tests/unit/utils/test_gp.py deleted file mode 100644 index 4792b9d2..00000000 --- a/tests/unit/utils/test_gp.py +++ /dev/null @@ -1,37 +0,0 @@ -import torch -import pytest - -from nequip.utils.regressor import base_gp -from sklearn.gaussian_process.kernels import DotProduct - - -# @pytest.mark.parametrize("full_rank", [True, False]) -@pytest.mark.parametrize("full_rank", [False]) -@pytest.mark.parametrize("alpha", [0, 1e-3, 0.1, 1]) -def test_random(full_rank, alpha): - - if alpha == 0 and not full_rank: - return - - torch.manual_seed(0) - n_samples = 10 - n_dim = 3 - - if full_rank: - X = torch.randint(low=1, high=10, size=(n_samples, n_dim)) - else: - X = torch.randint(low=1, high=10, size=(n_samples, 1)) * torch.ones( - (n_samples, n_dim) - ) - - ref_mean = torch.rand((n_dim, 1)) - y = torch.matmul(X, ref_mean) - - mean, std = base_gp( - X, y, DotProduct, {"sigma_0": 0, "sigma_0_bounds": "fixed"}, alpha=0.1 - ) - - if full_rank: - assert torch.allclose(ref_mean, mean, rtol=0.5) - else: - assert torch.allclose(mean, mean[0], rtol=1e-3) diff --git a/tests/unit/utils/test_solver.py 
b/tests/unit/utils/test_solver.py new file mode 100644 index 00000000..049c897d --- /dev/null +++ b/tests/unit/utils/test_solver.py @@ -0,0 +1,38 @@ +import torch +import pytest + +from nequip.utils.regressor import solver + + +@pytest.mark.parametrize("full_rank", [True, False]) +@pytest.mark.parametrize("alpha", [0, 1e-3, 1e-2]) +def test_random(full_rank, alpha, per_species_set): + + if alpha == 0 and not full_rank: + return + + torch.manual_seed(0) + + ref_mean, ref_std, E, n_samples, n_dim = per_species_set + + dtype = torch.get_default_dtype() + + X = torch.randint(low=1, high=10, size=(n_samples, n_dim)).to(dtype) + if not full_rank: + X[:, n_dim - 2] = X[:, n_dim - 1] * 2 + y = (X * E).sum(axis=-1) + + mean, std = solver(X, y, alpha=alpha) + + tolerance = torch.max(ref_std) + + print("tolerance", tolerance) + print("solution", mean, std) + print("diff", mean - ref_mean) + + if full_rank: + assert torch.allclose(ref_mean, mean, atol=tolerance) + else: + assert torch.allclose(mean[n_dim - 1], mean[n_dim - 2], atol=tolerance) + + assert torch.max(std) < tolerance From 4f89a431ea6e1c47f4e065943cc8968c37140792 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 11 Dec 2022 21:54:15 -0500 Subject: [PATCH 37/50] nequip-benchmark --model --- nequip/scripts/benchmark.py | 42 ++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 5814a838..16776652 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -16,7 +16,7 @@ from nequip.utils import Config from nequip.data import AtomicData, AtomicDataDict, dataset_from_config from nequip.model import model_from_config -from nequip.scripts.deploy import _compile_for_deploy +from nequip.scripts.deploy import _compile_for_deploy, load_deployed_model from nequip.scripts.train import default_config, check_code_version from 
nequip.utils._global_options import _set_global_options @@ -28,6 +28,12 @@ def main(args=None): ) ) parser.add_argument("config", help="configuration file") + parser.add_argument( + "--model", + help="A deployed model to load instead of building a new one from `config`. ", + type=str, + default=None, + ) parser.add_argument( "--profile", help="Profile instead of timing, creating and outputing a Chrome trace JSON to the given path.", @@ -146,17 +152,29 @@ def main(args=None): return # Load model: - print("Building model... ") - model_time = time.time() - try: - model = model_from_config(config, initialize=True, dataset=dataset, deploy=True) - except: # noqa: E722 - if args.pdb: - pdb.post_mortem() - else: - raise - model_time = time.time() - model_time - print(f" building model took {model_time:.4f}s") + if args.model is None: + print("Building model... ") + model_time = time.time() + try: + model = model_from_config( + config, initialize=True, dataset=dataset, deploy=True + ) + except: # noqa: E722 + if args.pdb: + pdb.post_mortem() + else: + raise + model_time = time.time() - model_time + print(f" building model took {model_time:.4f}s") + else: + print("Loading model...") + model, metadata = load_deployed_model(args.model, device=device, freeze=False) + print(" deployed model has metadata:") + print( + "\n".join( + " %s: %s" % e for e in metadata.items() if e[0] != "config" + ) + ) print(f" model has {sum(p.numel() for p in model.parameters())} weights") print( f" model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" From ba214d093164f0dd38165c3974210371adf8b82c Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 11 Dec 2022 22:03:29 -0500 Subject: [PATCH 38/50] nequip-benchmark --equivariance-test --- nequip/scripts/benchmark.py | 40 +++++++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/nequip/scripts/benchmark.py 
b/nequip/scripts/benchmark.py index 16776652..ab95548a 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -14,6 +14,7 @@ from e3nn.util.jit import script from nequip.utils import Config +from nequip.utils.test import assert_AtomicData_equivariant from nequip.data import AtomicData, AtomicDataDict, dataset_from_config from nequip.model import model_from_config from nequip.scripts.deploy import _compile_for_deploy, load_deployed_model @@ -40,6 +41,11 @@ def main(args=None): type=str, default=None, ) + parser.add_argument( + "--equivariance-test", + help="test the model's equivariance on `--n-data` frames.", + action="store_true", + ) parser.add_argument( "--device", help="Device to run the model on. If not provided, defaults to CUDA if available and CPU otherwise.", @@ -111,12 +117,12 @@ def main(args=None): print(f" loading dataset took {dataset_time:.4f}s") dataset_rng = torch.Generator() dataset_rng.manual_seed(config.get("dataset_seed", config.get("seed", 12345))) - datas = [ + datas_list = [ AtomicData.to_AtomicDataDict(dataset[i].to(device)) for i in torch.randperm(len(dataset), generator=dataset_rng)[: args.n_data] ] - n_atom: int = len(datas[0]["pos"]) - if not all(len(d["pos"]) == n_atom for d in datas): + n_atom: int = len(datas_list[0]["pos"]) + if not all(len(d["pos"]) == n_atom for d in datas_list): raise NotImplementedError( "nequip-benchmark does not currently handle benchmarking on data frames with variable number of atoms" ) @@ -128,7 +134,7 @@ def main(args=None): print(f" number of atoms: {n_atom}") print(f" number of types: {dataset.type_mapper.num_types}") print( - f" avg. num edges: {sum(d[AtomicDataDict.EDGE_INDEX_KEY].shape[1] for d in datas) / len(datas)}" + f" avg. 
num edges: {sum(d[AtomicDataDict.EDGE_INDEX_KEY].shape[1] for d in datas_list) / len(datas_list)}" ) avg_edges_per_atom = torch.mean( torch.cat( @@ -137,14 +143,14 @@ def main(args=None): d[AtomicDataDict.EDGE_INDEX_KEY][0], minlength=d[AtomicDataDict.POSITIONS_KEY].shape[0], ).float() - for d in datas + for d in datas_list ] ) ).item() print(f" avg. neigh/atom: {avg_edges_per_atom}") # cycle over the datas we loaded - datas = itertools.cycle(datas) + datas = itertools.cycle(datas_list) # short circut if args.n == 0: @@ -184,6 +190,11 @@ def main(args=None): ) model.eval() + if args.equivariance_test: + args.no_compile = True + if args.model is not None: + raise RuntimeError("Can't equivariance test a deployed model.") + if args.no_compile: model = model.to(device) else: @@ -214,7 +225,7 @@ def trace_handler(p): p.export_chrome_trace(args.profile) print(f"Wrote profiling trace to `{args.profile}`") - print("Starting...") + print("Starting profiling...") with torch.profiler.profile( activities=[ torch.profiler.ProfilerActivity.CPU, @@ -236,6 +247,19 @@ def trace_handler(p): except: # noqa: E722) pdb.post_mortem() print("Done.") + elif args.equivariance_test: + print("Running equivariance test...") + errstr = assert_AtomicData_equivariant(model, datas_list) + print( + " Equivariance test passed; equivariance errors:\n" + " Errors are in real units, where relevant.\n" + " Please note that the large scale of the typical\n" + " shifts to the (atomic) energy can cause\n" + " catastrophic cancellation and give incorrectly\n" + " the equivariance error as zero for those fields.\n" + f"{errstr}" + ) + del errstr else: print("Warmup...") warmup_time = time.time() @@ -244,7 +268,7 @@ def trace_handler(p): warmup_time = time.time() - warmup_time print(f" {warmup} calls of warmup took {warmup_time:.4f}s") - print("Starting...") + print("Benchmarking...") # just time t = Timer( stmt="model(next(datas).copy())", globals={"model": model, "datas": datas} From 
a4d7b3a1e26f64b5bdedfe96bbff1e9c8a134b25 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 11 Dec 2022 22:27:20 -0500 Subject: [PATCH 39/50] update config --- configs/minimal_toy_emt.yaml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/configs/minimal_toy_emt.yaml b/configs/minimal_toy_emt.yaml index c9c904d1..38b7f95d 100644 --- a/configs/minimal_toy_emt.yaml +++ b/configs/minimal_toy_emt.yaml @@ -6,15 +6,18 @@ dataset_seed: 456 # network model_builders: + - SimpleIrrepsConfig - EnergyModel - PerSpeciesRescale - StressForceOutput - RescaleEnergyEtc + num_basis: 8 r_max: 4.0 -irreps_edge_sh: 0e + 1o -conv_to_output_hidden_irreps_out: 16x0e -feature_irreps_hidden: 16x0o + 16x0e + 16x1o + 16x1e +l_max: 1 +parity: true +num_features: 16 +num_layers: 4 # data set dataset: EMTTest # type of data set, can be npz or ase @@ -23,10 +26,6 @@ dataset_num_frames: 100 chemical_symbols: - Cu -global_rescale_scale: dataset_total_energy_std -per_species_rescale_shifts: dataset_per_atom_total_energy_mean -per_species_rescale_scales: dataset_per_atom_total_energy_std - # logging wandb: false # verbose: debug From 838e164f309cc056fb44e560d99ddd34fe8decdc Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Wed, 14 Dec 2022 23:43:42 -0500 Subject: [PATCH 40/50] better nequip-deploy info --- CHANGELOG.md | 1 + nequip/scripts/deploy.py | 30 ++++++++++++++++++++++++------ 2 files changed, 25 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 67bd0c81..1a7bb63c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Most recent change on the bottom. 
- `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights - `nequip-benchmark --no-compile` and `--verbose` and `--memory-summary` - `nequip-benchmark --pdb` for debugging model (builder) errors +- More information in `nequip-deploy info` ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/scripts/deploy.py b/nequip/scripts/deploy.py index 4d7686e1..394c0005 100644 --- a/nequip/scripts/deploy.py +++ b/nequip/scripts/deploy.py @@ -9,6 +9,7 @@ import pathlib import logging import yaml +import itertools # This is a weird hack to avoid Intel MKL issues on the cluster when this is called as a subprocess of a process that has itself initialized PyTorch. # Since numpy gets imported later anyway for dataset stuff, this shouldn't affect performance. @@ -146,6 +147,11 @@ def main(args=None): help="Path to a deployed model file.", type=pathlib.Path, ) + info_parser.add_argument( + "--print-config", + help="Print the full config of the model.", + action="store_true", + ) build_parser = subparsers.add_parser("build", help="Build a deployment model") build_parser.add_argument( @@ -169,13 +175,25 @@ def main(args=None): logging.basicConfig(level=getattr(logging, args.verbose.upper())) if args.command == "info": - model, metadata = load_deployed_model(args.model_path, set_global_options=False) - del model + model, metadata = load_deployed_model( + args.model_path, set_global_options=False, freeze=False + ) config = metadata.pop(CONFIG_KEY) - metadata_str = "\n".join(" %s: %s" % e for e in metadata.items()) - logging.info(f"Loaded TorchScript model with metadata:\n{metadata_str}\n") - logging.info("Model was built with config:") - print(config) + if args.print_config: + print(config) + else: + metadata_str = "\n".join(" %s: %s" % e for e in metadata.items()) + logging.info(f"Loaded TorchScript model with metadata:\n{metadata_str}\n") + logging.info( + f"Model has {sum(p.numel() for p in model.parameters())} 
weights" + ) + logging.info( + f"Model has {sum(p.numel() for p in model.parameters() if p.requires_grad)} trainable weights" + ) + logging.info( + f"Model weights and buffers take {sum(p.numel() * p.element_size() for p in itertools.chain(model.parameters(), model.buffers())) / (1024 * 1024):.2f} MB" + ) + logging.debug(f"Model had config:\n{config}") elif args.command == "build": if args.model and args.train_dir: From 4691fd6cb1c0f183c4360c0e8aac30810dd4cc17 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Thu, 15 Dec 2022 10:54:08 -0700 Subject: [PATCH 41/50] fix benchmark pdb --- nequip/scripts/benchmark.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index ab95548a..38690e33 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -6,6 +6,7 @@ import logging import sys import pdb +import traceback import torch from torch.utils.benchmark import Timer, Measurement @@ -167,6 +168,7 @@ def main(args=None): ) except: # noqa: E722 if args.pdb: + traceback.print_exc() pdb.post_mortem() else: raise @@ -244,7 +246,8 @@ def trace_handler(p): try: for _ in range(args.n): model(next(datas).copy()) - except: # noqa: E722) + except: # noqa: E722 + traceback.print_exc() pdb.post_mortem() print("Done.") elif args.equivariance_test: From 7faad2386ee74cd14408d6d58ccfe3a24cfd2411 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Thu, 15 Dec 2022 22:18:10 -0500 Subject: [PATCH 42/50] print complete information during equivariance failures --- CHANGELOG.md | 4 ++ nequip/utils/test.py | 114 +++++++++++++++++-------------------------- setup.py | 2 +- 3 files changed, 49 insertions(+), 71 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1a7bb63c..20146a16 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,10 @@ Most recent change on the bottom. 
- `nequip-benchmark --pdb` for debugging model (builder) errors - More information in `nequip-deploy info` +### Changed +- Minimum e3nn is now 0.4.4 +- `--equivariance-test` now prints much more information, especially when there is a failure + ### Fixed - Git utilities when installed as ZIPed `.egg` (#264) diff --git a/nequip/utils/test.py b/nequip/utils/test.py index edf2c1e8..60e68730 100644 --- a/nequip/utils/test.py +++ b/nequip/utils/test.py @@ -28,7 +28,7 @@ def assert_permutation_equivariant( data_in: AtomicDataDict.Type, tolerance: Optional[float] = None, raise_error: bool = True, -): +) -> str: r"""Test the permutation equivariance of ``func``. Standard fields are assumed to be equivariant to node or edge permutations according to their standard interpretions; all other fields are assumed to be invariant to all permutations. Non-standard fields can be registered as node/edge permutation equivariant using ``register_fields``. @@ -93,38 +93,42 @@ def assert_permutation_equivariant( out_perm.keys() ), "Permutation changed the set of fields returned by model" - problems = [] + messages = [] + num_problems: int = 0 for k in out_orig.keys(): if k in node_permute_fields: - if not torch.allclose(out_orig[k][node_perm], out_perm[k], atol=atol): - err = (out_orig[k][node_perm] - out_perm[k]).abs().max() - problems.append( - f"node permutation equivariance violated for field {k}; maximum componentwise error: {err:e}" - ) + err = (out_orig[k][node_perm] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k][node_perm], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" node permutation equivariance of field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) elif k in edge_permute_fields: - if not torch.allclose(out_orig[k][edge_perm], out_perm[k], atol=atol): - err = (out_orig[k][edge_perm] - out_perm[k]).abs().max() - problems.append( - f"edge permutation equivariance violated for field {k}; maximum componentwise 
error: {err:e}" - ) + err = (out_orig[k][edge_perm] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k][edge_perm], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" edge permutation equivariance of field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) elif k == AtomicDataDict.EDGE_INDEX_KEY: pass else: # Assume invariant if out_orig[k].dtype == torch.bool: - if not torch.all(out_orig[k] == out_perm[k]): - problems.append( - f"edge/node permutation invariance violated for field {k} ({k} was assumed to be invariant, should it have been marked as equivariant?)" - ) + err = (out_orig[k] != out_perm[k]).max() else: - if not torch.allclose(out_orig[k], out_perm[k], atol=atol): - err = (out_orig[k] - out_perm[k]).abs().max() - problems.append( - f"edge/node permutation invariance violated for field {k}; maximum componentwise error: {err:e}. (`{k}` was assumed to be invariant, should it have been marked as equivariant?)" - ) - msg = "\n".join(problems) - if len(problems) == 0: - return + err = (out_orig[k] - out_perm[k]).abs().max() + fail = not torch.allclose(out_orig[k], out_perm[k], atol=atol) + if fail: + num_problems += 1 + messages.append( + f" edge & node permutation invariance for field {k:20} -> max error={err:.3e}{' FAIL' if fail else ''}" + ) + msg = "\n".join(messages) + if num_problems == 0: + return msg else: if raise_error: raise AssertionError(msg) @@ -169,7 +173,7 @@ def assert_AtomicData_equivariant( # == Test permutation of graph nodes == # since permutation is discrete and should not be data dependent, run only on one frame. 
- permutation_problems = assert_permutation_equivariant( + permutation_message = assert_permutation_equivariant( func, data_in[0], tolerance=permutation_tolerance, raise_error=False ) @@ -255,53 +259,23 @@ def wrapper(*args): if o3_tolerance is None: o3_tolerance = FLOAT_TOLERANCE[torch.get_default_dtype()] - anerr = next(iter(errs.values())) - if isinstance(anerr, float) or anerr.ndim == 0: - # old e3nn doesn't report which key - problems = {k: v for k, v in errs.items() if v > o3_tolerance} - - def _describe(errors): - return ( - permutation_problems + "\n" if permutation_problems is not None else "" - ) + "\n".join( - "(parity_k={:d}, did_translate={}) -> max error={:.3e}".format( - int(k[0]), - bool(k[1]), - float(v), - ) - for k, v in errors.items() - ) - - if len(problems) > 0 or permutation_problems is not None: - raise AssertionError( - "Equivariance test failed for cases:" + _describe(problems) - ) - - return _describe(errs) - else: - # it's newer and tells us which is which - all_errs = [] - for case, err in errs.items(): - for key, this_err in zip(irreps_out.keys(), err): - all_errs.append(case + (key, this_err)) - problems = [e for e in all_errs if e[-1] > o3_tolerance] - - def _describe(errors): - return ( - permutation_problems + "\n" if permutation_problems is not None else "" - ) + "\n".join( - " (parity_k={:1d}, did_translate={:5}, field={:20}) -> max error={:.3e}".format( - int(k[0]), str(bool(k[1])), str(k[2]), float(k[3]) - ) - for k in errors - ) + all_errs = [] + for case, err in errs.items(): + for key, this_err in zip(irreps_out.keys(), err): + all_errs.append(case + (key, this_err)) + is_problem = [e[-1] > o3_tolerance for e in all_errs] + + message = (permutation_message + "\n") + "\n".join( + " (parity_k={:1d}, did_translate={:5}, field={:20}) -> max error={:.3e}".format( + int(k[0]), str(bool(k[1])), str(k[2]), float(k[3]) + ) + for k, prob in zip(all_errs, is_problem) + ) - if len(problems) > 0 or permutation_problems is not None: - 
raise AssertionError( - "Equivariance test failed for cases:\n" + _describe(problems) - ) + if sum(is_problem) > 0 or "FAIL" in permutation_message: + raise AssertionError(f"Equivariance test failed for cases:\n{message}") - return _describe(all_errs) + return message _DEBUG_HOOKS = None diff --git a/setup.py b/setup.py index cba6b51f..7583dea3 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ "ase", "tqdm", "torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 - "e3nn>=0.3.5,<0.6.0", + "e3nn>=0.4,4,<0.6.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext 'contextvars;python_version<"3.7"', # backport of contextvars for savenload From 5a33ecc06d4d83efd2073b6da2dc5a8cdda07c28 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Thu, 15 Dec 2022 22:18:25 -0500 Subject: [PATCH 43/50] warmup to test equivariance of JITed model --- nequip/scripts/benchmark.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/nequip/scripts/benchmark.py b/nequip/scripts/benchmark.py index 38690e33..1deb0de2 100644 --- a/nequip/scripts/benchmark.py +++ b/nequip/scripts/benchmark.py @@ -251,6 +251,12 @@ def trace_handler(p): pdb.post_mortem() print("Done.") elif args.equivariance_test: + print("Warmup...") + warmup_time = time.time() + for _ in range(warmup): + model(next(datas).copy()) + warmup_time = time.time() - warmup_time + print(f" {warmup} calls of warmup took {warmup_time:.4f}s") print("Running equivariance test...") errstr = assert_AtomicData_equivariant(model, datas_list) print( From c42f3d2285dbe7d9ff315ae16940452bf0fbe26b Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Fri, 16 Dec 2022 02:04:50 -0500 Subject: [PATCH 44/50] typo --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 7583dea3..24d37e24 100644 --- a/setup.py +++ b/setup.py @@ -30,7 +30,7 @@ "ase", "tqdm", 
"torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 - "e3nn>=0.4,4,<0.6.0", + "e3nn>=0.4.4,<0.6.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext 'contextvars;python_version<"3.7"', # backport of contextvars for savenload From 0e0f4ee278ac5f363ef1db61fb6553fbe2fdd8f0 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Sun, 18 Dec 2022 23:12:35 -0700 Subject: [PATCH 45/50] bump --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 20146a16..cf50972d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 Most recent change on the bottom. -## [Unreleased] - 0.5.6 +## [0.5.6] - 2022-12-19 ### Added - sklearn dependency removed - `nequip-benchmark` and `nequip-train` report number of weights and number of trainable weights From 0f22b7b7d171cbf15aa801d083c3899568ea42c6 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Dec 2022 14:00:35 -0700 Subject: [PATCH 46/50] PyTorch 1.10 compatibility --- nequip/utils/regressor.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 30c8f9ab..9db6c86f 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -66,7 +66,13 @@ def down_sampling_by_composition( for i in range(n_types): ids = sort_by[id_start[i] : id_end[i]] for j, p in enumerate(percentage): - new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") + # We comment out `interpolation="linear"` for PyTorch 1.10, + # where linear is the default but the `interpolation` option + # does not exist: + # https://pytorch.org/docs/1.10/generated/torch.quantile.html?highlight=quantile#torch.quantile + # 
https://pytorch.org/docs/1.13/generated/torch.quantile.html?highlight=quantile#torch.quantile + # previously, call had `, interpolation="linear")` + new_y[i * n_points + j] = torch.quantile(y[ids], p) new_X[i * n_points + j] = unique_comps[i] return new_X, new_y From 90fd9ed4bc604b3055109834d9d65df9b6b5b87d Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Dec 2022 22:25:47 -0700 Subject: [PATCH 47/50] version bumps --- .github/workflows/tests.yml | 2 +- .github/workflows/tests_develop.yml | 2 +- nequip/utils/regressor.py | 8 +------- setup.py | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a6e9a8f0..1f835e90 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: python-version: [3.9] - torch-version: [1.10.1, 1.11.0] + torch-version: [1.11.0, 1.12.1] steps: - uses: actions/checkout@v2 diff --git a/.github/workflows/tests_develop.yml b/.github/workflows/tests_develop.yml index e301f5e3..2c23350c 100644 --- a/.github/workflows/tests_develop.yml +++ b/.github/workflows/tests_develop.yml @@ -16,7 +16,7 @@ jobs: strategy: matrix: python-version: [3.9] - torch-version: [1.11.0] + torch-version: [1.12.1] steps: - uses: actions/checkout@v2 diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 9db6c86f..1e4e9015 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -66,13 +66,7 @@ def down_sampling_by_composition( for i in range(n_types): ids = sort_by[id_start[i] : id_end[i]] for j, p in enumerate(percentage): - # We comment out `interpolation="linear"` for PyTorch 1.10, - # where linear is the default but the `interpolation` option - # does not exist: - # https://pytorch.org/docs/1.10/generated/torch.quantile.html?highlight=quantile#torch.quantile - # 
https://pytorch.org/docs/1.13/generated/torch.quantile.html?highlight=quantile#torch.quantile - # previously, call had `, interpolation="linear")` - new_y[i * n_points + j] = torch.quantile(y[ids], p) + new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear")) new_X[i * n_points + j] = unique_comps[i] return new_X, new_y diff --git a/setup.py b/setup.py index 24d37e24..d7a5b465 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ "numpy", "ase", "tqdm", - "torch>=1.8,<=1.12,!=1.9.0", # torch.fx added in 1.8 + "torch>=1.10.0,<1.13,!=1.9.0", "e3nn>=0.4.4,<0.6.0", "pyyaml", "contextlib2;python_version<'3.7'", # backport of nullcontext From 82dbb9e8d7517d55267875f5279ef608e5ef8a9e Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Dec 2022 22:29:53 -0700 Subject: [PATCH 48/50] typo --- nequip/utils/regressor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 1e4e9015..30c8f9ab 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -66,7 +66,7 @@ def down_sampling_by_composition( for i in range(n_types): ids = sort_by[id_start[i] : id_end[i]] for j, p in enumerate(percentage): - new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear")) + new_y[i * n_points + j] = torch.quantile(y[ids], p, interpolation="linear") new_X[i * n_points + j] = unique_comps[i] return new_X, new_y From 4bb345d002153794cf2b9644e6052a3894cd5859 Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Dec 2022 23:27:07 -0700 Subject: [PATCH 49/50] don't use `manual_seed` globally --- nequip/utils/unittests/conftest.py | 8 ++++---- tests/unit/data/test_dataset.py | 14 ++++++++------ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/nequip/utils/unittests/conftest.py b/nequip/utils/unittests/conftest.py index 77a91930..4cfa98ff 100644 --- 
a/nequip/utils/unittests/conftest.py +++ b/nequip/utils/unittests/conftest.py @@ -136,17 +136,17 @@ def atomic_batch(nequip_dataset): @pytest.fixture(scope="function") def per_species_set(): dtype = torch.get_default_dtype() - torch.manual_seed(0) + rng = torch.Generator().manual_seed(127) mean_min = 1 mean_max = 100 std = 20 n_sample = 1000 n_species = 9 - ref_mean = torch.rand((n_species)) * (mean_max - mean_min) + mean_min + ref_mean = torch.rand((n_species), generator=rng) * (mean_max - mean_min) + mean_min t_mean = torch.ones((n_sample, 1)) * ref_mean.reshape([1, -1]) - ref_std = torch.rand((n_species)) * std + ref_std = torch.rand((n_species), generator=rng) * std t_std = torch.ones((n_sample, 1)) * ref_std.reshape([1, -1]) - E = torch.normal(t_mean, t_std) + E = torch.normal(t_mean, t_std, generator=rng) return ref_mean.to(dtype), ref_std.to(dtype), E.to(dtype), n_sample, n_species diff --git a/tests/unit/data/test_dataset.py b/tests/unit/data/test_dataset.py index bad796c3..95cfe48d 100644 --- a/tests/unit/data/test_dataset.py +++ b/tests/unit/data/test_dataset.py @@ -231,8 +231,8 @@ def test_per_graph_field(self, npz_dataset, fixed_field, subset, key, dim): if npz_dataset is None: return - torch.manual_seed(0) - E = torch.rand((npz_dataset.len(),) + dim) + rng = torch.Generator().manual_seed(454) + E = torch.rand((npz_dataset.len(),) + dim, generator=rng) ref_mean = torch.mean(E / NATOMS, dim=0) ref_std = torch.std(E / NATOMS, dim=0) @@ -437,12 +437,14 @@ def test_from_atoms(self, molecules): def generate_E(N, mean_min, mean_max, std): - torch.manual_seed(0) - ref_mean = torch.rand((N.shape[1])) * (mean_max - mean_min) + mean_min + rng = torch.Generator().manual_seed(568) + ref_mean = ( + torch.rand((N.shape[1]), generator=rng) * (mean_max - mean_min) + mean_min + ) t_mean = torch.ones((N.shape[0], 1)) * ref_mean.reshape([1, -1]) - ref_std = torch.rand((N.shape[1])) * std + ref_std = torch.rand((N.shape[1]), generator=rng) * std t_std = 
torch.ones((N.shape[0], 1)) * ref_std.reshape([1, -1]) - E = torch.normal(t_mean, t_std) + E = torch.normal(t_mean, t_std, generator=rng) return ref_mean, ref_std, (N * E).sum(axis=-1) From 5a365e061019e8dae32bdb31dc830e378fe758ba Mon Sep 17 00:00:00 2001 From: Linux-cpp-lisp <1473644+Linux-cpp-lisp@users.noreply.github.com> Date: Mon, 19 Dec 2022 23:27:59 -0700 Subject: [PATCH 50/50] always solve in float64 --- nequip/utils/regressor.py | 14 +++++++++----- tests/unit/utils/test_solver.py | 8 ++++---- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/nequip/utils/regressor.py b/nequip/utils/regressor.py index 30c8f9ab..76d140bc 100644 --- a/nequip/utils/regressor.py +++ b/nequip/utils/regressor.py @@ -8,8 +8,10 @@ def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kwargs): - - dtype = torch.get_default_dtype() + # results are in the same "units" as y, so same dtype too: + dtype_out = y.dtype + # always solve in float64 for numerical stability + dtype = torch.float64 X = X[::stride].to(dtype) y = y[::stride].to(dtype) @@ -40,7 +42,7 @@ def solver(X, y, alpha: Optional[float] = 0.001, stride: Optional[int] = 1, **kw logging.debug(f"Ridge Regression, residue {sigma2}") - return mean, cov + return mean.to(dtype_out), cov.to(dtype_out) def down_sampling_by_composition( @@ -61,8 +63,10 @@ def down_sampling_by_composition( id_end = torch.cat((node_icomp + 1, torch.as_tensor([len(sort_by)]))) n_points = len(percentage) - new_X = torch.zeros((n_types * n_points, X.shape[1])) - new_y = torch.zeros((n_types * n_points)) + new_X = torch.zeros( + (n_types * n_points, X.shape[1]), dtype=X.dtype, device=X.device + ) + new_y = torch.zeros((n_types * n_points), dtype=y.dtype, device=y.device) for i in range(n_types): ids = sort_by[id_start[i] : id_end[i]] for j, p in enumerate(percentage): diff --git a/tests/unit/utils/test_solver.py b/tests/unit/utils/test_solver.py index 049c897d..de78cbd8 100644 --- a/tests/unit/utils/test_solver.py +++ 
b/tests/unit/utils/test_solver.py @@ -11,13 +11,13 @@ def test_random(full_rank, alpha, per_species_set): if alpha == 0 and not full_rank: return - torch.manual_seed(0) + rng = torch.Generator().manual_seed(343) ref_mean, ref_std, E, n_samples, n_dim = per_species_set - dtype = torch.get_default_dtype() - - X = torch.randint(low=1, high=10, size=(n_samples, n_dim)).to(dtype) + X = torch.randint(low=1, high=10, size=(n_samples, n_dim), generator=rng).to( + torch.get_default_dtype() + ) if not full_rank: X[:, n_dim - 2] = X[:, n_dim - 1] * 2 y = (X * E).sum(axis=-1)