Merge pull request #1243 from pyiron/remap_main
Add maintenance function to update TYPE in HDF5 files
pmrv authored Dec 7, 2023
2 parents e6cca77 + f53cb5a commit 56a2fba
Showing 4 changed files with 208 additions and 34 deletions.
106 changes: 105 additions & 1 deletion pyiron_base/project/maintenance.py
@@ -2,14 +2,49 @@
import importlib  # used by update_pyiron_tables below
import os
import pkgutil
import warnings
import sys

import pandas

from pyiron_base import state
from pyiron_base.database.performance import get_database_statistics
import pyiron_base.storage.hdfio
from pyiron_base.project.update.pyiron_base_03x_to_04x import pyiron_base_03x_to_04x


# we sometimes move classes between modules; this would break HDF5 storage,
# since objects saved there record the module path from which their classes
# can be imported.  We can work around this by defining an explicit map here
# that _to_object can use to find the new modules and update the HDF5 files.
_MODULE_CONVERSION_DICT = {
    "pyiron_base.generic.datacontainer": "pyiron_base.storage.datacontainer",
    "pyiron_base.generic.inputlist": "pyiron_base.storage.inputlist",
    "pyiron_base.generic.flattenedstorage": "pyiron_base.storage.flattenedstorage",
    "pyiron_base.table.datamining": "pyiron_base.jobs.datamining",
}


def add_module_conversion(old: str, new: str):
    """
    Add a new module conversion.

    After setting up a conversion, call
    :meth:`.Project.maintenance.local.update_hdf_types` to rewrite the HDF5
    files, apply this change, and allow loading of previously saved objects.

    Args:
        old (str): path of the module that previously defined objects in storage
        new (str): path of the module that should be imported instead

    Raises:
        ValueError: if an entry for `old` already exists and does not point to `new`.
    """
    if old not in _MODULE_CONVERSION_DICT:
        _MODULE_CONVERSION_DICT[old] = new
    elif _MODULE_CONVERSION_DICT[old] != new:
        raise ValueError(
            f"Module path '{old}' already found in conversion dict, pointing to '{_MODULE_CONVERSION_DICT[old]}'!"
        )

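For illustration, a minimal usage sketch; the module paths below are hypothetical, and the new path must actually be importable (the tests further down use `os.path` for exactly this reason):

import pyiron_base.project.maintenance as maintenance

# hypothetical: objects were saved while the class still lived in 'mypkg.old_tools'
maintenance.add_module_conversion("mypkg.old_tools", "mypkg.tools")
# afterwards, rewrite the stored TYPE fields once:
# pr.maintenance.local.update_hdf_types()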

class Maintenance:
    """
    The purpose of maintenance class is to provide
@@ -87,7 +122,10 @@ def defragment_storage(
        **kwargs: dict,
    ):
        """
        Iterate over the jobs within the current project and it is sub projects and rewrite the hdf file
        Rewrite the HDF5 files of jobs; this can free up unused space.

        By default, iterates recursively over the jobs within the current
        project; this can be controlled with `recursive` and `kwargs`.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
@@ -102,6 +140,72 @@ def defragment_storage(
            hdf = job.project_hdf5
            hdf.rewrite_hdf5(job.name)

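For reference, a hedged example call of the method above, assuming `pr` is an existing Project instance:

# compact the HDF5 files of all finished jobs in pr and its subprojects
pr.maintenance.local.defragment_storage(recursive=True, status="finished")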
    def update_hdf_types(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite TYPE fields in HDF5 files for renamed modules.

        New module conversions can be added with
        :func:`.add_module_conversion`.  This method then considers all
        objects previously imported from `old` to be imported from `new`.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching the project database column name
                (eg. status="finished"). Asterisk can be used to denote a wildcard, for zero or more
                instances of any character
        """

        # walk the whole file and rewrite the TYPE fields of renamed modules
        def recurse(hdf):
            contents = hdf.list_all()
            for group in contents["groups"]:
                recurse(hdf[group])
            if "TYPE" in contents["nodes"]:
                (
                    module_path,
                    class_name,
                ) = pyiron_base.storage.hdfio._extract_module_class_name(hdf["TYPE"])
                if module_path in _MODULE_CONVERSION_DICT:
                    new_module_path = _MODULE_CONVERSION_DICT[module_path]
                    hdf["TYPE"] = f"<class '{new_module_path}.{class_name}'>"

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        ):
            hdf = job.project_hdf5
            recurse(hdf)

        def fix_project_data(pr):
            # project-level data (Project.data) lives in its own HDF5 file, not in a job
            hdf = pr.create_hdf(pr.path, "project_data")["../data"]
            recurse(hdf)

        fix_project_data(self._project)
        for sub in self._project.iter_groups():
            fix_project_data(sub)

        self.update_pyiron_tables(recursive=recursive, progress=progress, **kwargs)

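Putting it together, a sketch of a typical upgrade session, assuming `pr` is a Project whose storage still references renamed modules:

# rewrite TYPE fields of all jobs and project data, recursively
pr.maintenance.local.update_hdf_types(recursive=True, progress=True)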
    def update_pyiron_tables(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        kwargs["hamilton"] = "PyironTable"
        # alias the old module paths in sys.modules so that objects stored in
        # existing pyiron tables can still be loaded, then rewrite each table
        # to persist the new paths
        for old, new in _MODULE_CONVERSION_DICT.items():
            sys.modules[old] = importlib.import_module(new)

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, **kwargs
        ):
            job.to_hdf()
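The `sys.modules` aliasing above is what lets the old dotted paths resolve again: the import machinery checks `sys.modules` before searching the filesystem, so an alias works even though no module file exists under the old name. A self-contained sketch with hypothetical paths:

import importlib
import sys

# pretend 'legacy_pkg.tools' was renamed; alias the stale name to a real module
sys.modules["legacy_pkg.tools"] = importlib.import_module("os.path")

mod = importlib.import_module("legacy_pkg.tools")
assert mod is importlib.import_module("os.path")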


class UpdateMaintenance:
    def __init__(self, project):
73 changes: 42 additions & 31 deletions pyiron_base/storage/hdfio.py
@@ -14,14 +14,15 @@
import posixpath
import numpy as np
import sys
from typing import Union, Optional, Any
from typing import Union, Optional, Any, Tuple

from pyiron_base.utils.deprecate import deprecate
from pyiron_base.storage.helper_functions import read_hdf5, write_hdf5
from pyiron_base.interfaces.has_groups import HasGroups
from pyiron_base.state import state
from pyiron_base.jobs.dynamic import JOB_DYN_DICT, class_constructor
from pyiron_base.jobs.job.util import _get_safe_job_name
import pyiron_base.project.maintenance
import warnings

__author__ = "Joerg Neugebauer, Jan Janssen"
@@ -62,42 +63,57 @@ def extract_dims(v):
    return len(set(dim1)) > 1 and len(set(dim_other)) == 1


def _import_class(class_name):
# for historic reasons we write str(class) into the HDF 'TYPE' field of objects, so we need to parse this back out
def _extract_fully_qualified_name(type_field: str) -> str:
    return type_field.split("'")[1]


def _extract_module_class_name(type_field: str) -> Tuple[str, str]:
    fully_qualified_path = _extract_fully_qualified_name(type_field)
    return fully_qualified_path.rsplit(".", maxsplit=1)

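For example, with the historic TYPE format the two helpers behave as follows:

type_field = "<class 'pyiron_base.storage.datacontainer.DataContainer'>"
_extract_fully_qualified_name(type_field)
# -> 'pyiron_base.storage.datacontainer.DataContainer'
module_path, class_name = _extract_module_class_name(type_field)
# module_path -> 'pyiron_base.storage.datacontainer', class_name -> 'DataContainer'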

def _import_class(module_path, class_name):
    """
    Import a class from the given module path and class name and return the class object.

    Args:
        module_path (str): dotted path of the module that defines the class
        class_name (str): name of the class to import

    Returns:
        type: class object of the given name
    """
    internal_class_name = class_name.split(".")[-1][:-2]
    class_path = class_name.split()[-1].split(".")[:-1]
    class_path[0] = class_path[0][1:]
    class_module_path = ".".join(class_path)
    # ugly dynamic import, but only needed to log the warning anyway
    from pyiron_base.jobs.job.jobtype import JobTypeChoice

    job_class_dict = JobTypeChoice().job_class_dict  # access global singleton
    if internal_class_name in job_class_dict:
        module_path = job_class_dict[internal_class_name]
    if class_name in job_class_dict:
        known_module_path = job_class_dict[class_name]
        # entries in the job_class_dict are either strings of modules or fully
        # loaded class objects; in the latter case our work here is done, we
        # just return the class
        if isinstance(known_module_path, type):
            return known_module_path
        if class_module_path != module_path:
        if module_path != known_module_path:
            state.logger.info(
                f'Using registered module "{module_path}" instead of custom/old module "{class_module_path}" to'
                f' import job type "{internal_class_name}"!'
                f'Using registered module "{known_module_path}" instead of custom/old module "{module_path}" to'
                f' import job type "{class_name}"!'
            )
    else:
        module_path = class_module_path
    return getattr(
        importlib.import_module(module_path),
        internal_class_name,
    )
            module_path = known_module_path
    try:
        return getattr(
            importlib.import_module(module_path),
            class_name,
        )
    except ImportError:
        if module_path in pyiron_base.project.maintenance._MODULE_CONVERSION_DICT:
            raise RuntimeError(
                f"Could not import {class_name} from {module_path}, but the module path is known to have changed. "
                "Call project.maintenance.local.update_hdf_types() to upgrade storage!"
            )
        else:
            raise

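A quick sketch of the new two-argument lookup; `DataContainer` is not a registered job type, so this resolves through the plain `importlib` path:

cls = _import_class("pyiron_base.storage.datacontainer", "DataContainer")
assert cls.__name__ == "DataContainer"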

def _to_object(hdf, class_name=None, **kwargs):
@@ -120,23 +136,18 @@ def _to_object(hdf, class_name=None, **kwargs):
        raise ValueError(
            "Object type in hdf5-file must be identical to input parameter"
        )
    class_name = class_name or hdf.get("TYPE")
    class_path = class_name.split("<class '")[-1].split("'>")[0]
    class_convert_dict = {  # Fix backwards compatibility
        "pyiron_base.generic.datacontainer.DataContainer": "pyiron_base.storage.datacontainer.DataContainer",
        "pyiron_base.generic.inputlist.InputList": "pyiron_base.storage.inputlist.InputList",
        "pyiron_base.generic.flattenedstorage.FlattenedStorage": "pyiron_base.storage.flattenedstorage.FlattenedStorage",
    }
    if class_path in class_convert_dict.keys():
        class_name_new = "<class '" + class_convert_dict[class_path] + "'>"
        class_object = _import_class(class_name_new)
    elif not class_path.startswith("abc."):
        class_object = _import_class(class_name)
    type_field = class_name or hdf.get("TYPE")
    module_path, class_name = _extract_module_class_name(type_field)

    # objects whose classes start with "abc." were likely created by
    # pyiron_base.jobs.dynamic, so we cannot import them the usual way.
    # Instead, reconstruct the class here from JOB_DYN_DICT.
    if not module_path.startswith("abc."):
        class_object = _import_class(module_path, class_name)
    else:
        class_object = class_constructor(cp=JOB_DYN_DICT[class_name])

    # Backwards compatibility since the format of TYPE changed
    if type_field != str(class_object):
        hdf["TYPE"] = str(class_object)

    if hasattr(class_object, "from_hdf_args"):
4 changes: 2 additions & 2 deletions tests/storage/test_hdfio.py
@@ -633,7 +633,7 @@ def test_rewrite_hdf5(self):
    def test_import_class(self):

        with self.subTest("import ToyJob without interfering:"):
            toy_job_cls = _import_class(str(BaseToyJob))
            toy_job_cls = _import_class(BaseToyJob.__module__, BaseToyJob.__name__)
            self.assertIs(
                toy_job_cls, BaseToyJob, msg="Did not return the requested class."
            )
@@ -643,7 +643,7 @@ def test_import_class(self):

with self.subTest("Import ToyJob while another ToyJob is registered"):
with self.assertLogs(state.logger) as log:
toy_job_cls = _import_class(str(BaseToyJob))
toy_job_cls = _import_class(BaseToyJob.__module__, BaseToyJob.__name__)
self.assertEqual(
len(log.output),
1,
59 changes: 59 additions & 0 deletions tests/storage/test_module_path_maintenance.py
@@ -0,0 +1,59 @@
import pyiron_base.project.maintenance

from pyiron_base import DataContainer
from pyiron_base._tests import TestWithProject, ToyJob


class TestModulePath(TestWithProject):

    def test_add_module_conversion(self):
        """Module paths should only be able to be added once!"""

        # need to add real modules here, because they need to be importable.
        pyiron_base.project.maintenance.add_module_conversion("foo.bar", "os.path")
        with self.assertRaises(
            ValueError, msg="Adding paths twice should raise an error!"
        ):
            pyiron_base.project.maintenance.add_module_conversion("foo.bar", "os")

    def test_maintenance(self):
        """Objects should be loaded correctly after maintenance is run."""

        # dummy data
        dc = DataContainer({"a": 42, "b": [1, 2, 3]})
        self.project.data["test_data"] = dc
        self.project.data.write()

        hdf = self.project.create_hdf(self.project.path, "project_data")["../data"]
        # manipulate TYPE to fake an old module
        old_path_project_data = "project.data.module"
        hdf["test_data__index_0/TYPE"] = f"<class '{old_path_project_data}.DataContainer'>"

        job = self.project.create_job(ToyJob, "test_job")
        job.run()

        # manipulate TYPE to fake an old module
        old_path_job = "job.module"
        job.project_hdf5["TYPE"] = f"<class '{old_path_job}.ToyJob'>"

        pyiron_base.project.maintenance.add_module_conversion(
            old_path_project_data, DataContainer.__module__
        )
        pyiron_base.project.maintenance.add_module_conversion(
            old_path_job, ToyJob.__module__
        )

        with self.assertRaises(
            RuntimeError,
            msg="Project data should raise a special exception for objects that can be fixed.",
        ):
            self.project.data.read()
            self.project.data["test_data"]  # need to access it, otherwise lazy loading hides the error

        with self.assertRaises(
            RuntimeError,
            msg="Job loading should raise a special exception for objects that can be fixed.",
        ):
            self.project["test_job"]

        self.project.maintenance.local.update_hdf_types()

        try:
            self.project.data.read()
            self.project.data["test_data"]  # need to access it, otherwise lazy loading hides the error
            self.project["test_job"]
        except Exception:
            self.fail("Objects still not loadable after maintenance!")

        self.assertFalse(
            old_path_project_data in hdf["test_data__index_0/TYPE"],
            "Module path not updated in project data!",
        )

        self.assertFalse(
            old_path_job in job.project_hdf5["TYPE"],
            "Module path not updated in job hdf5!",
        )
