Merge pull request #1243 from pyiron/remap_main
Add maintenance function to update TYPE in HDF5 files
pmrv authored Dec 7, 2023
2 parents e6cca77 + f53cb5a commit 56a2fba
Showing 4 changed files with 208 additions and 34 deletions.
106 changes: 105 additions & 1 deletion pyiron_base/project/maintenance.py
@@ -2,14 +2,49 @@
import importlib  # used by update_pyiron_tables below
import os
import pkgutil
import warnings
import sys

import pandas

from pyiron_base import state
from pyiron_base.database.performance import get_database_statistics
import pyiron_base.storage.hdfio
from pyiron_base.project.update.pyiron_base_03x_to_04x import pyiron_base_03x_to_04x


# we sometimes move classes between modules; this would break HDF5 storage,
# since objects saved there record the module path from which their classes
# can be imported.  We can work around this by defining an explicit map here
# that _to_object can use to find the new modules and update the HDF5 files.
_MODULE_CONVERSION_DICT = {
    "pyiron_base.generic.datacontainer": "pyiron_base.storage.datacontainer",
    "pyiron_base.generic.inputlist": "pyiron_base.storage.inputlist",
    "pyiron_base.generic.flattenedstorage": "pyiron_base.storage.flattenedstorage",
    "pyiron_base.table.datamining": "pyiron_base.jobs.datamining",
}


def add_module_conversion(old: str, new: str):
    """
    Add a new module conversion.

    After setting up a conversion, call
    :meth:`.Project.maintenance.local.update_hdf_types` to rewrite the HDF5
    files, apply this change, and allow loading of previously saved objects.

    Args:
        old (str): path of the module that previously defined objects in storage
        new (str): path of the module that should be imported instead

    Raises:
        ValueError: if an entry for `old` already exists and does not point to `new`.
    """
    if old not in _MODULE_CONVERSION_DICT:
        _MODULE_CONVERSION_DICT[old] = new
    elif _MODULE_CONVERSION_DICT[old] != new:
        raise ValueError(
            f"Module path '{old}' already found in conversion dict, pointing to '{_MODULE_CONVERSION_DICT[old]}'!"
        )

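For illustration, a minimal usage sketch; the module paths below are hypothetical, and the new path must actually be importable (the tests further down use `os.path` for exactly this reason):

import pyiron_base.project.maintenance as maintenance

# hypothetical: objects were saved while the class still lived in 'mypkg.old_tools'
maintenance.add_module_conversion("mypkg.old_tools", "mypkg.tools")
# afterwards, rewrite the stored TYPE fields once:
# pr.maintenance.local.update_hdf_types()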

class Maintenance:
    """
    The purpose of maintenance class is to provide
@@ -87,7 +122,10 @@ def defragment_storage(
        **kwargs: dict,
    ):
        """
        Iterate over the jobs within the current project and it is sub projects and rewrite the hdf file
        Rewrite the HDF5 files of jobs; this can free up unused space.

        By default, iterates recursively over the jobs within the current
        project; this can be controlled with `recursive` and `kwargs`.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
@@ -102,6 +140,72 @@ def defragment_storage(
            hdf = job.project_hdf5
            hdf.rewrite_hdf5(job.name)

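For reference, a hedged example call of the method above, assuming `pr` is an existing Project instance:

# compact the HDF5 files of all finished jobs in pr and its subprojects
pr.maintenance.local.defragment_storage(recursive=True, status="finished")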
    def update_hdf_types(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        """
        Rewrite TYPE fields in HDF5 files for renamed modules.

        New module conversions can be added with
        :func:`.add_module_conversion`.  This method then considers all
        objects previously imported from `old` to be imported from `new`.

        Args:
            recursive (bool): search subprojects [True/False] - True by default
            progress (bool): if True (default), add an interactive progress bar to the iteration
            **kwargs (dict): Optional arguments for filtering with keys matching the project database column name
                (eg. status="finished"). Asterisk can be used to denote a wildcard, for zero or more
                instances of any character
        """

        # walk the whole file and rewrite the TYPE fields of renamed modules
        def recurse(hdf):
            contents = hdf.list_all()
            for group in contents["groups"]:
                recurse(hdf[group])
            if "TYPE" in contents["nodes"]:
                (
                    module_path,
                    class_name,
                ) = pyiron_base.storage.hdfio._extract_module_class_name(hdf["TYPE"])
                if module_path in _MODULE_CONVERSION_DICT:
                    new_module_path = _MODULE_CONVERSION_DICT[module_path]
                    hdf["TYPE"] = f"<class '{new_module_path}.{class_name}'>"

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, convert_to_object=False, **kwargs
        ):
            hdf = job.project_hdf5
            recurse(hdf)

        def fix_project_data(pr):
            # project-level data (Project.data) lives in its own HDF5 file, not in a job
            hdf = pr.create_hdf(pr.path, "project_data")["../data"]
            recurse(hdf)

        fix_project_data(self._project)
        for sub in self._project.iter_groups():
            fix_project_data(sub)

        self.update_pyiron_tables(recursive=recursive, progress=progress, **kwargs)

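Putting it together, a sketch of a typical upgrade session, assuming `pr` is a Project whose storage still references renamed modules:

# rewrite TYPE fields of all jobs and project data, recursively
pr.maintenance.local.update_hdf_types(recursive=True, progress=True)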
    def update_pyiron_tables(
        self,
        recursive: bool = True,
        progress: bool = True,
        **kwargs: dict,
    ):
        kwargs["hamilton"] = "PyironTable"
        # alias the old module paths in sys.modules so that objects stored in
        # existing pyiron tables can still be loaded, then rewrite each table
        # to persist the new paths
        for old, new in _MODULE_CONVERSION_DICT.items():
            sys.modules[old] = importlib.import_module(new)

        for job in self._project.iter_jobs(
            recursive=recursive, progress=progress, **kwargs
        ):
            job.to_hdf()
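The `sys.modules` aliasing above is what lets the old dotted paths resolve again: the import machinery checks `sys.modules` before searching the filesystem, so an alias works even though no module file exists under the old name. A self-contained sketch with hypothetical paths:

import importlib
import sys

# pretend 'legacy_pkg.tools' was renamed; alias the stale name to a real module
sys.modules["legacy_pkg.tools"] = importlib.import_module("os.path")

mod = importlib.import_module("legacy_pkg.tools")
assert mod is importlib.import_module("os.path")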


class UpdateMaintenance:
    def __init__(self, project):
73 changes: 42 additions & 31 deletions pyiron_base/storage/hdfio.py
@@ -14,14 +14,15 @@
import posixpath
import numpy as np
import sys
from typing import Union, Optional, Any
from typing import Union, Optional, Any, Tuple

from pyiron_base.utils.deprecate import deprecate
from pyiron_base.storage.helper_functions import read_hdf5, write_hdf5
from pyiron_base.interfaces.has_groups import HasGroups
from pyiron_base.state import state
from pyiron_base.jobs.dynamic import JOB_DYN_DICT, class_constructor
from pyiron_base.jobs.job.util import _get_safe_job_name
import pyiron_base.project.maintenance
import warnings

__author__ = "Joerg Neugebauer, Jan Janssen"
@@ -62,42 +63,57 @@ def extract_dims(v):
    return len(set(dim1)) > 1 and len(set(dim_other)) == 1


def _import_class(class_name):
# for historic reasons we write str(class) into the HDF 'TYPE' field of objects, so we need to parse this back out
def _extract_fully_qualified_name(type_field: str) -> str:
    return type_field.split("'")[1]


def _extract_module_class_name(type_field: str) -> Tuple[str, str]:
    fully_qualified_path = _extract_fully_qualified_name(type_field)
    return fully_qualified_path.rsplit(".", maxsplit=1)

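For example, with the historic TYPE format the two helpers behave as follows:

type_field = "<class 'pyiron_base.storage.datacontainer.DataContainer'>"
_extract_fully_qualified_name(type_field)
# -> 'pyiron_base.storage.datacontainer.DataContainer'
module_path, class_name = _extract_module_class_name(type_field)
# module_path -> 'pyiron_base.storage.datacontainer', class_name -> 'DataContainer'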

def _import_class(module_path, class_name):
    """
    Import a class from the given module path and class name and return the class object.

    Args:
        module_path (str): dotted path of the module that defines the class
        class_name (str): name of the class to import

    Returns:
        type: class object of the given name
    """
    internal_class_name = class_name.split(".")[-1][:-2]
    class_path = class_name.split()[-1].split(".")[:-1]
    class_path[0] = class_path[0][1:]
    class_module_path = ".".join(class_path)
    # ugly dynamic import, but only needed to log the warning anyway
    from pyiron_base.jobs.job.jobtype import JobTypeChoice

    job_class_dict = JobTypeChoice().job_class_dict  # access global singleton
    if internal_class_name in job_class_dict:
        module_path = job_class_dict[internal_class_name]
    if class_name in job_class_dict:
        known_module_path = job_class_dict[class_name]
        # entries in the job_class_dict are either strings of modules or fully
        # loaded class objects; in the latter case our work here is done, we
        # just return the class
        if isinstance(known_module_path, type):
            return known_module_path
        if class_module_path != module_path:
        if module_path != known_module_path:
            state.logger.info(
                f'Using registered module "{module_path}" instead of custom/old module "{class_module_path}" to'
                f' import job type "{internal_class_name}"!'
                f'Using registered module "{known_module_path}" instead of custom/old module "{module_path}" to'
                f' import job type "{class_name}"!'
            )
    else:
        module_path = class_module_path
    return getattr(
        importlib.import_module(module_path),
        internal_class_name,
    )
            module_path = known_module_path
    try:
        return getattr(
            importlib.import_module(module_path),
            class_name,
        )
    except ImportError:
        if module_path in pyiron_base.project.maintenance._MODULE_CONVERSION_DICT:
            raise RuntimeError(
                f"Could not import {class_name} from {module_path}, but the module path is known to have changed. "
                "Call project.maintenance.local.update_hdf_types() to upgrade storage!"
            )
        else:
            raise

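A quick sketch of the new two-argument lookup; `DataContainer` is not a registered job type, so this resolves through the plain `importlib` path:

cls = _import_class("pyiron_base.storage.datacontainer", "DataContainer")
assert cls.__name__ == "DataContainer"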

def _to_object(hdf, class_name=None, **kwargs):
@@ -120,23 +136,18 @@ def _to_object(hdf, class_name=None, **kwargs):
        raise ValueError(
            "Object type in hdf5-file must be identical to input parameter"
        )
    class_name = class_name or hdf.get("TYPE")
    class_path = class_name.split("<class '")[-1].split("'>")[0]
    class_convert_dict = {  # Fix backwards compatibility
        "pyiron_base.generic.datacontainer.DataContainer": "pyiron_base.storage.datacontainer.DataContainer",
        "pyiron_base.generic.inputlist.InputList": "pyiron_base.storage.inputlist.InputList",
        "pyiron_base.generic.flattenedstorage.FlattenedStorage": "pyiron_base.storage.flattenedstorage.FlattenedStorage",
    }
    if class_path in class_convert_dict.keys():
        class_name_new = "<class '" + class_convert_dict[class_path] + "'>"
        class_object = _import_class(class_name_new)
    elif not class_path.startswith("abc."):
        class_object = _import_class(class_name)
    type_field = class_name or hdf.get("TYPE")
    module_path, class_name = _extract_module_class_name(type_field)

    # objects whose classes start with "abc." were likely created by
    # pyiron_base.jobs.dynamic, so we cannot import them the usual way.
    # Instead, reconstruct the class here from JOB_DYN_DICT.
    if not module_path.startswith("abc."):
        class_object = _import_class(module_path, class_name)
    else:
        class_object = class_constructor(cp=JOB_DYN_DICT[class_name])

    # Backwards compatibility since the format of TYPE changed
    if type_field != str(class_object):
        hdf["TYPE"] = str(class_object)

    if hasattr(class_object, "from_hdf_args"):
4 changes: 2 additions & 2 deletions tests/storage/test_hdfio.py
@@ -633,7 +633,7 @@ def test_rewrite_hdf5(self):
    def test_import_class(self):

        with self.subTest("import ToyJob without interfering:"):
            toy_job_cls = _import_class(str(BaseToyJob))
            toy_job_cls = _import_class(BaseToyJob.__module__, BaseToyJob.__name__)
            self.assertIs(
                toy_job_cls, BaseToyJob, msg="Did not return the requested class."
            )
@@ -643,7 +643,7 @@ def test_import_class(self):

with self.subTest("Import ToyJob while another ToyJob is registered"):
with self.assertLogs(state.logger) as log:
toy_job_cls = _import_class(str(BaseToyJob))
toy_job_cls = _import_class(BaseToyJob.__module__, BaseToyJob.__name__)
self.assertEqual(
len(log.output),
1,
59 changes: 59 additions & 0 deletions tests/storage/test_module_path_maintenance.py
@@ -0,0 +1,59 @@
import pyiron_base.project.maintenance

from pyiron_base import DataContainer
from pyiron_base._tests import TestWithProject, ToyJob


class TestModulePath(TestWithProject):

    def test_add_module_conversion(self):
        """Module paths should only be able to be added once!"""

        # need to add real modules here, because they need to be importable.
        pyiron_base.project.maintenance.add_module_conversion("foo.bar", "os.path")
        with self.assertRaises(
            ValueError, msg="Adding paths twice should raise an error!"
        ):
            pyiron_base.project.maintenance.add_module_conversion("foo.bar", "os")

    def test_maintenance(self):
        """Objects should be loaded correctly after maintenance is run."""

        # dummy data
        dc = DataContainer({"a": 42, "b": [1, 2, 3]})
        self.project.data["test_data"] = dc
        self.project.data.write()

        hdf = self.project.create_hdf(self.project.path, "project_data")["../data"]
        # manipulate TYPE to fake an old module
        old_path_project_data = "project.data.module"
        hdf["test_data__index_0/TYPE"] = f"<class '{old_path_project_data}.DataContainer'>"

        job = self.project.create_job(ToyJob, "test_job")
        job.run()

        # manipulate TYPE to fake an old module
        old_path_job = "job.module"
        job.project_hdf5["TYPE"] = f"<class '{old_path_job}.ToyJob'>"

        pyiron_base.project.maintenance.add_module_conversion(
            old_path_project_data, DataContainer.__module__
        )
        pyiron_base.project.maintenance.add_module_conversion(
            old_path_job, ToyJob.__module__
        )

        with self.assertRaises(
            RuntimeError,
            msg="Project data should raise a special exception for objects that can be fixed.",
        ):
            self.project.data.read()
            self.project.data["test_data"]  # need to access it, otherwise lazy loading hides the error

        with self.assertRaises(
            RuntimeError,
            msg="Job loading should raise a special exception for objects that can be fixed.",
        ):
            self.project["test_job"]

        self.project.maintenance.local.update_hdf_types()

        try:
            self.project.data.read()
            self.project.data["test_data"]  # need to access it, otherwise lazy loading hides the error
            self.project["test_job"]
        except Exception:
            self.fail("Objects still not loadable after maintenance!")

        self.assertFalse(
            old_path_project_data in hdf["test_data__index_0/TYPE"],
            "Module path not updated in project data!",
        )

        self.assertFalse(
            old_path_job in job.project_hdf5["TYPE"],
            "Module path not updated in job hdf5!",
        )
